diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5124 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 15.0, + "eval_steps": 1, + "global_step": 48810, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03073140749846343, + "grad_norm": 5.49942684173584, + "learning_rate": 0.0001, + "loss": 11.0245, + "step": 100, + "train_loss_gtc": 4.04046875, + "train_loss_gtm": 0.6575, + "train_loss_lm": 6.331875 + }, + { + "epoch": 0.06146281499692686, + "grad_norm": 6.315459251403809, + "learning_rate": 9.999896007507038e-05, + "loss": 6.9106, + "step": 200, + "train_loss_gtc": 2.85453125, + "train_loss_gtm": 0.6456640625, + "train_loss_lm": 3.41375 + }, + { + "epoch": 0.09219422249539029, + "grad_norm": 4.463039398193359, + "learning_rate": 9.999584034353926e-05, + "loss": 5.6825, + "step": 300, + "train_loss_gtc": 2.053828125, + "train_loss_gtm": 0.625390625, + "train_loss_lm": 3.02734375 + }, + { + "epoch": 0.12292562999385372, + "grad_norm": 8.906160354614258, + "learning_rate": 9.999064093517811e-05, + "loss": 4.8225, + "step": 400, + "train_loss_gtc": 1.5528125, + "train_loss_gtm": 0.550078125, + "train_loss_lm": 2.7178125 + }, + { + "epoch": 0.15365703749231716, + "grad_norm": 5.962785720825195, + "learning_rate": 9.99833620662667e-05, + "loss": 4.3188, + "step": 500, + "train_loss_gtc": 1.3146875, + "train_loss_gtm": 0.4680859375, + "train_loss_lm": 2.55734375 + }, + { + "epoch": 0.18438844499078058, + "grad_norm": 15.813605308532715, + "learning_rate": 9.997400403958414e-05, + "loss": 3.9968, + "step": 600, + "train_loss_gtc": 1.151953125, + "train_loss_gtm": 0.404775390625, + "train_loss_lm": 2.43796875 + }, + { + "epoch": 0.215119852489244, + "grad_norm": 9.086814880371094, + "learning_rate": 9.99625672443962e-05, + "loss": 3.8064, + "step": 700, + "train_loss_gtc": 1.0492578125, + "train_loss_gtm": 0.37134765625, + "train_loss_lm": 2.38078125 + }, + { + "epoch": 0.24585125998770743, + "grad_norm": 10.451958656311035, + "learning_rate": 9.994905215643926e-05, + "loss": 3.6012, + "step": 800, + "train_loss_gtc": 0.9308203125, + "train_loss_gtm": 0.342978515625, + "train_loss_lm": 2.32328125 + }, + { + "epoch": 0.2765826674861709, + "grad_norm": 2.9532716274261475, + "learning_rate": 9.993345933790036e-05, + "loss": 3.4027, + "step": 900, + "train_loss_gtc": 0.80484375, + "train_loss_gtm": 0.313017578125, + "train_loss_lm": 2.28640625 + }, + { + "epoch": 0.3073140749846343, + "grad_norm": 5.513271331787109, + "learning_rate": 9.991578943739396e-05, + "loss": 3.2882, + "step": 1000, + "train_loss_gtc": 0.748984375, + "train_loss_gtm": 0.29857421875, + "train_loss_lm": 2.23609375 + }, + { + "epoch": 0.33804548248309774, + "grad_norm": 9.901054382324219, + "learning_rate": 9.989604318993484e-05, + "loss": 3.1962, + "step": 1100, + "train_loss_gtc": 0.694609375, + "train_loss_gtm": 0.293974609375, + "train_loss_lm": 2.22203125 + }, + { + "epoch": 0.36877688998156116, + "grad_norm": 4.640697956085205, + "learning_rate": 9.987422141690761e-05, + "loss": 3.0563, + "step": 1200, + "train_loss_gtc": 0.62521484375, + "train_loss_gtm": 0.266708984375, + "train_loss_lm": 2.163359375 + }, + { + "epoch": 0.3995082974800246, + "grad_norm": 6.046248435974121, + "learning_rate": 9.98503250260325e-05, + "loss": 2.9871, + "step": 1300, + "train_loss_gtc": 0.60119140625, + "train_loss_gtm": 0.250830078125, + "train_loss_lm": 2.1440625 + }, + { + "epoch": 0.430239704978488, + "grad_norm": 6.06462287902832, + "learning_rate": 9.982435501132761e-05, + "loss": 2.918, + "step": 1400, + "train_loss_gtc": 0.54515625, + "train_loss_gtm": 0.228916015625, + "train_loss_lm": 2.139453125 + }, + { + "epoch": 0.46097111247695144, + "grad_norm": 4.272490978240967, + "learning_rate": 9.979631245306756e-05, + "loss": 2.8624, + "step": 1500, + "train_loss_gtc": 0.5327734375, + "train_loss_gtm": 0.236171875, + "train_loss_lm": 2.10140625 + }, + { + "epoch": 0.49170251997541486, + "grad_norm": 2.8239645957946777, + "learning_rate": 9.976619851773859e-05, + "loss": 2.7952, + "step": 1600, + "train_loss_gtc": 0.49181640625, + "train_loss_gtm": 0.216806640625, + "train_loss_lm": 2.087890625 + }, + { + "epoch": 0.5224339274738783, + "grad_norm": 4.718524932861328, + "learning_rate": 9.973401445798997e-05, + "loss": 2.744, + "step": 1700, + "train_loss_gtc": 0.47638671875, + "train_loss_gtm": 0.2067578125, + "train_loss_lm": 2.06015625 + }, + { + "epoch": 0.5531653349723418, + "grad_norm": 7.191675662994385, + "learning_rate": 9.969976161258194e-05, + "loss": 2.6875, + "step": 1800, + "train_loss_gtc": 0.4446484375, + "train_loss_gtm": 0.1912353515625, + "train_loss_lm": 2.05390625 + }, + { + "epoch": 0.5838967424708051, + "grad_norm": 4.1135640144348145, + "learning_rate": 9.966344140633001e-05, + "loss": 2.6366, + "step": 1900, + "train_loss_gtc": 0.42078125, + "train_loss_gtm": 0.187529296875, + "train_loss_lm": 2.03140625 + }, + { + "epoch": 0.6146281499692686, + "grad_norm": 4.423370361328125, + "learning_rate": 9.962505535004571e-05, + "loss": 2.6245, + "step": 2000, + "train_loss_gtc": 0.40998046875, + "train_loss_gtm": 0.1915087890625, + "train_loss_lm": 2.026328125 + }, + { + "epoch": 0.645359557467732, + "grad_norm": 3.297764778137207, + "learning_rate": 9.958460504047372e-05, + "loss": 2.5585, + "step": 2100, + "train_loss_gtc": 0.38455078125, + "train_loss_gtm": 0.176318359375, + "train_loss_lm": 1.99703125 + }, + { + "epoch": 0.6760909649661955, + "grad_norm": 8.061768531799316, + "learning_rate": 9.954209216022543e-05, + "loss": 2.5188, + "step": 2200, + "train_loss_gtc": 0.36263671875, + "train_loss_gtm": 0.1635595703125, + "train_loss_lm": 1.987578125 + }, + { + "epoch": 0.7068223724646588, + "grad_norm": 4.367369174957275, + "learning_rate": 9.949751847770904e-05, + "loss": 2.5078, + "step": 2300, + "train_loss_gtc": 0.3640234375, + "train_loss_gtm": 0.174541015625, + "train_loss_lm": 1.973359375 + }, + { + "epoch": 0.7375537799631223, + "grad_norm": 1.8874632120132446, + "learning_rate": 9.945088584705584e-05, + "loss": 2.4485, + "step": 2400, + "train_loss_gtc": 0.33607421875, + "train_loss_gtm": 0.14620361328125, + "train_loss_lm": 1.95671875 + }, + { + "epoch": 0.7682851874615857, + "grad_norm": 2.437286376953125, + "learning_rate": 9.940219620804327e-05, + "loss": 2.4232, + "step": 2500, + "train_loss_gtc": 0.3200390625, + "train_loss_gtm": 0.149814453125, + "train_loss_lm": 1.95375 + }, + { + "epoch": 0.7990165949600492, + "grad_norm": 2.414090871810913, + "learning_rate": 9.935145158601411e-05, + "loss": 2.4102, + "step": 2600, + "train_loss_gtc": 0.317177734375, + "train_loss_gtm": 0.153583984375, + "train_loss_lm": 1.937109375 + }, + { + "epoch": 0.8297480024585125, + "grad_norm": 3.1979939937591553, + "learning_rate": 9.929865409179224e-05, + "loss": 2.3885, + "step": 2700, + "train_loss_gtc": 0.30353515625, + "train_loss_gtm": 0.135172119140625, + "train_loss_lm": 1.940078125 + }, + { + "epoch": 0.860479409956976, + "grad_norm": 5.63021993637085, + "learning_rate": 9.92438059215949e-05, + "loss": 2.3431, + "step": 2800, + "train_loss_gtc": 0.286796875, + "train_loss_gtm": 0.1349951171875, + "train_loss_lm": 1.92109375 + }, + { + "epoch": 0.8912108174554395, + "grad_norm": 2.3982977867126465, + "learning_rate": 9.918690935694126e-05, + "loss": 2.3297, + "step": 2900, + "train_loss_gtc": 0.28318359375, + "train_loss_gtm": 0.134306640625, + "train_loss_lm": 1.91375 + }, + { + "epoch": 0.9219422249539029, + "grad_norm": 3.208214282989502, + "learning_rate": 9.912796676455757e-05, + "loss": 2.3016, + "step": 3000, + "train_loss_gtc": 0.272578125, + "train_loss_gtm": 0.124122314453125, + "train_loss_lm": 1.913046875 + }, + { + "epoch": 0.9526736324523664, + "grad_norm": 2.7042226791381836, + "learning_rate": 9.906698059627866e-05, + "loss": 2.2748, + "step": 3100, + "train_loss_gtc": 0.25408203125, + "train_loss_gtm": 0.121632080078125, + "train_loss_lm": 1.896484375 + }, + { + "epoch": 0.9834050399508297, + "grad_norm": 3.007841110229492, + "learning_rate": 9.900395338894601e-05, + "loss": 2.2726, + "step": 3200, + "train_loss_gtc": 0.254970703125, + "train_loss_gtm": 0.1286865234375, + "train_loss_lm": 1.881171875 + }, + { + "epoch": 1.0, + "eval_loss": 2.549999952316284, + "eval_runtime": 4.223, + "eval_samples_per_second": 235.615, + "eval_steps_per_second": 2.605, + "step": 3254, + "train_loss_gtc": 0.26175491898148145, + "train_loss_gtm": 0.12819191261574073, + "train_loss_lm": 1.8826678240740742, + "val_loss_gtc": 0.39130859375, + "val_loss_gtm": 0.2362060546875, + "val_loss_lm": 1.94765625 + }, + { + "epoch": 1.014136447449293, + "grad_norm": 3.212726354598999, + "learning_rate": 9.89388877643022e-05, + "loss": 2.2582, + "step": 3300, + "train_loss_gtc": 0.24630604619565216, + "train_loss_gtm": 0.11464259935461957, + "train_loss_lm": 1.8685461956521738 + }, + { + "epoch": 1.0448678549477566, + "grad_norm": 2.5640087127685547, + "learning_rate": 9.887178642888182e-05, + "loss": 2.2174, + "step": 3400, + "train_loss_gtc": 0.23337890625, + "train_loss_gtm": 0.11256591796875, + "train_loss_lm": 1.86703125 + }, + { + "epoch": 1.07559926244622, + "grad_norm": 1.9021731615066528, + "learning_rate": 9.880265217389893e-05, + "loss": 2.2195, + "step": 3500, + "train_loss_gtc": 0.234921875, + "train_loss_gtm": 0.125850830078125, + "train_loss_lm": 1.86671875 + }, + { + "epoch": 1.1063306699446835, + "grad_norm": 2.093270778656006, + "learning_rate": 9.873148787513093e-05, + "loss": 2.2154, + "step": 3600, + "train_loss_gtc": 0.241884765625, + "train_loss_gtm": 0.12220458984375, + "train_loss_lm": 1.8584375 + }, + { + "epoch": 1.1370620774431468, + "grad_norm": 4.833855152130127, + "learning_rate": 9.865829649279898e-05, + "loss": 2.1983, + "step": 3700, + "train_loss_gtc": 0.2290625, + "train_loss_gtm": 0.108033447265625, + "train_loss_lm": 1.843671875 + }, + { + "epoch": 1.1677934849416103, + "grad_norm": 4.2011518478393555, + "learning_rate": 9.858308107144479e-05, + "loss": 2.1765, + "step": 3800, + "train_loss_gtc": 0.223466796875, + "train_loss_gtm": 0.109500732421875, + "train_loss_lm": 1.837578125 + }, + { + "epoch": 1.1985248924400738, + "grad_norm": 6.385040760040283, + "learning_rate": 9.850584473980405e-05, + "loss": 2.1558, + "step": 3900, + "train_loss_gtc": 0.21083984375, + "train_loss_gtm": 0.102752685546875, + "train_loss_lm": 1.843203125 + }, + { + "epoch": 1.2292562999385372, + "grad_norm": 2.9829607009887695, + "learning_rate": 9.84265907106762e-05, + "loss": 2.1519, + "step": 4000, + "train_loss_gtc": 0.208603515625, + "train_loss_gtm": 0.11494384765625, + "train_loss_lm": 1.833828125 + }, + { + "epoch": 1.2599877074370007, + "grad_norm": 1.9834630489349365, + "learning_rate": 9.834532228079088e-05, + "loss": 2.1325, + "step": 4100, + "train_loss_gtc": 0.201494140625, + "train_loss_gtm": 0.102637939453125, + "train_loss_lm": 1.829453125 + }, + { + "epoch": 1.290719114935464, + "grad_norm": 2.685356378555298, + "learning_rate": 9.826204283067073e-05, + "loss": 2.1218, + "step": 4200, + "train_loss_gtc": 0.19677734375, + "train_loss_gtm": 0.100631103515625, + "train_loss_lm": 1.82484375 + }, + { + "epoch": 1.3214505224339275, + "grad_norm": 1.5945948362350464, + "learning_rate": 9.817675582449082e-05, + "loss": 2.1261, + "step": 4300, + "train_loss_gtc": 0.19837890625, + "train_loss_gtm": 0.11297119140625, + "train_loss_lm": 1.828828125 + }, + { + "epoch": 1.352181929932391, + "grad_norm": 2.5570931434631348, + "learning_rate": 9.80894648099345e-05, + "loss": 2.1039, + "step": 4400, + "train_loss_gtc": 0.18681640625, + "train_loss_gtm": 0.09926513671875, + "train_loss_lm": 1.81125 + }, + { + "epoch": 1.3829133374308542, + "grad_norm": 1.3465452194213867, + "learning_rate": 9.800017341804584e-05, + "loss": 2.0879, + "step": 4500, + "train_loss_gtc": 0.1895263671875, + "train_loss_gtm": 0.101627197265625, + "train_loss_lm": 1.799453125 + }, + { + "epoch": 1.4136447449293177, + "grad_norm": 1.4922404289245605, + "learning_rate": 9.790888536307865e-05, + "loss": 2.0743, + "step": 4600, + "train_loss_gtc": 0.1802734375, + "train_loss_gtm": 0.0869610595703125, + "train_loss_lm": 1.797890625 + }, + { + "epoch": 1.4443761524277812, + "grad_norm": 3.5552456378936768, + "learning_rate": 9.781560444234187e-05, + "loss": 2.077, + "step": 4700, + "train_loss_gtc": 0.1821533203125, + "train_loss_gtm": 0.10408935546875, + "train_loss_lm": 1.79203125 + }, + { + "epoch": 1.4751075599262446, + "grad_norm": 4.11555290222168, + "learning_rate": 9.77203345360417e-05, + "loss": 2.0674, + "step": 4800, + "train_loss_gtc": 0.178369140625, + "train_loss_gtm": 0.09654052734375, + "train_loss_lm": 1.797109375 + }, + { + "epoch": 1.5058389674247081, + "grad_norm": 1.795981764793396, + "learning_rate": 9.762307960712018e-05, + "loss": 2.0636, + "step": 4900, + "train_loss_gtc": 0.179599609375, + "train_loss_gtm": 0.10112548828125, + "train_loss_lm": 1.788359375 + }, + { + "epoch": 1.5365703749231714, + "grad_norm": 1.9280930757522583, + "learning_rate": 9.75238437010903e-05, + "loss": 2.0431, + "step": 5000, + "train_loss_gtc": 0.170859375, + "train_loss_gtm": 0.0848968505859375, + "train_loss_lm": 1.785546875 + }, + { + "epoch": 1.5673017824216349, + "grad_norm": 1.1412756443023682, + "learning_rate": 9.742263094586775e-05, + "loss": 2.0316, + "step": 5100, + "train_loss_gtc": 0.171064453125, + "train_loss_gtm": 0.0872491455078125, + "train_loss_lm": 1.7815625 + }, + { + "epoch": 1.5980331899200984, + "grad_norm": 1.9907689094543457, + "learning_rate": 9.731944555159926e-05, + "loss": 2.0229, + "step": 5200, + "train_loss_gtc": 0.1637890625, + "train_loss_gtm": 0.080010986328125, + "train_loss_lm": 1.778359375 + }, + { + "epoch": 1.6287645974185616, + "grad_norm": 1.9761877059936523, + "learning_rate": 9.721429181048736e-05, + "loss": 2.0141, + "step": 5300, + "train_loss_gtc": 0.163154296875, + "train_loss_gtm": 0.08893798828125, + "train_loss_lm": 1.76640625 + }, + { + "epoch": 1.6594960049170253, + "grad_norm": 2.6358389854431152, + "learning_rate": 9.710717409661191e-05, + "loss": 2.0137, + "step": 5400, + "train_loss_gtc": 0.1599267578125, + "train_loss_gtm": 0.08393280029296875, + "train_loss_lm": 1.7646875 + }, + { + "epoch": 1.6902274124154886, + "grad_norm": 1.707661509513855, + "learning_rate": 9.699809686574819e-05, + "loss": 2.0079, + "step": 5500, + "train_loss_gtc": 0.157626953125, + "train_loss_gtm": 0.0876666259765625, + "train_loss_lm": 1.757421875 + }, + { + "epoch": 1.720958819913952, + "grad_norm": 2.0161757469177246, + "learning_rate": 9.688706465518145e-05, + "loss": 2.0002, + "step": 5600, + "train_loss_gtc": 0.1531884765625, + "train_loss_gtm": 0.0824560546875, + "train_loss_lm": 1.757421875 + }, + { + "epoch": 1.7516902274124155, + "grad_norm": 1.4757572412490845, + "learning_rate": 9.677408208351822e-05, + "loss": 1.9837, + "step": 5700, + "train_loss_gtc": 0.152705078125, + "train_loss_gtm": 0.080008544921875, + "train_loss_lm": 1.756640625 + }, + { + "epoch": 1.7824216349108788, + "grad_norm": 1.820297360420227, + "learning_rate": 9.665915385049424e-05, + "loss": 1.9852, + "step": 5800, + "train_loss_gtc": 0.1525732421875, + "train_loss_gtm": 0.0837579345703125, + "train_loss_lm": 1.7565625 + }, + { + "epoch": 1.8131530424093425, + "grad_norm": 2.550199031829834, + "learning_rate": 9.65422847367789e-05, + "loss": 1.9628, + "step": 5900, + "train_loss_gtc": 0.1431494140625, + "train_loss_gtm": 0.070325927734375, + "train_loss_lm": 1.748359375 + }, + { + "epoch": 1.8438844499078058, + "grad_norm": 3.284813642501831, + "learning_rate": 9.642347960377638e-05, + "loss": 1.9785, + "step": 6000, + "train_loss_gtc": 0.149755859375, + "train_loss_gtm": 0.085838623046875, + "train_loss_lm": 1.74515625 + }, + { + "epoch": 1.8746158574062692, + "grad_norm": 2.071235418319702, + "learning_rate": 9.630274339342344e-05, + "loss": 1.9699, + "step": 6100, + "train_loss_gtc": 0.1496533203125, + "train_loss_gtm": 0.08025115966796875, + "train_loss_lm": 1.75 + }, + { + "epoch": 1.9053472649047327, + "grad_norm": 1.5470776557922363, + "learning_rate": 9.618008112798393e-05, + "loss": 1.9727, + "step": 6200, + "train_loss_gtc": 0.1493896484375, + "train_loss_gtm": 0.080283203125, + "train_loss_lm": 1.739921875 + }, + { + "epoch": 1.936078672403196, + "grad_norm": 6.53104305267334, + "learning_rate": 9.605549790983973e-05, + "loss": 1.9612, + "step": 6300, + "train_loss_gtc": 0.1470556640625, + "train_loss_gtm": 0.0838421630859375, + "train_loss_lm": 1.736328125 + }, + { + "epoch": 1.9668100799016595, + "grad_norm": 1.8970906734466553, + "learning_rate": 9.592899892127863e-05, + "loss": 1.9457, + "step": 6400, + "train_loss_gtc": 0.1391943359375, + "train_loss_gtm": 0.0737689208984375, + "train_loss_lm": 1.730859375 + }, + { + "epoch": 1.997541487400123, + "grad_norm": 1.675215721130371, + "learning_rate": 9.580058942427867e-05, + "loss": 1.9364, + "step": 6500, + "train_loss_gtc": 0.1374560546875, + "train_loss_gtm": 0.0687762451171875, + "train_loss_lm": 1.72921875 + }, + { + "epoch": 2.0, + "eval_loss": 2.19921875, + "eval_runtime": 3.8844, + "eval_samples_per_second": 256.154, + "eval_steps_per_second": 2.832, + "step": 6508, + "train_loss_gtc": 0.137939453125, + "train_loss_gtm": 0.0904998779296875, + "train_loss_lm": 1.70703125, + "val_loss_gtc": 0.2357421875, + "val_loss_gtm": 0.210223388671875, + "val_loss_lm": 1.7640625 + }, + { + "epoch": 2.028272894898586, + "grad_norm": 1.4447669982910156, + "learning_rate": 9.567027476028937e-05, + "loss": 1.9201, + "step": 6600, + "train_loss_gtc": 0.13188901154891305, + "train_loss_gtm": 0.06937939187754756, + "train_loss_lm": 1.7201086956521738 + }, + { + "epoch": 2.05900430239705, + "grad_norm": 2.3873608112335205, + "learning_rate": 9.553806035000945e-05, + "loss": 1.9203, + "step": 6700, + "train_loss_gtc": 0.12853515625, + "train_loss_gtm": 0.0747589111328125, + "train_loss_lm": 1.7165625 + }, + { + "epoch": 2.089735709895513, + "grad_norm": 3.420974016189575, + "learning_rate": 9.540395169316132e-05, + "loss": 1.9248, + "step": 6800, + "train_loss_gtc": 0.1289501953125, + "train_loss_gtm": 0.0793853759765625, + "train_loss_lm": 1.7153125 + }, + { + "epoch": 2.120467117393977, + "grad_norm": 2.3101561069488525, + "learning_rate": 9.526795436826242e-05, + "loss": 1.9149, + "step": 6900, + "train_loss_gtc": 0.128876953125, + "train_loss_gtm": 0.071492919921875, + "train_loss_lm": 1.713671875 + }, + { + "epoch": 2.15119852489244, + "grad_norm": 1.0823053121566772, + "learning_rate": 9.513007403239311e-05, + "loss": 1.8968, + "step": 7000, + "train_loss_gtc": 0.1226025390625, + "train_loss_gtm": 0.0626873779296875, + "train_loss_lm": 1.707890625 + }, + { + "epoch": 2.1819299323909034, + "grad_norm": 2.2665770053863525, + "learning_rate": 9.49903164209613e-05, + "loss": 1.9036, + "step": 7100, + "train_loss_gtc": 0.128154296875, + "train_loss_gtm": 0.06544097900390625, + "train_loss_lm": 1.702578125 + }, + { + "epoch": 2.212661339889367, + "grad_norm": 0.9536680579185486, + "learning_rate": 9.484868734746399e-05, + "loss": 1.8943, + "step": 7200, + "train_loss_gtc": 0.119287109375, + "train_loss_gtm": 0.0679217529296875, + "train_loss_lm": 1.700703125 + }, + { + "epoch": 2.2433927473878303, + "grad_norm": 1.799402117729187, + "learning_rate": 9.470519270324532e-05, + "loss": 1.8917, + "step": 7300, + "train_loss_gtc": 0.121845703125, + "train_loss_gtm": 0.05860137939453125, + "train_loss_lm": 1.7115625 + }, + { + "epoch": 2.2741241548862936, + "grad_norm": 1.3167532682418823, + "learning_rate": 9.455983845725164e-05, + "loss": 1.8896, + "step": 7400, + "train_loss_gtc": 0.12458984375, + "train_loss_gtm": 0.06630035400390626, + "train_loss_lm": 1.698984375 + }, + { + "epoch": 2.3048555623847573, + "grad_norm": 3.1567189693450928, + "learning_rate": 9.441263065578308e-05, + "loss": 1.8849, + "step": 7500, + "train_loss_gtc": 0.120859375, + "train_loss_gtm": 0.063575439453125, + "train_loss_lm": 1.69515625 + }, + { + "epoch": 2.3355869698832206, + "grad_norm": 2.949071168899536, + "learning_rate": 9.426357542224215e-05, + "loss": 1.8767, + "step": 7600, + "train_loss_gtc": 0.1182275390625, + "train_loss_gtm": 0.067989501953125, + "train_loss_lm": 1.691875 + }, + { + "epoch": 2.3663183773816843, + "grad_norm": 2.110520362854004, + "learning_rate": 9.411267895687898e-05, + "loss": 1.8791, + "step": 7700, + "train_loss_gtc": 0.11953125, + "train_loss_gtm": 0.068671875, + "train_loss_lm": 1.693046875 + }, + { + "epoch": 2.3970497848801475, + "grad_norm": 1.1845890283584595, + "learning_rate": 9.395994753653343e-05, + "loss": 1.8692, + "step": 7800, + "train_loss_gtc": 0.1122509765625, + "train_loss_gtm": 0.06687744140625, + "train_loss_lm": 1.690078125 + }, + { + "epoch": 2.427781192378611, + "grad_norm": 1.572401762008667, + "learning_rate": 9.380538751437396e-05, + "loss": 1.869, + "step": 7900, + "train_loss_gtc": 0.1185498046875, + "train_loss_gtm": 0.05891082763671875, + "train_loss_lm": 1.69109375 + }, + { + "epoch": 2.4585125998770745, + "grad_norm": 1.395868182182312, + "learning_rate": 9.364900531963336e-05, + "loss": 1.8866, + "step": 8000, + "train_loss_gtc": 0.125126953125, + "train_loss_gtm": 0.0729669189453125, + "train_loss_lm": 1.689609375 + }, + { + "epoch": 2.4892440073755377, + "grad_norm": 1.1641755104064941, + "learning_rate": 9.349080745734135e-05, + "loss": 1.867, + "step": 8100, + "train_loss_gtc": 0.1189306640625, + "train_loss_gtm": 0.06694183349609376, + "train_loss_lm": 1.68921875 + }, + { + "epoch": 2.5199754148740015, + "grad_norm": 2.092716932296753, + "learning_rate": 9.333080050805396e-05, + "loss": 1.8538, + "step": 8200, + "train_loss_gtc": 0.114306640625, + "train_loss_gtm": 0.0646319580078125, + "train_loss_lm": 1.68078125 + }, + { + "epoch": 2.5507068223724647, + "grad_norm": 1.8535902500152588, + "learning_rate": 9.316899112757982e-05, + "loss": 1.8524, + "step": 8300, + "train_loss_gtc": 0.1098681640625, + "train_loss_gtm": 0.06549957275390625, + "train_loss_lm": 1.6834375 + }, + { + "epoch": 2.581438229870928, + "grad_norm": 1.1401584148406982, + "learning_rate": 9.300538604670325e-05, + "loss": 1.8498, + "step": 8400, + "train_loss_gtc": 0.109970703125, + "train_loss_gtm": 0.0634844970703125, + "train_loss_lm": 1.677734375 + }, + { + "epoch": 2.6121696373693917, + "grad_norm": 1.7290570735931396, + "learning_rate": 9.283999207090439e-05, + "loss": 1.8523, + "step": 8500, + "train_loss_gtc": 0.1066796875, + "train_loss_gtm": 0.06089630126953125, + "train_loss_lm": 1.683203125 + }, + { + "epoch": 2.642901044867855, + "grad_norm": 0.7238840460777283, + "learning_rate": 9.267281608007592e-05, + "loss": 1.8537, + "step": 8600, + "train_loss_gtc": 0.1138037109375, + "train_loss_gtm": 0.066612548828125, + "train_loss_lm": 1.6834375 + }, + { + "epoch": 2.673632452366318, + "grad_norm": 2.260568380355835, + "learning_rate": 9.250386502823712e-05, + "loss": 1.8303, + "step": 8700, + "train_loss_gtc": 0.102099609375, + "train_loss_gtm": 0.0620458984375, + "train_loss_lm": 1.6715625 + }, + { + "epoch": 2.704363859864782, + "grad_norm": 1.1292821168899536, + "learning_rate": 9.233314594324437e-05, + "loss": 1.8346, + "step": 8800, + "train_loss_gtc": 0.106123046875, + "train_loss_gtm": 0.06123687744140625, + "train_loss_lm": 1.66734375 + }, + { + "epoch": 2.735095267363245, + "grad_norm": 1.3726723194122314, + "learning_rate": 9.216066592649899e-05, + "loss": 1.835, + "step": 8900, + "train_loss_gtc": 0.1031640625, + "train_loss_gtm": 0.055390625, + "train_loss_lm": 1.670390625 + }, + { + "epoch": 2.7658266748617084, + "grad_norm": 0.7613235712051392, + "learning_rate": 9.198643215265175e-05, + "loss": 1.8289, + "step": 9000, + "train_loss_gtc": 0.1032861328125, + "train_loss_gtm": 0.05791168212890625, + "train_loss_lm": 1.670234375 + }, + { + "epoch": 2.796558082360172, + "grad_norm": 1.4104223251342773, + "learning_rate": 9.181045186930446e-05, + "loss": 1.8226, + "step": 9100, + "train_loss_gtc": 0.10169921875, + "train_loss_gtm": 0.05517242431640625, + "train_loss_lm": 1.665 + }, + { + "epoch": 2.8272894898586354, + "grad_norm": 2.643277406692505, + "learning_rate": 9.163273239670845e-05, + "loss": 1.8278, + "step": 9200, + "train_loss_gtc": 0.1060205078125, + "train_loss_gtm": 0.0587158203125, + "train_loss_lm": 1.66890625 + }, + { + "epoch": 2.858020897357099, + "grad_norm": 0.7709031105041504, + "learning_rate": 9.145328112746013e-05, + "loss": 1.8159, + "step": 9300, + "train_loss_gtc": 0.10208984375, + "train_loss_gtm": 0.05981475830078125, + "train_loss_lm": 1.654921875 + }, + { + "epoch": 2.8887523048555623, + "grad_norm": 1.2432808876037598, + "learning_rate": 9.127210552619346e-05, + "loss": 1.8186, + "step": 9400, + "train_loss_gtc": 0.104443359375, + "train_loss_gtm": 0.05574615478515625, + "train_loss_lm": 1.65515625 + }, + { + "epoch": 2.919483712354026, + "grad_norm": 0.7463958859443665, + "learning_rate": 9.108921312926937e-05, + "loss": 1.8259, + "step": 9500, + "train_loss_gtc": 0.1081005859375, + "train_loss_gtm": 0.06411102294921875, + "train_loss_lm": 1.660234375 + }, + { + "epoch": 2.9502151198524893, + "grad_norm": 1.5550811290740967, + "learning_rate": 9.090461154446243e-05, + "loss": 1.8085, + "step": 9600, + "train_loss_gtc": 0.0987890625, + "train_loss_gtm": 0.0575555419921875, + "train_loss_lm": 1.650625 + }, + { + "epoch": 2.9809465273509526, + "grad_norm": 1.3831332921981812, + "learning_rate": 9.071830845064421e-05, + "loss": 1.8021, + "step": 9700, + "train_loss_gtc": 0.093544921875, + "train_loss_gtm": 0.04638153076171875, + "train_loss_lm": 1.653125 + }, + { + "epoch": 3.0, + "eval_loss": 2.033203125, + "eval_runtime": 3.9269, + "eval_samples_per_second": 253.383, + "eval_steps_per_second": 2.801, + "step": 9762, + "train_loss_gtc": 0.09482500630040322, + "train_loss_gtm": 0.03868521413495464, + "train_loss_lm": 1.6529737903225807, + "val_loss_gtc": 0.195166015625, + "val_loss_gtm": 0.1783203125, + "val_loss_lm": 1.67734375 + }, + { + "epoch": 3.0116779348494163, + "grad_norm": 0.6389613747596741, + "learning_rate": 9.0530311597464e-05, + "loss": 1.7867, + "step": 9800, + "train_loss_gtc": 0.0934094880756579, + "train_loss_gtm": 0.05592105263157895, + "train_loss_lm": 1.6383634868421053 + }, + { + "epoch": 3.0424093423478795, + "grad_norm": 4.752490520477295, + "learning_rate": 9.034062880502636e-05, + "loss": 1.8109, + "step": 9900, + "train_loss_gtc": 0.106484375, + "train_loss_gtm": 0.065299072265625, + "train_loss_lm": 1.645078125 + }, + { + "epoch": 3.073140749846343, + "grad_norm": 0.8840903043746948, + "learning_rate": 9.014926796356588e-05, + "loss": 1.7886, + "step": 10000, + "train_loss_gtc": 0.0953857421875, + "train_loss_gtm": 0.05199127197265625, + "train_loss_lm": 1.638125 + }, + { + "epoch": 3.1038721573448065, + "grad_norm": 1.790175199508667, + "learning_rate": 8.995623703311894e-05, + "loss": 1.7777, + "step": 10100, + "train_loss_gtc": 0.09615478515625, + "train_loss_gtm": 0.04876113891601563, + "train_loss_lm": 1.631875 + }, + { + "epoch": 3.1346035648432697, + "grad_norm": 1.167677879333496, + "learning_rate": 8.976154404319261e-05, + "loss": 1.7916, + "step": 10200, + "train_loss_gtc": 0.0942919921875, + "train_loss_gtm": 0.0605712890625, + "train_loss_lm": 1.639140625 + }, + { + "epoch": 3.1653349723417334, + "grad_norm": 1.2573415040969849, + "learning_rate": 8.956519709243065e-05, + "loss": 1.7905, + "step": 10300, + "train_loss_gtc": 0.097001953125, + "train_loss_gtm": 0.06192230224609375, + "train_loss_lm": 1.638203125 + }, + { + "epoch": 3.1960663798401967, + "grad_norm": 0.9823325276374817, + "learning_rate": 8.93672043482766e-05, + "loss": 1.7674, + "step": 10400, + "train_loss_gtc": 0.09020751953125, + "train_loss_gtm": 0.0457440185546875, + "train_loss_lm": 1.631875 + }, + { + "epoch": 3.22679778733866, + "grad_norm": 0.8545394539833069, + "learning_rate": 8.91675740466341e-05, + "loss": 1.7727, + "step": 10500, + "train_loss_gtc": 0.09113525390625, + "train_loss_gtm": 0.04704864501953125, + "train_loss_lm": 1.63609375 + }, + { + "epoch": 3.2575291948371237, + "grad_norm": 0.8178197741508484, + "learning_rate": 8.896631449152425e-05, + "loss": 1.7856, + "step": 10600, + "train_loss_gtc": 0.09496826171875, + "train_loss_gtm": 0.058369140625, + "train_loss_lm": 1.636484375 + }, + { + "epoch": 3.288260602335587, + "grad_norm": 1.6585794687271118, + "learning_rate": 8.876343405474018e-05, + "loss": 1.7747, + "step": 10700, + "train_loss_gtc": 0.0904931640625, + "train_loss_gtm": 0.0434954833984375, + "train_loss_lm": 1.6378125 + }, + { + "epoch": 3.3189920098340506, + "grad_norm": 1.3470587730407715, + "learning_rate": 8.855894117549885e-05, + "loss": 1.7657, + "step": 10800, + "train_loss_gtc": 0.08614501953125, + "train_loss_gtm": 0.04624908447265625, + "train_loss_lm": 1.6303125 + }, + { + "epoch": 3.349723417332514, + "grad_norm": 0.6378850936889648, + "learning_rate": 8.835284436009e-05, + "loss": 1.7683, + "step": 10900, + "train_loss_gtc": 0.0873779296875, + "train_loss_gtm": 0.04869110107421875, + "train_loss_lm": 1.6296875 + }, + { + "epoch": 3.380454824830977, + "grad_norm": 1.9016733169555664, + "learning_rate": 8.814515218152226e-05, + "loss": 1.7686, + "step": 11000, + "train_loss_gtc": 0.08774169921875, + "train_loss_gtm": 0.04988037109375, + "train_loss_lm": 1.6275 + }, + { + "epoch": 3.411186232329441, + "grad_norm": 6.191075325012207, + "learning_rate": 8.793587327916661e-05, + "loss": 1.7661, + "step": 11100, + "train_loss_gtc": 0.08765625, + "train_loss_gtm": 0.04734375, + "train_loss_lm": 1.626875 + }, + { + "epoch": 3.441917639827904, + "grad_norm": 1.3823864459991455, + "learning_rate": 8.772501635839694e-05, + "loss": 1.7634, + "step": 11200, + "train_loss_gtc": 0.0852392578125, + "train_loss_gtm": 0.047673492431640624, + "train_loss_lm": 1.630546875 + }, + { + "epoch": 3.4726490473263674, + "grad_norm": 0.9048540592193604, + "learning_rate": 8.751259019022801e-05, + "loss": 1.763, + "step": 11300, + "train_loss_gtc": 0.0891015625, + "train_loss_gtm": 0.0477813720703125, + "train_loss_lm": 1.626171875 + }, + { + "epoch": 3.503380454824831, + "grad_norm": 1.5553096532821655, + "learning_rate": 8.729860361095056e-05, + "loss": 1.7607, + "step": 11400, + "train_loss_gtc": 0.086826171875, + "train_loss_gtm": 0.05027008056640625, + "train_loss_lm": 1.627890625 + }, + { + "epoch": 3.5341118623232943, + "grad_norm": 1.639862060546875, + "learning_rate": 8.708306552176368e-05, + "loss": 1.7502, + "step": 11500, + "train_loss_gtc": 0.0829248046875, + "train_loss_gtm": 0.046551055908203125, + "train_loss_lm": 1.6209375 + }, + { + "epoch": 3.5648432698217576, + "grad_norm": 0.8203203678131104, + "learning_rate": 8.68659848884047e-05, + "loss": 1.7439, + "step": 11600, + "train_loss_gtc": 0.0837255859375, + "train_loss_gtm": 0.04283203125, + "train_loss_lm": 1.6215625 + }, + { + "epoch": 3.5955746773202213, + "grad_norm": 0.7728437781333923, + "learning_rate": 8.664737074077606e-05, + "loss": 1.7501, + "step": 11700, + "train_loss_gtc": 0.08592041015625, + "train_loss_gtm": 0.0465325927734375, + "train_loss_lm": 1.62140625 + }, + { + "epoch": 3.6263060848186845, + "grad_norm": 2.764263391494751, + "learning_rate": 8.642723217256991e-05, + "loss": 1.748, + "step": 11800, + "train_loss_gtc": 0.08718017578125, + "train_loss_gtm": 0.05325942993164062, + "train_loss_lm": 1.614921875 + }, + { + "epoch": 3.6570374923171483, + "grad_norm": 1.381459355354309, + "learning_rate": 8.620557834088962e-05, + "loss": 1.7502, + "step": 11900, + "train_loss_gtc": 0.08507568359375, + "train_loss_gtm": 0.04700302124023437, + "train_loss_lm": 1.6196875 + }, + { + "epoch": 3.6877688998156115, + "grad_norm": 3.3425018787384033, + "learning_rate": 8.598241846586899e-05, + "loss": 1.7493, + "step": 12000, + "train_loss_gtc": 0.0854345703125, + "train_loss_gtm": 0.04688232421875, + "train_loss_lm": 1.61484375 + }, + { + "epoch": 3.718500307314075, + "grad_norm": 1.8849352598190308, + "learning_rate": 8.575776183028873e-05, + "loss": 1.7444, + "step": 12100, + "train_loss_gtc": 0.08189208984375, + "train_loss_gtm": 0.05115646362304688, + "train_loss_lm": 1.619765625 + }, + { + "epoch": 3.7492317148125385, + "grad_norm": 0.8946220278739929, + "learning_rate": 8.553161777919028e-05, + "loss": 1.7366, + "step": 12200, + "train_loss_gtc": 0.08007568359375, + "train_loss_gtm": 0.04531814575195312, + "train_loss_lm": 1.614140625 + }, + { + "epoch": 3.7799631223110017, + "grad_norm": 0.682151734828949, + "learning_rate": 8.530399571948708e-05, + "loss": 1.7215, + "step": 12300, + "train_loss_gtc": 0.0752392578125, + "train_loss_gtm": 0.034449920654296876, + "train_loss_lm": 1.606171875 + }, + { + "epoch": 3.8106945298094654, + "grad_norm": 0.6967170238494873, + "learning_rate": 8.507490511957333e-05, + "loss": 1.7367, + "step": 12400, + "train_loss_gtc": 0.0777197265625, + "train_loss_gtm": 0.045133056640625, + "train_loss_lm": 1.610703125 + }, + { + "epoch": 3.8414259373079287, + "grad_norm": 1.3819748163223267, + "learning_rate": 8.484435550893006e-05, + "loss": 1.7275, + "step": 12500, + "train_loss_gtc": 0.07723388671875, + "train_loss_gtm": 0.037061309814453124, + "train_loss_lm": 1.611875 + }, + { + "epoch": 3.872157344806392, + "grad_norm": 1.2844618558883667, + "learning_rate": 8.461235647772877e-05, + "loss": 1.7219, + "step": 12600, + "train_loss_gtc": 0.07914794921875, + "train_loss_gtm": 0.039013671875, + "train_loss_lm": 1.610703125 + }, + { + "epoch": 3.9028887523048557, + "grad_norm": 1.2111929655075073, + "learning_rate": 8.437891767643251e-05, + "loss": 1.7278, + "step": 12700, + "train_loss_gtc": 0.078544921875, + "train_loss_gtm": 0.038914642333984374, + "train_loss_lm": 1.610078125 + }, + { + "epoch": 3.933620159803319, + "grad_norm": 0.8348441123962402, + "learning_rate": 8.414404881539443e-05, + "loss": 1.7255, + "step": 12800, + "train_loss_gtc": 0.0780419921875, + "train_loss_gtm": 0.044105224609375, + "train_loss_lm": 1.602890625 + }, + { + "epoch": 3.964351567301782, + "grad_norm": 1.4092820882797241, + "learning_rate": 8.39077596644539e-05, + "loss": 1.7195, + "step": 12900, + "train_loss_gtc": 0.07587646484375, + "train_loss_gtm": 0.0384991455078125, + "train_loss_lm": 1.60671875 + }, + { + "epoch": 3.995082974800246, + "grad_norm": 3.6042683124542236, + "learning_rate": 8.367006005253006e-05, + "loss": 1.7295, + "step": 13000, + "train_loss_gtc": 0.08053466796875, + "train_loss_gtm": 0.04183273315429688, + "train_loss_lm": 1.60703125 + }, + { + "epoch": 4.0, + "eval_loss": 1.9796874523162842, + "eval_runtime": 3.8758, + "eval_samples_per_second": 256.721, + "eval_steps_per_second": 2.838, + "step": 13016, + "train_loss_gtc": 0.079498291015625, + "train_loss_gtm": 0.044208526611328125, + "train_loss_lm": 1.59814453125, + "val_loss_gtc": 0.159912109375, + "val_loss_gtm": 0.183050537109375, + "val_loss_lm": 1.61953125 + }, + { + "epoch": 4.02581438229871, + "grad_norm": 1.8121000528335571, + "learning_rate": 8.343095986721301e-05, + "loss": 1.7206, + "step": 13100, + "train_loss_gtc": 0.07986014229910714, + "train_loss_gtm": 0.055745079403831846, + "train_loss_lm": 1.5932849702380953 + }, + { + "epoch": 4.056545789797172, + "grad_norm": 1.3698956966400146, + "learning_rate": 8.319046905435246e-05, + "loss": 1.7096, + "step": 13200, + "train_loss_gtc": 0.0751611328125, + "train_loss_gtm": 0.042149658203125, + "train_loss_lm": 1.59296875 + }, + { + "epoch": 4.087277197295636, + "grad_norm": 1.6034717559814453, + "learning_rate": 8.294859761764408e-05, + "loss": 1.7046, + "step": 13300, + "train_loss_gtc": 0.072431640625, + "train_loss_gtm": 0.046780548095703124, + "train_loss_lm": 1.58609375 + }, + { + "epoch": 4.1180086047941, + "grad_norm": 1.3316949605941772, + "learning_rate": 8.270535561821336e-05, + "loss": 1.7095, + "step": 13400, + "train_loss_gtc": 0.077919921875, + "train_loss_gtm": 0.04380218505859375, + "train_loss_lm": 1.591953125 + }, + { + "epoch": 4.148740012292563, + "grad_norm": 0.6827447414398193, + "learning_rate": 8.246075317419706e-05, + "loss": 1.7173, + "step": 13500, + "train_loss_gtc": 0.07958740234375, + "train_loss_gtm": 0.0477728271484375, + "train_loss_lm": 1.598125 + }, + { + "epoch": 4.179471419791026, + "grad_norm": 1.5629603862762451, + "learning_rate": 8.221480046032233e-05, + "loss": 1.6964, + "step": 13600, + "train_loss_gtc": 0.07267578125, + "train_loss_gtm": 0.03870758056640625, + "train_loss_lm": 1.582734375 + }, + { + "epoch": 4.21020282728949, + "grad_norm": 1.0047539472579956, + "learning_rate": 8.196750770748355e-05, + "loss": 1.7064, + "step": 13700, + "train_loss_gtc": 0.0744775390625, + "train_loss_gtm": 0.0351849365234375, + "train_loss_lm": 1.5890625 + }, + { + "epoch": 4.240934234787954, + "grad_norm": 0.7914025187492371, + "learning_rate": 8.171888520231666e-05, + "loss": 1.7175, + "step": 13800, + "train_loss_gtc": 0.0821630859375, + "train_loss_gtm": 0.05238189697265625, + "train_loss_lm": 1.58953125 + }, + { + "epoch": 4.2716656422864165, + "grad_norm": 1.05272376537323, + "learning_rate": 8.146894328677128e-05, + "loss": 1.6928, + "step": 13900, + "train_loss_gtc": 0.0702734375, + "train_loss_gtm": 0.033878173828125, + "train_loss_lm": 1.58375 + }, + { + "epoch": 4.30239704978488, + "grad_norm": 1.6808894872665405, + "learning_rate": 8.12176923576806e-05, + "loss": 1.6968, + "step": 14000, + "train_loss_gtc": 0.07257080078125, + "train_loss_gtm": 0.034125747680664065, + "train_loss_lm": 1.585 + }, + { + "epoch": 4.333128457283344, + "grad_norm": 0.815800130367279, + "learning_rate": 8.096514286632879e-05, + "loss": 1.6977, + "step": 14100, + "train_loss_gtc": 0.070693359375, + "train_loss_gtm": 0.030710296630859377, + "train_loss_lm": 1.585390625 + }, + { + "epoch": 4.363859864781807, + "grad_norm": 0.7311274409294128, + "learning_rate": 8.071130531801635e-05, + "loss": 1.7137, + "step": 14200, + "train_loss_gtc": 0.079658203125, + "train_loss_gtm": 0.0499749755859375, + "train_loss_lm": 1.59171875 + }, + { + "epoch": 4.3945912722802705, + "grad_norm": 0.8525009155273438, + "learning_rate": 8.045619027162303e-05, + "loss": 1.6995, + "step": 14300, + "train_loss_gtc": 0.07261474609375, + "train_loss_gtm": 0.03937774658203125, + "train_loss_lm": 1.588359375 + }, + { + "epoch": 4.425322679778734, + "grad_norm": 1.277293086051941, + "learning_rate": 8.019980833916874e-05, + "loss": 1.6991, + "step": 14400, + "train_loss_gtc": 0.07212158203125, + "train_loss_gtm": 0.042982177734375, + "train_loss_lm": 1.585625 + }, + { + "epoch": 4.456054087277197, + "grad_norm": 0.7727832794189453, + "learning_rate": 7.994217018537195e-05, + "loss": 1.6925, + "step": 14500, + "train_loss_gtc": 0.07016845703125, + "train_loss_gtm": 0.041646270751953124, + "train_loss_lm": 1.580859375 + }, + { + "epoch": 4.486785494775661, + "grad_norm": 1.5050898790359497, + "learning_rate": 7.968328652720627e-05, + "loss": 1.6898, + "step": 14600, + "train_loss_gtc": 0.07381591796875, + "train_loss_gtm": 0.035602569580078125, + "train_loss_lm": 1.583203125 + }, + { + "epoch": 4.517516902274124, + "grad_norm": 0.8743451833724976, + "learning_rate": 7.942316813345447e-05, + "loss": 1.6976, + "step": 14700, + "train_loss_gtc": 0.07141357421875, + "train_loss_gtm": 0.03574203491210937, + "train_loss_lm": 1.58015625 + }, + { + "epoch": 4.548248309772587, + "grad_norm": 4.071852684020996, + "learning_rate": 7.916182582426064e-05, + "loss": 1.6793, + "step": 14800, + "train_loss_gtc": 0.065556640625, + "train_loss_gtm": 0.03599624633789063, + "train_loss_lm": 1.578359375 + }, + { + "epoch": 4.578979717271051, + "grad_norm": 1.2412759065628052, + "learning_rate": 7.88992704706801e-05, + "loss": 1.6891, + "step": 14900, + "train_loss_gtc": 0.07296875, + "train_loss_gtm": 0.04167098999023437, + "train_loss_lm": 1.58109375 + }, + { + "epoch": 4.609711124769515, + "grad_norm": 1.0076960325241089, + "learning_rate": 7.863551299422714e-05, + "loss": 1.6928, + "step": 15000, + "train_loss_gtc": 0.07355712890625, + "train_loss_gtm": 0.040593414306640624, + "train_loss_lm": 1.5796875 + }, + { + "epoch": 4.640442532267977, + "grad_norm": 1.8155709505081177, + "learning_rate": 7.837056436642077e-05, + "loss": 1.6972, + "step": 15100, + "train_loss_gtc": 0.07208251953125, + "train_loss_gtm": 0.03765533447265625, + "train_loss_lm": 1.5828125 + }, + { + "epoch": 4.671173939766441, + "grad_norm": 4.2761101722717285, + "learning_rate": 7.810443560832832e-05, + "loss": 1.6779, + "step": 15200, + "train_loss_gtc": 0.0666650390625, + "train_loss_gtm": 0.03232818603515625, + "train_loss_lm": 1.5771875 + }, + { + "epoch": 4.701905347264905, + "grad_norm": 1.0301436185836792, + "learning_rate": 7.783713779010697e-05, + "loss": 1.6814, + "step": 15300, + "train_loss_gtc": 0.0691845703125, + "train_loss_gtm": 0.03757865905761719, + "train_loss_lm": 1.57953125 + }, + { + "epoch": 4.7326367547633685, + "grad_norm": 3.180100679397583, + "learning_rate": 7.756868203054334e-05, + "loss": 1.6773, + "step": 15400, + "train_loss_gtc": 0.06718994140625, + "train_loss_gtm": 0.030146408081054687, + "train_loss_lm": 1.57796875 + }, + { + "epoch": 4.763368162261831, + "grad_norm": 0.845735490322113, + "learning_rate": 7.729907949659089e-05, + "loss": 1.6662, + "step": 15500, + "train_loss_gtc": 0.06385986328125, + "train_loss_gtm": 0.027723541259765627, + "train_loss_lm": 1.573125 + }, + { + "epoch": 4.794099569760295, + "grad_norm": 0.8206067681312561, + "learning_rate": 7.702834140290547e-05, + "loss": 1.6742, + "step": 15600, + "train_loss_gtc": 0.067158203125, + "train_loss_gtm": 0.035130157470703124, + "train_loss_lm": 1.571953125 + }, + { + "epoch": 4.824830977258759, + "grad_norm": 0.7254693508148193, + "learning_rate": 7.675647901137879e-05, + "loss": 1.6833, + "step": 15700, + "train_loss_gtc": 0.06796142578125, + "train_loss_gtm": 0.03723342895507813, + "train_loss_lm": 1.573984375 + }, + { + "epoch": 4.855562384757222, + "grad_norm": 1.2930517196655273, + "learning_rate": 7.648350363066998e-05, + "loss": 1.6783, + "step": 15800, + "train_loss_gtc": 0.0690478515625, + "train_loss_gtm": 0.03417861938476562, + "train_loss_lm": 1.574296875 + }, + { + "epoch": 4.886293792255685, + "grad_norm": 0.46316060423851013, + "learning_rate": 7.620942661573523e-05, + "loss": 1.6772, + "step": 15900, + "train_loss_gtc": 0.0691015625, + "train_loss_gtm": 0.03562210083007813, + "train_loss_lm": 1.568046875 + }, + { + "epoch": 4.917025199754149, + "grad_norm": 1.149546504020691, + "learning_rate": 7.59342593673553e-05, + "loss": 1.668, + "step": 16000, + "train_loss_gtc": 0.0667919921875, + "train_loss_gtm": 0.035623931884765626, + "train_loss_lm": 1.56515625 + }, + { + "epoch": 4.947756607252612, + "grad_norm": 0.4385952949523926, + "learning_rate": 7.56580133316615e-05, + "loss": 1.6674, + "step": 16100, + "train_loss_gtc": 0.06619873046875, + "train_loss_gtm": 0.034098358154296876, + "train_loss_lm": 1.56875 + }, + { + "epoch": 4.9784880147510755, + "grad_norm": 0.670734167098999, + "learning_rate": 7.538069999965934e-05, + "loss": 1.6746, + "step": 16200, + "train_loss_gtc": 0.067392578125, + "train_loss_gtm": 0.04040283203125, + "train_loss_lm": 1.568984375 + }, + { + "epoch": 5.0, + "eval_loss": 1.859765648841858, + "eval_runtime": 3.9207, + "eval_samples_per_second": 253.782, + "eval_steps_per_second": 2.806, + "step": 16270, + "train_loss_gtc": 0.06170131138392857, + "train_loss_gtm": 0.02779693603515625, + "train_loss_lm": 1.574330357142857, + "val_loss_gtc": 0.124658203125, + "val_loss_gtm": 0.1553466796875, + "val_loss_lm": 1.5875 + }, + { + "epoch": 5.009219422249539, + "grad_norm": 0.8819429278373718, + "learning_rate": 7.510233090675076e-05, + "loss": 1.6639, + "step": 16300, + "train_loss_gtc": 0.06167805989583333, + "train_loss_gtm": 0.03746388753255208, + "train_loss_lm": 1.55 + }, + { + "epoch": 5.039950829748003, + "grad_norm": 1.934515118598938, + "learning_rate": 7.482291763225411e-05, + "loss": 1.6471, + "step": 16400, + "train_loss_gtc": 0.0614111328125, + "train_loss_gtm": 0.0246038818359375, + "train_loss_lm": 1.560078125 + }, + { + "epoch": 5.070682237246466, + "grad_norm": 0.4644189476966858, + "learning_rate": 7.454247179892258e-05, + "loss": 1.6539, + "step": 16500, + "train_loss_gtc": 0.06216796875, + "train_loss_gtm": 0.029619293212890627, + "train_loss_lm": 1.55984375 + }, + { + "epoch": 5.101413644744929, + "grad_norm": 0.6986903548240662, + "learning_rate": 7.426100507246073e-05, + "loss": 1.654, + "step": 16600, + "train_loss_gtc": 0.06435791015625, + "train_loss_gtm": 0.029658050537109376, + "train_loss_lm": 1.554609375 + }, + { + "epoch": 5.132145052243393, + "grad_norm": 1.399057149887085, + "learning_rate": 7.397852916103918e-05, + "loss": 1.6514, + "step": 16700, + "train_loss_gtc": 0.06365234375, + "train_loss_gtm": 0.032920303344726565, + "train_loss_lm": 1.556171875 + }, + { + "epoch": 5.162876459741856, + "grad_norm": 1.2212918996810913, + "learning_rate": 7.369505581480761e-05, + "loss": 1.6591, + "step": 16800, + "train_loss_gtc": 0.06535888671875, + "train_loss_gtm": 0.03793792724609375, + "train_loss_lm": 1.55328125 + }, + { + "epoch": 5.19360786724032, + "grad_norm": 1.227950930595398, + "learning_rate": 7.341059682540601e-05, + "loss": 1.6542, + "step": 16900, + "train_loss_gtc": 0.06419921875, + "train_loss_gtm": 0.03715614318847656, + "train_loss_lm": 1.558828125 + }, + { + "epoch": 5.224339274738783, + "grad_norm": 0.7415390610694885, + "learning_rate": 7.312516402547418e-05, + "loss": 1.6535, + "step": 17000, + "train_loss_gtc": 0.06427001953125, + "train_loss_gtm": 0.038449249267578124, + "train_loss_lm": 1.5575 + }, + { + "epoch": 5.255070682237246, + "grad_norm": 0.4711204767227173, + "learning_rate": 7.283876928815944e-05, + "loss": 1.6536, + "step": 17100, + "train_loss_gtc": 0.062666015625, + "train_loss_gtm": 0.032582550048828124, + "train_loss_lm": 1.558359375 + }, + { + "epoch": 5.28580208973571, + "grad_norm": 0.8353385925292969, + "learning_rate": 7.255142452662295e-05, + "loss": 1.6433, + "step": 17200, + "train_loss_gtc": 0.0605859375, + "train_loss_gtm": 0.029074554443359376, + "train_loss_lm": 1.557421875 + }, + { + "epoch": 5.316533497234174, + "grad_norm": 0.8035210371017456, + "learning_rate": 7.226314169354391e-05, + "loss": 1.6511, + "step": 17300, + "train_loss_gtc": 0.0600830078125, + "train_loss_gtm": 0.029854888916015624, + "train_loss_lm": 1.558125 + }, + { + "epoch": 5.347264904732636, + "grad_norm": 1.1498240232467651, + "learning_rate": 7.197393278062251e-05, + "loss": 1.6475, + "step": 17400, + "train_loss_gtc": 0.0640478515625, + "train_loss_gtm": 0.039508514404296875, + "train_loss_lm": 1.549453125 + }, + { + "epoch": 5.3779963122311, + "grad_norm": 0.510197639465332, + "learning_rate": 7.168380981808108e-05, + "loss": 1.6438, + "step": 17500, + "train_loss_gtc": 0.06174072265625, + "train_loss_gtm": 0.0270050048828125, + "train_loss_lm": 1.551328125 + }, + { + "epoch": 5.408727719729564, + "grad_norm": 1.0153284072875977, + "learning_rate": 7.139278487416369e-05, + "loss": 1.6418, + "step": 17600, + "train_loss_gtc": 0.05983154296875, + "train_loss_gtm": 0.031028366088867186, + "train_loss_lm": 1.553203125 + }, + { + "epoch": 5.439459127228027, + "grad_norm": 3.0183732509613037, + "learning_rate": 7.110087005463413e-05, + "loss": 1.6466, + "step": 17700, + "train_loss_gtc": 0.0623046875, + "train_loss_gtm": 0.03655166625976562, + "train_loss_lm": 1.55625 + }, + { + "epoch": 5.47019053472649, + "grad_norm": 0.8955859541893005, + "learning_rate": 7.080807750227229e-05, + "loss": 1.6351, + "step": 17800, + "train_loss_gtc": 0.05905029296875, + "train_loss_gtm": 0.029703750610351562, + "train_loss_lm": 1.544765625 + }, + { + "epoch": 5.500921942224954, + "grad_norm": 0.4188254773616791, + "learning_rate": 7.051441939636915e-05, + "loss": 1.6359, + "step": 17900, + "train_loss_gtc": 0.05901123046875, + "train_loss_gtm": 0.02728248596191406, + "train_loss_lm": 1.54921875 + }, + { + "epoch": 5.531653349723418, + "grad_norm": 0.733801543712616, + "learning_rate": 7.021990795222015e-05, + "loss": 1.6387, + "step": 18000, + "train_loss_gtc": 0.0610791015625, + "train_loss_gtm": 0.033560562133789065, + "train_loss_lm": 1.550390625 + }, + { + "epoch": 5.5623847572218805, + "grad_norm": 1.336493968963623, + "learning_rate": 6.992455542061697e-05, + "loss": 1.6385, + "step": 18100, + "train_loss_gtc": 0.0579931640625, + "train_loss_gtm": 0.0305810546875, + "train_loss_lm": 1.5415625 + }, + { + "epoch": 5.593116164720344, + "grad_norm": 0.9108180999755859, + "learning_rate": 6.962837408733806e-05, + "loss": 1.6326, + "step": 18200, + "train_loss_gtc": 0.0611328125, + "train_loss_gtm": 0.027354583740234376, + "train_loss_lm": 1.54671875 + }, + { + "epoch": 5.623847572218808, + "grad_norm": 0.6176537871360779, + "learning_rate": 6.933137627263747e-05, + "loss": 1.6387, + "step": 18300, + "train_loss_gtc": 0.05935791015625, + "train_loss_gtm": 0.027886962890625, + "train_loss_lm": 1.550859375 + }, + { + "epoch": 5.654578979717271, + "grad_norm": 0.6086634993553162, + "learning_rate": 6.903357433073251e-05, + "loss": 1.6463, + "step": 18400, + "train_loss_gtc": 0.06116943359375, + "train_loss_gtm": 0.029478378295898437, + "train_loss_lm": 1.5525 + }, + { + "epoch": 5.6853103872157345, + "grad_norm": 0.6187928318977356, + "learning_rate": 6.873498064928969e-05, + "loss": 1.6362, + "step": 18500, + "train_loss_gtc": 0.059130859375, + "train_loss_gtm": 0.0319903564453125, + "train_loss_lm": 1.546171875 + }, + { + "epoch": 5.716041794714198, + "grad_norm": 0.7199295163154602, + "learning_rate": 6.843560764890953e-05, + "loss": 1.6304, + "step": 18600, + "train_loss_gtc": 0.0573388671875, + "train_loss_gtm": 0.026798248291015625, + "train_loss_lm": 1.54875 + }, + { + "epoch": 5.746773202212661, + "grad_norm": 0.48185673356056213, + "learning_rate": 6.81354677826099e-05, + "loss": 1.6356, + "step": 18700, + "train_loss_gtc": 0.05507568359375, + "train_loss_gtm": 0.023614425659179688, + "train_loss_lm": 1.552265625 + }, + { + "epoch": 5.777504609711125, + "grad_norm": 0.44653263688087463, + "learning_rate": 6.783457353530797e-05, + "loss": 1.629, + "step": 18800, + "train_loss_gtc": 0.057138671875, + "train_loss_gtm": 0.02418853759765625, + "train_loss_lm": 1.546953125 + }, + { + "epoch": 5.808236017209588, + "grad_norm": 0.6734046936035156, + "learning_rate": 6.75329374233009e-05, + "loss": 1.6311, + "step": 18900, + "train_loss_gtc": 0.05802734375, + "train_loss_gtm": 0.025522842407226562, + "train_loss_lm": 1.54484375 + }, + { + "epoch": 5.838967424708052, + "grad_norm": 1.3264803886413574, + "learning_rate": 6.723057199374518e-05, + "loss": 1.6371, + "step": 19000, + "train_loss_gtc": 0.057373046875, + "train_loss_gtm": 0.030865325927734374, + "train_loss_lm": 1.548671875 + }, + { + "epoch": 5.869698832206515, + "grad_norm": 1.0278254747390747, + "learning_rate": 6.692748982413474e-05, + "loss": 1.6338, + "step": 19100, + "train_loss_gtc": 0.05820068359375, + "train_loss_gtm": 0.030936508178710936, + "train_loss_lm": 1.550859375 + }, + { + "epoch": 5.900430239704979, + "grad_norm": 0.6339811086654663, + "learning_rate": 6.662370352177774e-05, + "loss": 1.6301, + "step": 19200, + "train_loss_gtc": 0.0586328125, + "train_loss_gtm": 0.02888214111328125, + "train_loss_lm": 1.5375 + }, + { + "epoch": 5.931161647203442, + "grad_norm": 0.892484724521637, + "learning_rate": 6.631922572327213e-05, + "loss": 1.6294, + "step": 19300, + "train_loss_gtc": 0.05766357421875, + "train_loss_gtm": 0.03270828247070313, + "train_loss_lm": 1.542890625 + }, + { + "epoch": 5.961893054701905, + "grad_norm": 0.8719256520271301, + "learning_rate": 6.601406909398007e-05, + "loss": 1.6334, + "step": 19400, + "train_loss_gtc": 0.05709716796875, + "train_loss_gtm": 0.034244384765625, + "train_loss_lm": 1.544453125 + }, + { + "epoch": 5.992624462200369, + "grad_norm": 0.5898253917694092, + "learning_rate": 6.570824632750099e-05, + "loss": 1.6308, + "step": 19500, + "train_loss_gtc": 0.05718017578125, + "train_loss_gtm": 0.028017425537109376, + "train_loss_lm": 1.5475 + }, + { + "epoch": 6.0, + "eval_loss": 1.8289062976837158, + "eval_runtime": 3.955, + "eval_samples_per_second": 251.581, + "eval_steps_per_second": 2.781, + "step": 19524, + "train_loss_gtc": 0.059417724609375, + "train_loss_gtm": 0.021376291910807293, + "train_loss_lm": 1.5485026041666667, + "val_loss_gtc": 0.1081298828125, + "val_loss_gtm": 0.14790267944335939, + "val_loss_lm": 1.559375 + }, + { + "epoch": 6.0233558696988325, + "grad_norm": 0.5566962361335754, + "learning_rate": 6.540177014514361e-05, + "loss": 1.6229, + "step": 19600, + "train_loss_gtc": 0.05809583162006579, + "train_loss_gtm": 0.027290545011821547, + "train_loss_lm": 1.5270353618421053 + }, + { + "epoch": 6.054087277197295, + "grad_norm": 0.7923322319984436, + "learning_rate": 6.509465329539689e-05, + "loss": 1.6096, + "step": 19700, + "train_loss_gtc": 0.0559228515625, + "train_loss_gtm": 0.02381988525390625, + "train_loss_lm": 1.527734375 + }, + { + "epoch": 6.084818684695759, + "grad_norm": 0.633963942527771, + "learning_rate": 6.478690855339953e-05, + "loss": 1.6261, + "step": 19800, + "train_loss_gtc": 0.0565234375, + "train_loss_gtm": 0.03267807006835938, + "train_loss_lm": 1.532734375 + }, + { + "epoch": 6.115550092194223, + "grad_norm": 0.9739740490913391, + "learning_rate": 6.44785487204087e-05, + "loss": 1.6157, + "step": 19900, + "train_loss_gtc": 0.05468017578125, + "train_loss_gtm": 0.029143524169921876, + "train_loss_lm": 1.53 + }, + { + "epoch": 6.146281499692686, + "grad_norm": 1.191219449043274, + "learning_rate": 6.416958662326749e-05, + "loss": 1.6127, + "step": 20000, + "train_loss_gtc": 0.05240966796875, + "train_loss_gtm": 0.02735198974609375, + "train_loss_lm": 1.531171875 + }, + { + "epoch": 6.177012907191149, + "grad_norm": 0.9735581278800964, + "learning_rate": 6.38600351138714e-05, + "loss": 1.6113, + "step": 20100, + "train_loss_gtc": 0.0530419921875, + "train_loss_gtm": 0.027030487060546875, + "train_loss_lm": 1.5346875 + }, + { + "epoch": 6.207744314689613, + "grad_norm": 1.2206913232803345, + "learning_rate": 6.35499070686337e-05, + "loss": 1.6212, + "step": 20200, + "train_loss_gtc": 0.055166015625, + "train_loss_gtm": 0.026912918090820314, + "train_loss_lm": 1.535625 + }, + { + "epoch": 6.238475722188076, + "grad_norm": 0.8422713279724121, + "learning_rate": 6.323921538794981e-05, + "loss": 1.6118, + "step": 20300, + "train_loss_gtc": 0.05383056640625, + "train_loss_gtm": 0.029865264892578125, + "train_loss_lm": 1.52765625 + }, + { + "epoch": 6.2692071296865395, + "grad_norm": 1.286847472190857, + "learning_rate": 6.292797299566072e-05, + "loss": 1.6112, + "step": 20400, + "train_loss_gtc": 0.055625, + "train_loss_gtm": 0.0314874267578125, + "train_loss_lm": 1.525234375 + }, + { + "epoch": 6.299938537185003, + "grad_norm": 0.5895647406578064, + "learning_rate": 6.261619283851527e-05, + "loss": 1.6021, + "step": 20500, + "train_loss_gtc": 0.050849609375, + "train_loss_gtm": 0.027188568115234374, + "train_loss_lm": 1.52734375 + }, + { + "epoch": 6.330669944683467, + "grad_norm": 0.6928810477256775, + "learning_rate": 6.230388788563187e-05, + "loss": 1.6008, + "step": 20600, + "train_loss_gtc": 0.05047119140625, + "train_loss_gtm": 0.02188018798828125, + "train_loss_lm": 1.530703125 + }, + { + "epoch": 6.36140135218193, + "grad_norm": 1.0124385356903076, + "learning_rate": 6.199107112795872e-05, + "loss": 1.6071, + "step": 20700, + "train_loss_gtc": 0.05262939453125, + "train_loss_gtm": 0.028003463745117186, + "train_loss_lm": 1.52765625 + }, + { + "epoch": 6.392132759680393, + "grad_norm": 1.7495094537734985, + "learning_rate": 6.167775557773363e-05, + "loss": 1.6069, + "step": 20800, + "train_loss_gtc": 0.0532470703125, + "train_loss_gtm": 0.027218780517578124, + "train_loss_lm": 1.525546875 + }, + { + "epoch": 6.422864167178857, + "grad_norm": 0.7303450703620911, + "learning_rate": 6.136395426794261e-05, + "loss": 1.5961, + "step": 20900, + "train_loss_gtc": 0.04982177734375, + "train_loss_gtm": 0.019435043334960937, + "train_loss_lm": 1.521875 + }, + { + "epoch": 6.45359557467732, + "grad_norm": 0.797379732131958, + "learning_rate": 6.104968025177791e-05, + "loss": 1.607, + "step": 21000, + "train_loss_gtc": 0.0555908203125, + "train_loss_gtm": 0.024838104248046874, + "train_loss_lm": 1.529375 + }, + { + "epoch": 6.484326982175784, + "grad_norm": 0.5462325811386108, + "learning_rate": 6.073494660209491e-05, + "loss": 1.6088, + "step": 21100, + "train_loss_gtc": 0.0543115234375, + "train_loss_gtm": 0.03119972229003906, + "train_loss_lm": 1.52734375 + }, + { + "epoch": 6.515058389674247, + "grad_norm": 0.46476686000823975, + "learning_rate": 6.0419766410868294e-05, + "loss": 1.6075, + "step": 21200, + "train_loss_gtc": 0.05191650390625, + "train_loss_gtm": 0.027312164306640626, + "train_loss_lm": 1.5278125 + }, + { + "epoch": 6.54578979717271, + "grad_norm": 0.6704521179199219, + "learning_rate": 6.010415278864762e-05, + "loss": 1.6081, + "step": 21300, + "train_loss_gtc": 0.05267822265625, + "train_loss_gtm": 0.025740814208984376, + "train_loss_lm": 1.522734375 + }, + { + "epoch": 6.576521204671174, + "grad_norm": 0.513566792011261, + "learning_rate": 5.978811886401183e-05, + "loss": 1.6077, + "step": 21400, + "train_loss_gtc": 0.05446533203125, + "train_loss_gtm": 0.03446975708007813, + "train_loss_lm": 1.523359375 + }, + { + "epoch": 6.6072526121696376, + "grad_norm": 1.2353570461273193, + "learning_rate": 5.947167778302323e-05, + "loss": 1.5954, + "step": 21500, + "train_loss_gtc": 0.04914306640625, + "train_loss_gtm": 0.019849777221679688, + "train_loss_lm": 1.52546875 + }, + { + "epoch": 6.637984019668101, + "grad_norm": 2.1153972148895264, + "learning_rate": 5.9154842708680544e-05, + "loss": 1.6048, + "step": 21600, + "train_loss_gtc": 0.052568359375, + "train_loss_gtm": 0.028261795043945312, + "train_loss_lm": 1.52453125 + }, + { + "epoch": 6.668715427166564, + "grad_norm": 1.2410842180252075, + "learning_rate": 5.8837626820371486e-05, + "loss": 1.6103, + "step": 21700, + "train_loss_gtc": 0.0537890625, + "train_loss_gtm": 0.027198944091796875, + "train_loss_lm": 1.52640625 + }, + { + "epoch": 6.699446834665028, + "grad_norm": 0.39238986372947693, + "learning_rate": 5.852004331332443e-05, + "loss": 1.6068, + "step": 21800, + "train_loss_gtc": 0.05417724609375, + "train_loss_gtm": 0.025730323791503907, + "train_loss_lm": 1.5265625 + }, + { + "epoch": 6.7301782421634915, + "grad_norm": 0.8881044983863831, + "learning_rate": 5.820210539805968e-05, + "loss": 1.5946, + "step": 21900, + "train_loss_gtc": 0.0499072265625, + "train_loss_gtm": 0.019407730102539062, + "train_loss_lm": 1.521875 + }, + { + "epoch": 6.760909649661954, + "grad_norm": 0.5124359130859375, + "learning_rate": 5.788382629983977e-05, + "loss": 1.612, + "step": 22000, + "train_loss_gtc": 0.0574853515625, + "train_loss_gtm": 0.031860885620117185, + "train_loss_lm": 1.523984375 + }, + { + "epoch": 6.791641057160418, + "grad_norm": 0.6098849773406982, + "learning_rate": 5.7565219258119455e-05, + "loss": 1.5961, + "step": 22100, + "train_loss_gtc": 0.05323974609375, + "train_loss_gtm": 0.02882041931152344, + "train_loss_lm": 1.521953125 + }, + { + "epoch": 6.822372464658882, + "grad_norm": 1.027600884437561, + "learning_rate": 5.724629752599495e-05, + "loss": 1.5928, + "step": 22200, + "train_loss_gtc": 0.0508203125, + "train_loss_gtm": 0.02281818389892578, + "train_loss_lm": 1.52421875 + }, + { + "epoch": 6.8531038721573445, + "grad_norm": 1.004398226737976, + "learning_rate": 5.692707436965267e-05, + "loss": 1.5929, + "step": 22300, + "train_loss_gtc": 0.04905517578125, + "train_loss_gtm": 0.025001983642578125, + "train_loss_lm": 1.521328125 + }, + { + "epoch": 6.883835279655808, + "grad_norm": 0.8874416351318359, + "learning_rate": 5.660756306781733e-05, + "loss": 1.5983, + "step": 22400, + "train_loss_gtc": 0.04990234375, + "train_loss_gtm": 0.025154190063476564, + "train_loss_lm": 1.52375 + }, + { + "epoch": 6.914566687154272, + "grad_norm": 0.5866090059280396, + "learning_rate": 5.628777691119965e-05, + "loss": 1.5958, + "step": 22500, + "train_loss_gtc": 0.0502880859375, + "train_loss_gtm": 0.024204254150390625, + "train_loss_lm": 1.521328125 + }, + { + "epoch": 6.945298094652735, + "grad_norm": 0.48130372166633606, + "learning_rate": 5.59677292019435e-05, + "loss": 1.594, + "step": 22600, + "train_loss_gtc": 0.05079833984375, + "train_loss_gtm": 0.02796661376953125, + "train_loss_lm": 1.51875 + }, + { + "epoch": 6.976029502151198, + "grad_norm": 0.6554698944091797, + "learning_rate": 5.564743325307254e-05, + "loss": 1.5964, + "step": 22700, + "train_loss_gtc": 0.0513427734375, + "train_loss_gtm": 0.025988922119140626, + "train_loss_lm": 1.521171875 + }, + { + "epoch": 7.0, + "eval_loss": 1.8093750476837158, + "eval_runtime": 3.9611, + "eval_samples_per_second": 251.194, + "eval_steps_per_second": 2.777, + "step": 22778, + "train_loss_gtc": 0.051851712740384616, + "train_loss_gtm": 0.024179898775540866, + "train_loss_lm": 1.5157251602564104, + "val_loss_gtc": 0.11328125, + "val_loss_gtm": 0.15882568359375, + "val_loss_lm": 1.53984375 + }, + { + "epoch": 7.006760909649662, + "grad_norm": 1.2214024066925049, + "learning_rate": 5.5326902387936454e-05, + "loss": 1.5932, + "step": 22800, + "train_loss_gtc": 0.04629794034090909, + "train_loss_gtm": 0.011040774258700285, + "train_loss_lm": 1.5095880681818181 + }, + { + "epoch": 7.037492317148125, + "grad_norm": 0.78125, + "learning_rate": 5.500614993965673e-05, + "loss": 1.5774, + "step": 22900, + "train_loss_gtc": 0.048642578125, + "train_loss_gtm": 0.028121871948242186, + "train_loss_lm": 1.504296875 + }, + { + "epoch": 7.068223724646589, + "grad_norm": 0.5814157724380493, + "learning_rate": 5.468518925057203e-05, + "loss": 1.5826, + "step": 23000, + "train_loss_gtc": 0.049710693359375, + "train_loss_gtm": 0.02605010986328125, + "train_loss_lm": 1.508125 + }, + { + "epoch": 7.098955132145052, + "grad_norm": 0.7798097133636475, + "learning_rate": 5.4364033671683304e-05, + "loss": 1.5849, + "step": 23100, + "train_loss_gtc": 0.049805908203125, + "train_loss_gtm": 0.024519424438476562, + "train_loss_lm": 1.512890625 + }, + { + "epoch": 7.129686539643516, + "grad_norm": 0.8778783679008484, + "learning_rate": 5.404269656209819e-05, + "loss": 1.5775, + "step": 23200, + "train_loss_gtc": 0.04724853515625, + "train_loss_gtm": 0.021280136108398438, + "train_loss_lm": 1.509140625 + }, + { + "epoch": 7.160417947141979, + "grad_norm": 0.8768311142921448, + "learning_rate": 5.3721191288475595e-05, + "loss": 1.5768, + "step": 23300, + "train_loss_gtc": 0.0488720703125, + "train_loss_gtm": 0.020770683288574218, + "train_loss_lm": 1.50484375 + }, + { + "epoch": 7.191149354640443, + "grad_norm": 1.3236780166625977, + "learning_rate": 5.3399531224469424e-05, + "loss": 1.5761, + "step": 23400, + "train_loss_gtc": 0.047967529296875, + "train_loss_gtm": 0.016504249572753905, + "train_loss_lm": 1.507578125 + }, + { + "epoch": 7.221880762138906, + "grad_norm": 0.4845696985721588, + "learning_rate": 5.307772975017249e-05, + "loss": 1.58, + "step": 23500, + "train_loss_gtc": 0.04843017578125, + "train_loss_gtm": 0.021038818359375, + "train_loss_lm": 1.51203125 + }, + { + "epoch": 7.252612169637369, + "grad_norm": 0.6816074848175049, + "learning_rate": 5.2755800251559794e-05, + "loss": 1.5807, + "step": 23600, + "train_loss_gtc": 0.0488525390625, + "train_loss_gtm": 0.025988388061523437, + "train_loss_lm": 1.50859375 + }, + { + "epoch": 7.283343577135833, + "grad_norm": 0.8071028590202332, + "learning_rate": 5.24337561199318e-05, + "loss": 1.5757, + "step": 23700, + "train_loss_gtc": 0.0470068359375, + "train_loss_gtm": 0.02268218994140625, + "train_loss_lm": 1.510703125 + }, + { + "epoch": 7.3140749846342965, + "grad_norm": 1.132927656173706, + "learning_rate": 5.211161075135733e-05, + "loss": 1.5746, + "step": 23800, + "train_loss_gtc": 0.04585205078125, + "train_loss_gtm": 0.020586471557617187, + "train_loss_lm": 1.508203125 + }, + { + "epoch": 7.344806392132759, + "grad_norm": 0.6981713771820068, + "learning_rate": 5.178937754611637e-05, + "loss": 1.5759, + "step": 23900, + "train_loss_gtc": 0.045491943359375, + "train_loss_gtm": 0.0174371337890625, + "train_loss_lm": 1.508671875 + }, + { + "epoch": 7.375537799631223, + "grad_norm": 0.689810574054718, + "learning_rate": 5.1467069908142684e-05, + "loss": 1.5719, + "step": 24000, + "train_loss_gtc": 0.046361083984375, + "train_loss_gtm": 0.02007720947265625, + "train_loss_lm": 1.50734375 + }, + { + "epoch": 7.406269207129687, + "grad_norm": 0.5761317610740662, + "learning_rate": 5.1144701244466144e-05, + "loss": 1.5774, + "step": 24100, + "train_loss_gtc": 0.047037353515625, + "train_loss_gtm": 0.025342483520507813, + "train_loss_lm": 1.505390625 + }, + { + "epoch": 7.43700061462815, + "grad_norm": 0.9547802805900574, + "learning_rate": 5.082228496465517e-05, + "loss": 1.5723, + "step": 24200, + "train_loss_gtc": 0.046898193359375, + "train_loss_gtm": 0.019998626708984377, + "train_loss_lm": 1.5040625 + }, + { + "epoch": 7.467732022126613, + "grad_norm": 1.58182954788208, + "learning_rate": 5.049983448025881e-05, + "loss": 1.5752, + "step": 24300, + "train_loss_gtc": 0.047181396484375, + "train_loss_gtm": 0.019326019287109374, + "train_loss_lm": 1.5034375 + }, + { + "epoch": 7.498463429625077, + "grad_norm": 1.1392496824264526, + "learning_rate": 5.0177363204249016e-05, + "loss": 1.567, + "step": 24400, + "train_loss_gtc": 0.0444873046875, + "train_loss_gtm": 0.02104278564453125, + "train_loss_lm": 1.503828125 + }, + { + "epoch": 7.529194837123541, + "grad_norm": 0.9969751238822937, + "learning_rate": 4.985488455046249e-05, + "loss": 1.5918, + "step": 24500, + "train_loss_gtc": 0.05201904296875, + "train_loss_gtm": 0.026438446044921876, + "train_loss_lm": 1.50671875 + }, + { + "epoch": 7.5599262446220035, + "grad_norm": 0.6485080122947693, + "learning_rate": 4.953241193304291e-05, + "loss": 1.5678, + "step": 24600, + "train_loss_gtc": 0.04556884765625, + "train_loss_gtm": 0.01871406555175781, + "train_loss_lm": 1.50484375 + }, + { + "epoch": 7.590657652120467, + "grad_norm": 0.5488921403884888, + "learning_rate": 4.920995876588286e-05, + "loss": 1.5709, + "step": 24700, + "train_loss_gtc": 0.045516357421875, + "train_loss_gtm": 0.017727508544921874, + "train_loss_lm": 1.507890625 + }, + { + "epoch": 7.621389059618931, + "grad_norm": 1.2782403230667114, + "learning_rate": 4.888753846206578e-05, + "loss": 1.5708, + "step": 24800, + "train_loss_gtc": 0.045699462890625, + "train_loss_gtm": 0.019001045227050782, + "train_loss_lm": 1.5021875 + }, + { + "epoch": 7.652120467117394, + "grad_norm": 1.2111992835998535, + "learning_rate": 4.856516443330818e-05, + "loss": 1.5671, + "step": 24900, + "train_loss_gtc": 0.04524169921875, + "train_loss_gtm": 0.015474700927734375, + "train_loss_lm": 1.50671875 + }, + { + "epoch": 7.682851874615857, + "grad_norm": 0.9381042122840881, + "learning_rate": 4.824285008940159e-05, + "loss": 1.5682, + "step": 25000, + "train_loss_gtc": 0.04477783203125, + "train_loss_gtm": 0.016591415405273438, + "train_loss_lm": 1.50328125 + }, + { + "epoch": 7.713583282114321, + "grad_norm": 0.41880643367767334, + "learning_rate": 4.79206088376549e-05, + "loss": 1.5699, + "step": 25100, + "train_loss_gtc": 0.04564697265625, + "train_loss_gtm": 0.022302398681640623, + "train_loss_lm": 1.50265625 + }, + { + "epoch": 7.744314689612784, + "grad_norm": 0.41994112730026245, + "learning_rate": 4.7598454082336525e-05, + "loss": 1.5593, + "step": 25200, + "train_loss_gtc": 0.0431494140625, + "train_loss_gtm": 0.01353099822998047, + "train_loss_lm": 1.501328125 + }, + { + "epoch": 7.775046097111248, + "grad_norm": 0.41959813237190247, + "learning_rate": 4.727639922411693e-05, + "loss": 1.5675, + "step": 25300, + "train_loss_gtc": 0.045030517578125, + "train_loss_gtm": 0.018340682983398436, + "train_loss_lm": 1.498359375 + }, + { + "epoch": 7.805777504609711, + "grad_norm": 1.3286911249160767, + "learning_rate": 4.695445765951113e-05, + "loss": 1.5671, + "step": 25400, + "train_loss_gtc": 0.044442138671875, + "train_loss_gtm": 0.017482261657714843, + "train_loss_lm": 1.50640625 + }, + { + "epoch": 7.836508912108174, + "grad_norm": 0.5046520233154297, + "learning_rate": 4.6632642780321506e-05, + "loss": 1.5625, + "step": 25500, + "train_loss_gtc": 0.04425048828125, + "train_loss_gtm": 0.01410266876220703, + "train_loss_lm": 1.501953125 + }, + { + "epoch": 7.867240319606638, + "grad_norm": 0.7728056907653809, + "learning_rate": 4.631096797308068e-05, + "loss": 1.5739, + "step": 25600, + "train_loss_gtc": 0.048016357421875, + "train_loss_gtm": 0.026591949462890625, + "train_loss_lm": 1.502890625 + }, + { + "epoch": 7.8979717271051015, + "grad_norm": 0.549649178981781, + "learning_rate": 4.598944661849467e-05, + "loss": 1.5654, + "step": 25700, + "train_loss_gtc": 0.045203857421875, + "train_loss_gtm": 0.019275131225585936, + "train_loss_lm": 1.500703125 + }, + { + "epoch": 7.928703134603564, + "grad_norm": 0.4454677999019623, + "learning_rate": 4.566809209088641e-05, + "loss": 1.5661, + "step": 25800, + "train_loss_gtc": 0.044942626953125, + "train_loss_gtm": 0.01573017120361328, + "train_loss_lm": 1.50234375 + }, + { + "epoch": 7.959434542102028, + "grad_norm": 0.5023268461227417, + "learning_rate": 4.534691775763923e-05, + "loss": 1.5643, + "step": 25900, + "train_loss_gtc": 0.045194091796875, + "train_loss_gtm": 0.020731773376464844, + "train_loss_lm": 1.498359375 + }, + { + "epoch": 7.990165949600492, + "grad_norm": 0.4675215780735016, + "learning_rate": 4.5025936978640993e-05, + "loss": 1.5646, + "step": 26000, + "train_loss_gtc": 0.04420166015625, + "train_loss_gtm": 0.0233331298828125, + "train_loss_lm": 1.50140625 + }, + { + "epoch": 8.0, + "eval_loss": 1.7390625476837158, + "eval_runtime": 3.9419, + "eval_samples_per_second": 252.418, + "eval_steps_per_second": 2.791, + "step": 26032, + "train_loss_gtc": 0.043849945068359375, + "train_loss_gtm": 0.01909458637237549, + "train_loss_lm": 1.50341796875, + "val_loss_gtc": 0.085546875, + "val_loss_gtm": 0.1235870361328125, + "val_loss_lm": 1.51875 + }, + { + "epoch": 8.020897357098955, + "grad_norm": 0.9887075424194336, + "learning_rate": 4.470516310572825e-05, + "loss": 1.5523, + "step": 26100, + "train_loss_gtc": 0.04299388212316176, + "train_loss_gtm": 0.015683286330279184, + "train_loss_lm": 1.4872472426470589 + }, + { + "epoch": 8.05162876459742, + "grad_norm": 0.7514944076538086, + "learning_rate": 4.43846094821309e-05, + "loss": 1.5613, + "step": 26200, + "train_loss_gtc": 0.04583251953125, + "train_loss_gtm": 0.026337127685546875, + "train_loss_lm": 1.490859375 + }, + { + "epoch": 8.082360172095882, + "grad_norm": 1.092617154121399, + "learning_rate": 4.406428944191709e-05, + "loss": 1.5533, + "step": 26300, + "train_loss_gtc": 0.04384765625, + "train_loss_gtm": 0.016444091796875, + "train_loss_lm": 1.488046875 + }, + { + "epoch": 8.113091579594345, + "grad_norm": 1.1750010251998901, + "learning_rate": 4.374421630943868e-05, + "loss": 1.5543, + "step": 26400, + "train_loss_gtc": 0.043507080078125, + "train_loss_gtm": 0.018485107421875, + "train_loss_lm": 1.493203125 + }, + { + "epoch": 8.14382298709281, + "grad_norm": 0.5995994806289673, + "learning_rate": 4.3424403398776835e-05, + "loss": 1.5558, + "step": 26500, + "train_loss_gtc": 0.045775146484375, + "train_loss_gtm": 0.0213360595703125, + "train_loss_lm": 1.486953125 + }, + { + "epoch": 8.174554394591272, + "grad_norm": 0.40138596296310425, + "learning_rate": 4.310486401318829e-05, + "loss": 1.5414, + "step": 26600, + "train_loss_gtc": 0.04089599609375, + "train_loss_gtm": 0.011089859008789062, + "train_loss_lm": 1.488828125 + }, + { + "epoch": 8.205285802089735, + "grad_norm": 0.4291875958442688, + "learning_rate": 4.278561144455199e-05, + "loss": 1.5511, + "step": 26700, + "train_loss_gtc": 0.0429052734375, + "train_loss_gtm": 0.014610671997070312, + "train_loss_lm": 1.4884375 + }, + { + "epoch": 8.2360172095882, + "grad_norm": 0.5274336934089661, + "learning_rate": 4.246665897281612e-05, + "loss": 1.5493, + "step": 26800, + "train_loss_gtc": 0.04279296875, + "train_loss_gtm": 0.015193328857421876, + "train_loss_lm": 1.49359375 + }, + { + "epoch": 8.266748617086662, + "grad_norm": 0.7654374837875366, + "learning_rate": 4.214801986544575e-05, + "loss": 1.5566, + "step": 26900, + "train_loss_gtc": 0.042926025390625, + "train_loss_gtm": 0.018515548706054687, + "train_loss_lm": 1.49296875 + }, + { + "epoch": 8.297480024585125, + "grad_norm": 0.9065292477607727, + "learning_rate": 4.182970737687093e-05, + "loss": 1.5538, + "step": 27000, + "train_loss_gtc": 0.04357177734375, + "train_loss_gtm": 0.016671829223632813, + "train_loss_lm": 1.491875 + }, + { + "epoch": 8.32821143208359, + "grad_norm": 1.0985864400863647, + "learning_rate": 4.151173474793534e-05, + "loss": 1.5566, + "step": 27100, + "train_loss_gtc": 0.045074462890625, + "train_loss_gtm": 0.02417022705078125, + "train_loss_lm": 1.488515625 + }, + { + "epoch": 8.358942839582053, + "grad_norm": 0.43155065178871155, + "learning_rate": 4.1194115205345574e-05, + "loss": 1.5593, + "step": 27200, + "train_loss_gtc": 0.04392822265625, + "train_loss_gtm": 0.024323196411132814, + "train_loss_lm": 1.490078125 + }, + { + "epoch": 8.389674247080515, + "grad_norm": 0.6603362560272217, + "learning_rate": 4.0876861961120806e-05, + "loss": 1.5456, + "step": 27300, + "train_loss_gtc": 0.043385009765625, + "train_loss_gtm": 0.011190872192382812, + "train_loss_lm": 1.486015625 + }, + { + "epoch": 8.42040565457898, + "grad_norm": 0.5204278826713562, + "learning_rate": 4.055998821204337e-05, + "loss": 1.5511, + "step": 27400, + "train_loss_gtc": 0.04381103515625, + "train_loss_gtm": 0.017749443054199218, + "train_loss_lm": 1.491953125 + }, + { + "epoch": 8.451137062077443, + "grad_norm": 0.7329652309417725, + "learning_rate": 4.024350713910969e-05, + "loss": 1.5452, + "step": 27500, + "train_loss_gtc": 0.041251220703125, + "train_loss_gtm": 0.012794952392578125, + "train_loss_lm": 1.48953125 + }, + { + "epoch": 8.481868469575907, + "grad_norm": 1.1227164268493652, + "learning_rate": 3.9927431906982095e-05, + "loss": 1.5508, + "step": 27600, + "train_loss_gtc": 0.04261962890625, + "train_loss_gtm": 0.01765655517578125, + "train_loss_lm": 1.48875 + }, + { + "epoch": 8.51259987707437, + "grad_norm": 0.6496936678886414, + "learning_rate": 3.9611775663441094e-05, + "loss": 1.5491, + "step": 27700, + "train_loss_gtc": 0.04417724609375, + "train_loss_gtm": 0.023344078063964845, + "train_loss_lm": 1.48734375 + }, + { + "epoch": 8.543331284572833, + "grad_norm": 0.4676097333431244, + "learning_rate": 3.92965515388386e-05, + "loss": 1.5494, + "step": 27800, + "train_loss_gtc": 0.0420458984375, + "train_loss_gtm": 0.020555419921875, + "train_loss_lm": 1.48703125 + }, + { + "epoch": 8.574062692071298, + "grad_norm": 1.0823791027069092, + "learning_rate": 3.8981772645551595e-05, + "loss": 1.5512, + "step": 27900, + "train_loss_gtc": 0.042501220703125, + "train_loss_gtm": 0.022169036865234373, + "train_loss_lm": 1.4890625 + }, + { + "epoch": 8.60479409956976, + "grad_norm": 0.40729042887687683, + "learning_rate": 3.866745207743683e-05, + "loss": 1.543, + "step": 28000, + "train_loss_gtc": 0.03969482421875, + "train_loss_gtm": 0.009343986511230468, + "train_loss_lm": 1.487421875 + }, + { + "epoch": 8.635525507068223, + "grad_norm": 1.4600690603256226, + "learning_rate": 3.835360290928612e-05, + "loss": 1.549, + "step": 28100, + "train_loss_gtc": 0.04197265625, + "train_loss_gtm": 0.016862869262695312, + "train_loss_lm": 1.484921875 + }, + { + "epoch": 8.666256914566688, + "grad_norm": 0.43790164589881897, + "learning_rate": 3.8040238196282395e-05, + "loss": 1.5401, + "step": 28200, + "train_loss_gtc": 0.03960205078125, + "train_loss_gtm": 0.01627326965332031, + "train_loss_lm": 1.482890625 + }, + { + "epoch": 8.69698832206515, + "grad_norm": 0.4079265892505646, + "learning_rate": 3.772737097345676e-05, + "loss": 1.5519, + "step": 28300, + "train_loss_gtc": 0.04443603515625, + "train_loss_gtm": 0.01917346954345703, + "train_loss_lm": 1.486328125 + }, + { + "epoch": 8.727719729563614, + "grad_norm": 2.1502716541290283, + "learning_rate": 3.741501425514618e-05, + "loss": 1.5453, + "step": 28400, + "train_loss_gtc": 0.04140380859375, + "train_loss_gtm": 0.016539077758789062, + "train_loss_lm": 1.489453125 + }, + { + "epoch": 8.758451137062078, + "grad_norm": 2.0536539554595947, + "learning_rate": 3.710318103445223e-05, + "loss": 1.5478, + "step": 28500, + "train_loss_gtc": 0.04205078125, + "train_loss_gtm": 0.019853744506835937, + "train_loss_lm": 1.48765625 + }, + { + "epoch": 8.789182544560541, + "grad_norm": 0.8067043423652649, + "learning_rate": 3.6791884282700464e-05, + "loss": 1.5401, + "step": 28600, + "train_loss_gtc": 0.042589111328125, + "train_loss_gtm": 0.01223979949951172, + "train_loss_lm": 1.487265625 + }, + { + "epoch": 8.819913952059004, + "grad_norm": 1.0549793243408203, + "learning_rate": 3.6481136948901016e-05, + "loss": 1.5449, + "step": 28700, + "train_loss_gtc": 0.039984130859375, + "train_loss_gtm": 0.013403701782226562, + "train_loss_lm": 1.4865625 + }, + { + "epoch": 8.850645359557468, + "grad_norm": 0.3913937211036682, + "learning_rate": 3.617095195920983e-05, + "loss": 1.5392, + "step": 28800, + "train_loss_gtc": 0.038916015625, + "train_loss_gtm": 0.014700355529785157, + "train_loss_lm": 1.48515625 + }, + { + "epoch": 8.881376767055931, + "grad_norm": 0.6485953330993652, + "learning_rate": 3.5861342216391083e-05, + "loss": 1.5398, + "step": 28900, + "train_loss_gtc": 0.0403515625, + "train_loss_gtm": 0.008778877258300781, + "train_loss_lm": 1.48671875 + }, + { + "epoch": 8.912108174554394, + "grad_norm": 0.42979031801223755, + "learning_rate": 3.555232059928037e-05, + "loss": 1.5443, + "step": 29000, + "train_loss_gtc": 0.040491943359375, + "train_loss_gtm": 0.020406494140625, + "train_loss_lm": 1.487421875 + }, + { + "epoch": 8.942839582052859, + "grad_norm": 0.4814371168613434, + "learning_rate": 3.524389996224899e-05, + "loss": 1.5404, + "step": 29100, + "train_loss_gtc": 0.038388671875, + "train_loss_gtm": 0.01521839141845703, + "train_loss_lm": 1.486171875 + }, + { + "epoch": 8.973570989551321, + "grad_norm": 1.2739533185958862, + "learning_rate": 3.4936093134669375e-05, + "loss": 1.5411, + "step": 29200, + "train_loss_gtc": 0.04125, + "train_loss_gtm": 0.01296173095703125, + "train_loss_lm": 1.482421875 + }, + { + "epoch": 9.0, + "eval_loss": 1.7078125476837158, + "eval_runtime": 3.925, + "eval_samples_per_second": 253.505, + "eval_steps_per_second": 2.803, + "step": 29286, + "train_loss_gtc": 0.03917747320130814, + "train_loss_gtm": 0.016063379686932232, + "train_loss_lm": 1.4876453488372092, + "val_loss_gtc": 0.0817138671875, + "val_loss_gtm": 0.13273239135742188, + "val_loss_lm": 1.5046875 + }, + { + "epoch": 9.004302397049784, + "grad_norm": 0.49189862608909607, + "learning_rate": 3.4628912920381206e-05, + "loss": 1.5363, + "step": 29300, + "train_loss_gtc": 0.03465053013392857, + "train_loss_gtm": 0.006429399762834821, + "train_loss_lm": 1.4709821428571428 + }, + { + "epoch": 9.035033804548249, + "grad_norm": 0.4804949164390564, + "learning_rate": 3.432237209715904e-05, + "loss": 1.5311, + "step": 29400, + "train_loss_gtc": 0.040050048828125, + "train_loss_gtm": 0.015148849487304687, + "train_loss_lm": 1.471484375 + }, + { + "epoch": 9.065765212046712, + "grad_norm": 2.241997241973877, + "learning_rate": 3.40164834161806e-05, + "loss": 1.5311, + "step": 29500, + "train_loss_gtc": 0.039757080078125, + "train_loss_gtm": 0.017725067138671877, + "train_loss_lm": 1.476484375 + }, + { + "epoch": 9.096496619545174, + "grad_norm": 0.48794323205947876, + "learning_rate": 3.371125960149651e-05, + "loss": 1.5284, + "step": 29600, + "train_loss_gtc": 0.040618896484375, + "train_loss_gtm": 0.014891014099121094, + "train_loss_lm": 1.47359375 + }, + { + "epoch": 9.127228027043639, + "grad_norm": 0.9154407978057861, + "learning_rate": 3.340671334950091e-05, + "loss": 1.5308, + "step": 29700, + "train_loss_gtc": 0.04009521484375, + "train_loss_gtm": 0.01613304138183594, + "train_loss_lm": 1.476953125 + }, + { + "epoch": 9.157959434542102, + "grad_norm": 0.3826013505458832, + "learning_rate": 3.31028573284034e-05, + "loss": 1.5269, + "step": 29800, + "train_loss_gtc": 0.03857177734375, + "train_loss_gtm": 0.012269973754882812, + "train_loss_lm": 1.47546875 + }, + { + "epoch": 9.188690842040565, + "grad_norm": 0.4480116665363312, + "learning_rate": 3.279970417770206e-05, + "loss": 1.5314, + "step": 29900, + "train_loss_gtc": 0.0409716796875, + "train_loss_gtm": 0.01894462585449219, + "train_loss_lm": 1.477578125 + }, + { + "epoch": 9.21942224953903, + "grad_norm": 0.5610605478286743, + "learning_rate": 3.24972665076576e-05, + "loss": 1.5302, + "step": 30000, + "train_loss_gtc": 0.037857666015625, + "train_loss_gtm": 0.015255851745605469, + "train_loss_lm": 1.47703125 + }, + { + "epoch": 9.250153657037492, + "grad_norm": 0.4201144576072693, + "learning_rate": 3.219555689876896e-05, + "loss": 1.5277, + "step": 30100, + "train_loss_gtc": 0.03964599609375, + "train_loss_gtm": 0.012595443725585938, + "train_loss_lm": 1.47546875 + }, + { + "epoch": 9.280885064535955, + "grad_norm": 1.114909291267395, + "learning_rate": 3.1894587901249875e-05, + "loss": 1.5265, + "step": 30200, + "train_loss_gtc": 0.036279296875, + "train_loss_gtm": 0.009384765625, + "train_loss_lm": 1.475390625 + }, + { + "epoch": 9.31161647203442, + "grad_norm": 0.41764217615127563, + "learning_rate": 3.159437203450691e-05, + "loss": 1.5256, + "step": 30300, + "train_loss_gtc": 0.037474365234375, + "train_loss_gtm": 0.010728912353515625, + "train_loss_lm": 1.4775 + }, + { + "epoch": 9.342347879532882, + "grad_norm": 1.1266087293624878, + "learning_rate": 3.1294921786618595e-05, + "loss": 1.522, + "step": 30400, + "train_loss_gtc": 0.036798095703125, + "train_loss_gtm": 0.007729339599609375, + "train_loss_lm": 1.4746875 + }, + { + "epoch": 9.373079287031347, + "grad_norm": 0.4223707616329193, + "learning_rate": 3.099624961381606e-05, + "loss": 1.5262, + "step": 30500, + "train_loss_gtc": 0.039088134765625, + "train_loss_gtm": 0.013626289367675782, + "train_loss_lm": 1.471875 + }, + { + "epoch": 9.40381069452981, + "grad_norm": 0.4733109176158905, + "learning_rate": 3.069836793996486e-05, + "loss": 1.5268, + "step": 30600, + "train_loss_gtc": 0.038968505859375, + "train_loss_gtm": 0.015171966552734374, + "train_loss_lm": 1.4746875 + }, + { + "epoch": 9.434542102028272, + "grad_norm": 0.8515746593475342, + "learning_rate": 3.0401289156048117e-05, + "loss": 1.524, + "step": 30700, + "train_loss_gtc": 0.038099365234375, + "train_loss_gtm": 0.011698036193847657, + "train_loss_lm": 1.475234375 + }, + { + "epoch": 9.465273509526737, + "grad_norm": 0.3740207850933075, + "learning_rate": 3.0105025619651193e-05, + "loss": 1.5272, + "step": 30800, + "train_loss_gtc": 0.038729248046875, + "train_loss_gtm": 0.012548446655273438, + "train_loss_lm": 1.474765625 + }, + { + "epoch": 9.4960049170252, + "grad_norm": 0.42126893997192383, + "learning_rate": 2.9809589654447555e-05, + "loss": 1.5232, + "step": 30900, + "train_loss_gtc": 0.0394775390625, + "train_loss_gtm": 0.012857398986816405, + "train_loss_lm": 1.47125 + }, + { + "epoch": 9.526736324523663, + "grad_norm": 0.4131476581096649, + "learning_rate": 2.951499354968623e-05, + "loss": 1.5289, + "step": 31000, + "train_loss_gtc": 0.03717041015625, + "train_loss_gtm": 0.010989189147949219, + "train_loss_lm": 1.476328125 + }, + { + "epoch": 9.557467732022127, + "grad_norm": 1.3864574432373047, + "learning_rate": 2.922124955968054e-05, + "loss": 1.5302, + "step": 31100, + "train_loss_gtc": 0.040264892578125, + "train_loss_gtm": 0.014952011108398437, + "train_loss_lm": 1.4771875 + }, + { + "epoch": 9.58819913952059, + "grad_norm": 0.6983849406242371, + "learning_rate": 2.892836990329844e-05, + "loss": 1.5228, + "step": 31200, + "train_loss_gtc": 0.037857666015625, + "train_loss_gtm": 0.014338626861572265, + "train_loss_lm": 1.475390625 + }, + { + "epoch": 9.618930547019053, + "grad_norm": 0.9399222731590271, + "learning_rate": 2.8636366763454153e-05, + "loss": 1.5205, + "step": 31300, + "train_loss_gtc": 0.03775146484375, + "train_loss_gtm": 0.011002845764160156, + "train_loss_lm": 1.4725 + }, + { + "epoch": 9.649661954517518, + "grad_norm": 0.7803316712379456, + "learning_rate": 2.8345252286601448e-05, + "loss": 1.5214, + "step": 31400, + "train_loss_gtc": 0.03853271484375, + "train_loss_gtm": 0.014136924743652343, + "train_loss_lm": 1.473671875 + }, + { + "epoch": 9.68039336201598, + "grad_norm": 1.0166672468185425, + "learning_rate": 2.805503858222842e-05, + "loss": 1.525, + "step": 31500, + "train_loss_gtc": 0.03795654296875, + "train_loss_gtm": 0.013683624267578125, + "train_loss_lm": 1.472109375 + }, + { + "epoch": 9.711124769514443, + "grad_norm": 1.386081576347351, + "learning_rate": 2.7765737722353725e-05, + "loss": 1.5211, + "step": 31600, + "train_loss_gtc": 0.037562255859375, + "train_loss_gtm": 0.0162908935546875, + "train_loss_lm": 1.471484375 + }, + { + "epoch": 9.741856177012908, + "grad_norm": 1.487998366355896, + "learning_rate": 2.747736174102441e-05, + "loss": 1.5211, + "step": 31700, + "train_loss_gtc": 0.037666015625, + "train_loss_gtm": 0.009608421325683594, + "train_loss_lm": 1.4721875 + }, + { + "epoch": 9.77258758451137, + "grad_norm": 0.4993577301502228, + "learning_rate": 2.7189922633815346e-05, + "loss": 1.5286, + "step": 31800, + "train_loss_gtc": 0.04015380859375, + "train_loss_gtm": 0.015623245239257812, + "train_loss_lm": 1.476171875 + }, + { + "epoch": 9.803318992009833, + "grad_norm": 2.035013437271118, + "learning_rate": 2.690343235733026e-05, + "loss": 1.5297, + "step": 31900, + "train_loss_gtc": 0.03919921875, + "train_loss_gtm": 0.01642772674560547, + "train_loss_lm": 1.4703125 + }, + { + "epoch": 9.834050399508298, + "grad_norm": 0.44986504316329956, + "learning_rate": 2.66179028287044e-05, + "loss": 1.5191, + "step": 32000, + "train_loss_gtc": 0.0372119140625, + "train_loss_gtm": 0.01147369384765625, + "train_loss_lm": 1.474140625 + }, + { + "epoch": 9.86478180700676, + "grad_norm": 0.44800782203674316, + "learning_rate": 2.633334592510876e-05, + "loss": 1.5229, + "step": 32100, + "train_loss_gtc": 0.037117919921875, + "train_loss_gtm": 0.020374336242675782, + "train_loss_lm": 1.472421875 + }, + { + "epoch": 9.895513214505224, + "grad_norm": 0.4471757113933563, + "learning_rate": 2.6049773483256046e-05, + "loss": 1.5197, + "step": 32200, + "train_loss_gtc": 0.03855224609375, + "train_loss_gtm": 0.012574348449707031, + "train_loss_lm": 1.4709375 + }, + { + "epoch": 9.926244622003688, + "grad_norm": 1.0153461694717407, + "learning_rate": 2.5767197298908296e-05, + "loss": 1.522, + "step": 32300, + "train_loss_gtc": 0.0387353515625, + "train_loss_gtm": 0.013848609924316406, + "train_loss_lm": 1.470703125 + }, + { + "epoch": 9.956976029502151, + "grad_norm": 0.35531821846961975, + "learning_rate": 2.5485629126386323e-05, + "loss": 1.5207, + "step": 32400, + "train_loss_gtc": 0.0349658203125, + "train_loss_gtm": 0.00917278289794922, + "train_loss_lm": 1.471484375 + }, + { + "epoch": 9.987707437000614, + "grad_norm": 0.4289498031139374, + "learning_rate": 2.5205080678080573e-05, + "loss": 1.5159, + "step": 32500, + "train_loss_gtc": 0.03526123046875, + "train_loss_gtm": 0.006147556304931641, + "train_loss_lm": 1.469765625 + }, + { + "epoch": 10.0, + "eval_loss": 1.663671851158142, + "eval_runtime": 3.934, + "eval_samples_per_second": 252.92, + "eval_steps_per_second": 2.796, + "step": 32540, + "train_loss_gtc": 0.0366180419921875, + "train_loss_gtm": 0.016598081588745116, + "train_loss_lm": 1.4673828125, + "val_loss_gtc": 0.075244140625, + "val_loss_gtm": 0.091632080078125, + "val_loss_lm": 1.49296875 + }, + { + "epoch": 10.018438844499078, + "grad_norm": 0.6605350971221924, + "learning_rate": 2.4925563623964055e-05, + "loss": 1.5146, + "step": 32600, + "train_loss_gtc": 0.037335205078125, + "train_loss_gtm": 0.014607747395833334, + "train_loss_lm": 1.459765625 + }, + { + "epoch": 10.049170251997541, + "grad_norm": 0.5100732445716858, + "learning_rate": 2.4647089591106885e-05, + "loss": 1.5074, + "step": 32700, + "train_loss_gtc": 0.035672607421875, + "train_loss_gtm": 0.010171089172363281, + "train_loss_lm": 1.461875 + }, + { + "epoch": 10.079901659496006, + "grad_norm": 1.4332607984542847, + "learning_rate": 2.4369670163192603e-05, + "loss": 1.5097, + "step": 32800, + "train_loss_gtc": 0.038460693359375, + "train_loss_gtm": 0.012800846099853515, + "train_loss_lm": 1.461484375 + }, + { + "epoch": 10.110633066994469, + "grad_norm": 0.3127327263355255, + "learning_rate": 2.409331688003642e-05, + "loss": 1.5074, + "step": 32900, + "train_loss_gtc": 0.037158203125, + "train_loss_gtm": 0.007948532104492187, + "train_loss_lm": 1.46296875 + }, + { + "epoch": 10.141364474492931, + "grad_norm": 0.49944329261779785, + "learning_rate": 2.3818041237105047e-05, + "loss": 1.5138, + "step": 33000, + "train_loss_gtc": 0.036298828125, + "train_loss_gtm": 0.01034515380859375, + "train_loss_lm": 1.46390625 + }, + { + "epoch": 10.172095881991396, + "grad_norm": 0.38113901019096375, + "learning_rate": 2.3543854685038612e-05, + "loss": 1.5096, + "step": 33100, + "train_loss_gtc": 0.035716552734375, + "train_loss_gtm": 0.010895004272460937, + "train_loss_lm": 1.46328125 + }, + { + "epoch": 10.202827289489859, + "grad_norm": 0.8738096952438354, + "learning_rate": 2.3270768629174366e-05, + "loss": 1.5107, + "step": 33200, + "train_loss_gtc": 0.03684326171875, + "train_loss_gtm": 0.01412738800048828, + "train_loss_lm": 1.4628125 + }, + { + "epoch": 10.233558696988322, + "grad_norm": 0.7059551477432251, + "learning_rate": 2.2998794429072228e-05, + "loss": 1.511, + "step": 33300, + "train_loss_gtc": 0.035848388671875, + "train_loss_gtm": 0.010251865386962891, + "train_loss_lm": 1.46515625 + }, + { + "epoch": 10.264290104486786, + "grad_norm": 0.49285122752189636, + "learning_rate": 2.2727943398042223e-05, + "loss": 1.5166, + "step": 33400, + "train_loss_gtc": 0.03899169921875, + "train_loss_gtm": 0.01776031494140625, + "train_loss_lm": 1.46421875 + }, + { + "epoch": 10.295021511985249, + "grad_norm": 0.3343373239040375, + "learning_rate": 2.245822680267391e-05, + "loss": 1.5063, + "step": 33500, + "train_loss_gtc": 0.034970703125, + "train_loss_gtm": 0.00969287872314453, + "train_loss_lm": 1.46109375 + }, + { + "epoch": 10.325752919483712, + "grad_norm": 0.6031121611595154, + "learning_rate": 2.2189655862367736e-05, + "loss": 1.5091, + "step": 33600, + "train_loss_gtc": 0.036680908203125, + "train_loss_gtm": 0.013059463500976563, + "train_loss_lm": 1.46125 + }, + { + "epoch": 10.356484326982176, + "grad_norm": 0.35346755385398865, + "learning_rate": 2.1922241748868395e-05, + "loss": 1.5055, + "step": 33700, + "train_loss_gtc": 0.033951416015625, + "train_loss_gtm": 0.005552330017089844, + "train_loss_lm": 1.4603125 + }, + { + "epoch": 10.38721573448064, + "grad_norm": 1.6642231941223145, + "learning_rate": 2.1655995585799977e-05, + "loss": 1.51, + "step": 33800, + "train_loss_gtc": 0.036239013671875, + "train_loss_gtm": 0.012279739379882812, + "train_loss_lm": 1.460546875 + }, + { + "epoch": 10.417947141979102, + "grad_norm": 1.5294814109802246, + "learning_rate": 2.1390928448203397e-05, + "loss": 1.5046, + "step": 33900, + "train_loss_gtc": 0.03334716796875, + "train_loss_gtm": 0.00482635498046875, + "train_loss_lm": 1.46125 + }, + { + "epoch": 10.448678549477567, + "grad_norm": 0.9640972018241882, + "learning_rate": 2.1127051362075596e-05, + "loss": 1.5085, + "step": 34000, + "train_loss_gtc": 0.03734619140625, + "train_loss_gtm": 0.012679977416992188, + "train_loss_lm": 1.4615625 + }, + { + "epoch": 10.47940995697603, + "grad_norm": 2.8935489654541016, + "learning_rate": 2.086437530391101e-05, + "loss": 1.5037, + "step": 34100, + "train_loss_gtc": 0.034757080078125, + "train_loss_gtm": 0.006779251098632813, + "train_loss_lm": 1.46296875 + }, + { + "epoch": 10.510141364474492, + "grad_norm": 0.4859734773635864, + "learning_rate": 2.0602911200244907e-05, + "loss": 1.5141, + "step": 34200, + "train_loss_gtc": 0.037239990234375, + "train_loss_gtm": 0.014754142761230469, + "train_loss_lm": 1.462109375 + }, + { + "epoch": 10.540872771972957, + "grad_norm": 0.4255363643169403, + "learning_rate": 2.034266992719886e-05, + "loss": 1.5048, + "step": 34300, + "train_loss_gtc": 0.0356005859375, + "train_loss_gtm": 0.009561195373535156, + "train_loss_lm": 1.459375 + }, + { + "epoch": 10.57160417947142, + "grad_norm": 0.41498520970344543, + "learning_rate": 2.008366231002836e-05, + "loss": 1.5094, + "step": 34400, + "train_loss_gtc": 0.0361181640625, + "train_loss_gtm": 0.013895111083984375, + "train_loss_lm": 1.458828125 + }, + { + "epoch": 10.602335586969883, + "grad_norm": 0.4691818058490753, + "learning_rate": 1.9825899122672516e-05, + "loss": 1.5088, + "step": 34500, + "train_loss_gtc": 0.036781005859375, + "train_loss_gtm": 0.016254196166992186, + "train_loss_lm": 1.4590625 + }, + { + "epoch": 10.633066994468347, + "grad_norm": 0.3247811496257782, + "learning_rate": 1.9569391087305944e-05, + "loss": 1.5095, + "step": 34600, + "train_loss_gtc": 0.036104736328125, + "train_loss_gtm": 0.011984748840332031, + "train_loss_lm": 1.458671875 + }, + { + "epoch": 10.66379840196681, + "grad_norm": 0.48939141631126404, + "learning_rate": 1.931414887389265e-05, + "loss": 1.5032, + "step": 34700, + "train_loss_gtc": 0.035, + "train_loss_gtm": 0.00996623992919922, + "train_loss_lm": 1.45953125 + }, + { + "epoch": 10.694529809465273, + "grad_norm": 0.5067106485366821, + "learning_rate": 1.906018309974225e-05, + "loss": 1.5118, + "step": 34800, + "train_loss_gtc": 0.036153564453125, + "train_loss_gtm": 0.017694778442382812, + "train_loss_lm": 1.46171875 + }, + { + "epoch": 10.725261216963737, + "grad_norm": 0.4321945607662201, + "learning_rate": 1.8807504329068377e-05, + "loss": 1.5052, + "step": 34900, + "train_loss_gtc": 0.0354345703125, + "train_loss_gtm": 0.012692756652832031, + "train_loss_lm": 1.461015625 + }, + { + "epoch": 10.7559926244622, + "grad_norm": 0.39166566729545593, + "learning_rate": 1.8556123072549097e-05, + "loss": 1.5078, + "step": 35000, + "train_loss_gtc": 0.037042236328125, + "train_loss_gtm": 0.011860542297363282, + "train_loss_lm": 1.4615625 + }, + { + "epoch": 10.786724031960663, + "grad_norm": 0.4959773123264313, + "learning_rate": 1.8306049786889872e-05, + "loss": 1.5037, + "step": 35100, + "train_loss_gtc": 0.036055908203125, + "train_loss_gtm": 0.007551231384277344, + "train_loss_lm": 1.46109375 + }, + { + "epoch": 10.817455439459128, + "grad_norm": 0.5386573076248169, + "learning_rate": 1.8057294874388443e-05, + "loss": 1.5052, + "step": 35200, + "train_loss_gtc": 0.034755859375, + "train_loss_gtm": 0.011582107543945312, + "train_loss_lm": 1.461328125 + }, + { + "epoch": 10.84818684695759, + "grad_norm": 0.38217893242836, + "learning_rate": 1.78098686825022e-05, + "loss": 1.502, + "step": 35300, + "train_loss_gtc": 0.034422607421875, + "train_loss_gtm": 0.0069885444641113285, + "train_loss_lm": 1.4628125 + }, + { + "epoch": 10.878918254456053, + "grad_norm": 0.3977510929107666, + "learning_rate": 1.7563781503417743e-05, + "loss": 1.5027, + "step": 35400, + "train_loss_gtc": 0.034517822265625, + "train_loss_gtm": 0.012902565002441406, + "train_loss_lm": 1.461015625 + }, + { + "epoch": 10.909649661954518, + "grad_norm": 1.0005662441253662, + "learning_rate": 1.7319043573622796e-05, + "loss": 1.5068, + "step": 35500, + "train_loss_gtc": 0.034649658203125, + "train_loss_gtm": 0.00762664794921875, + "train_loss_lm": 1.46140625 + }, + { + "epoch": 10.94038106945298, + "grad_norm": 0.8638070225715637, + "learning_rate": 1.707566507348032e-05, + "loss": 1.5069, + "step": 35600, + "train_loss_gtc": 0.03516845703125, + "train_loss_gtm": 0.013119163513183594, + "train_loss_lm": 1.46453125 + }, + { + "epoch": 10.971112476951445, + "grad_norm": 0.7276130318641663, + "learning_rate": 1.6833656126805075e-05, + "loss": 1.5038, + "step": 35700, + "train_loss_gtc": 0.034442138671875, + "train_loss_gtm": 0.008318862915039062, + "train_loss_lm": 1.4596875 + }, + { + "epoch": 11.0, + "eval_loss": 1.6515624523162842, + "eval_runtime": 3.9297, + "eval_samples_per_second": 253.199, + "eval_steps_per_second": 2.799, + "step": 35794, + "train_loss_gtc": 0.03423682679521277, + "train_loss_gtm": 0.011414101783265459, + "train_loss_lm": 1.4602726063829787, + "val_loss_gtc": 0.0715576171875, + "val_loss_gtm": 0.08521461486816406, + "val_loss_lm": 1.484375 + }, + { + "epoch": 11.001843884449908, + "grad_norm": 0.355080246925354, + "learning_rate": 1.6593026800442584e-05, + "loss": 1.5059, + "step": 35800, + "train_loss_gtc": 0.038492838541666664, + "train_loss_gtm": 0.0019823710123697915, + "train_loss_lm": 1.4466145833333333 + }, + { + "epoch": 11.03257529194837, + "grad_norm": 0.3804630935192108, + "learning_rate": 1.6353787103850214e-05, + "loss": 1.4999, + "step": 35900, + "train_loss_gtc": 0.034288330078125, + "train_loss_gtm": 0.013097267150878906, + "train_loss_lm": 1.454375 + }, + { + "epoch": 11.063306699446835, + "grad_norm": 0.43270865082740784, + "learning_rate": 1.611594698868099e-05, + "loss": 1.4984, + "step": 36000, + "train_loss_gtc": 0.034847412109375, + "train_loss_gtm": 0.010229988098144531, + "train_loss_lm": 1.45125 + }, + { + "epoch": 11.094038106945298, + "grad_norm": 0.35577720403671265, + "learning_rate": 1.587951634836949e-05, + "loss": 1.4972, + "step": 36100, + "train_loss_gtc": 0.03463623046875, + "train_loss_gtm": 0.006039161682128907, + "train_loss_lm": 1.45390625 + }, + { + "epoch": 11.124769514443761, + "grad_norm": 0.3876980245113373, + "learning_rate": 1.5644505017720396e-05, + "loss": 1.4942, + "step": 36200, + "train_loss_gtc": 0.032666015625, + "train_loss_gtm": 0.00611663818359375, + "train_loss_lm": 1.451171875 + }, + { + "epoch": 11.155500921942226, + "grad_norm": 0.675238847732544, + "learning_rate": 1.5410922772499352e-05, + "loss": 1.503, + "step": 36300, + "train_loss_gtc": 0.035501708984375, + "train_loss_gtm": 0.013241043090820312, + "train_loss_lm": 1.45578125 + }, + { + "epoch": 11.186232329440688, + "grad_norm": 0.4091513752937317, + "learning_rate": 1.5178779329026393e-05, + "loss": 1.5001, + "step": 36400, + "train_loss_gtc": 0.03492431640625, + "train_loss_gtm": 0.013411216735839844, + "train_loss_lm": 1.4534375 + }, + { + "epoch": 11.216963736939151, + "grad_norm": 0.4007122814655304, + "learning_rate": 1.494808434377164e-05, + "loss": 1.4959, + "step": 36500, + "train_loss_gtc": 0.0340380859375, + "train_loss_gtm": 0.010790367126464844, + "train_loss_lm": 1.45296875 + }, + { + "epoch": 11.247695144437616, + "grad_norm": 0.3332425057888031, + "learning_rate": 1.4718847412953784e-05, + "loss": 1.4964, + "step": 36600, + "train_loss_gtc": 0.035784912109375, + "train_loss_gtm": 0.013795166015625, + "train_loss_lm": 1.4509375 + }, + { + "epoch": 11.278426551936079, + "grad_norm": 0.42536449432373047, + "learning_rate": 1.4491078072140779e-05, + "loss": 1.4959, + "step": 36700, + "train_loss_gtc": 0.035238037109375, + "train_loss_gtm": 0.008274612426757812, + "train_loss_lm": 1.453203125 + }, + { + "epoch": 11.309157959434541, + "grad_norm": 0.4789024889469147, + "learning_rate": 1.4264785795853231e-05, + "loss": 1.4947, + "step": 36800, + "train_loss_gtc": 0.0340283203125, + "train_loss_gtm": 0.007297935485839843, + "train_loss_lm": 1.4525 + }, + { + "epoch": 11.339889366933006, + "grad_norm": 0.436238557100296, + "learning_rate": 1.4039979997170349e-05, + "loss": 1.4954, + "step": 36900, + "train_loss_gtc": 0.035128173828125, + "train_loss_gtm": 0.010289707183837891, + "train_loss_lm": 1.45390625 + }, + { + "epoch": 11.370620774431469, + "grad_norm": 0.37121227383613586, + "learning_rate": 1.3816670027338297e-05, + "loss": 1.4961, + "step": 37000, + "train_loss_gtc": 0.0336767578125, + "train_loss_gtm": 0.011312313079833984, + "train_loss_lm": 1.451328125 + }, + { + "epoch": 11.401352181929932, + "grad_norm": 0.3737700581550598, + "learning_rate": 1.3594865175381267e-05, + "loss": 1.4941, + "step": 37100, + "train_loss_gtc": 0.034173583984375, + "train_loss_gtm": 0.011153717041015625, + "train_loss_lm": 1.453828125 + }, + { + "epoch": 11.432083589428396, + "grad_norm": 0.40509167313575745, + "learning_rate": 1.3374574667715033e-05, + "loss": 1.4974, + "step": 37200, + "train_loss_gtc": 0.034654541015625, + "train_loss_gtm": 0.013001708984375, + "train_loss_lm": 1.452421875 + }, + { + "epoch": 11.46281499692686, + "grad_norm": 0.38259902596473694, + "learning_rate": 1.3155807667763265e-05, + "loss": 1.4975, + "step": 37300, + "train_loss_gtc": 0.03426025390625, + "train_loss_gtm": 0.011098213195800781, + "train_loss_lm": 1.45296875 + }, + { + "epoch": 11.493546404425322, + "grad_norm": 2.280012369155884, + "learning_rate": 1.2938573275576204e-05, + "loss": 1.4933, + "step": 37400, + "train_loss_gtc": 0.034439697265625, + "train_loss_gtm": 0.009605464935302734, + "train_loss_lm": 1.451640625 + }, + { + "epoch": 11.524277811923787, + "grad_norm": 0.8614688515663147, + "learning_rate": 1.2722880527452285e-05, + "loss": 1.4916, + "step": 37500, + "train_loss_gtc": 0.032637939453125, + "train_loss_gtm": 0.0070468330383300784, + "train_loss_lm": 1.454375 + }, + { + "epoch": 11.55500921942225, + "grad_norm": 0.40161266922950745, + "learning_rate": 1.250873839556213e-05, + "loss": 1.4943, + "step": 37600, + "train_loss_gtc": 0.033919677734375, + "train_loss_gtm": 0.005923271179199219, + "train_loss_lm": 1.45078125 + }, + { + "epoch": 11.585740626920712, + "grad_norm": 0.4867040514945984, + "learning_rate": 1.2296155787575386e-05, + "loss": 1.4963, + "step": 37700, + "train_loss_gtc": 0.03362060546875, + "train_loss_gtm": 0.01107696533203125, + "train_loss_lm": 1.453515625 + }, + { + "epoch": 11.616472034419177, + "grad_norm": 0.40651935338974, + "learning_rate": 1.208514154629022e-05, + "loss": 1.4943, + "step": 37800, + "train_loss_gtc": 0.034439697265625, + "train_loss_gtm": 0.00758575439453125, + "train_loss_lm": 1.454609375 + }, + { + "epoch": 11.64720344191764, + "grad_norm": 0.43702617287635803, + "learning_rate": 1.1875704449265423e-05, + "loss": 1.4957, + "step": 37900, + "train_loss_gtc": 0.034952392578125, + "train_loss_gtm": 0.010952072143554687, + "train_loss_lm": 1.454921875 + }, + { + "epoch": 11.677934849416104, + "grad_norm": 0.3727381229400635, + "learning_rate": 1.1667853208455325e-05, + "loss": 1.4978, + "step": 38000, + "train_loss_gtc": 0.03486572265625, + "train_loss_gtm": 0.015162067413330078, + "train_loss_lm": 1.450859375 + }, + { + "epoch": 11.708666256914567, + "grad_norm": 0.3844757080078125, + "learning_rate": 1.1461596469847402e-05, + "loss": 1.4953, + "step": 38100, + "train_loss_gtc": 0.035777587890625, + "train_loss_gtm": 0.011620597839355469, + "train_loss_lm": 1.450546875 + }, + { + "epoch": 11.73939766441303, + "grad_norm": 0.40840184688568115, + "learning_rate": 1.1256942813102634e-05, + "loss": 1.4928, + "step": 38200, + "train_loss_gtc": 0.031209716796875, + "train_loss_gtm": 0.00724945068359375, + "train_loss_lm": 1.45421875 + }, + { + "epoch": 11.770129071911494, + "grad_norm": 0.6461498141288757, + "learning_rate": 1.1053900751198614e-05, + "loss": 1.4896, + "step": 38300, + "train_loss_gtc": 0.033707275390625, + "train_loss_gtm": 0.007514209747314453, + "train_loss_lm": 1.45015625 + }, + { + "epoch": 11.800860479409957, + "grad_norm": 0.46932530403137207, + "learning_rate": 1.0852478730075422e-05, + "loss": 1.4971, + "step": 38400, + "train_loss_gtc": 0.0347412109375, + "train_loss_gtm": 0.014281749725341797, + "train_loss_lm": 1.45265625 + }, + { + "epoch": 11.83159188690842, + "grad_norm": 0.417879194021225, + "learning_rate": 1.0652685128284285e-05, + "loss": 1.493, + "step": 38500, + "train_loss_gtc": 0.034190673828125, + "train_loss_gtm": 0.007110633850097656, + "train_loss_lm": 1.451796875 + }, + { + "epoch": 11.862323294406885, + "grad_norm": 0.38669833540916443, + "learning_rate": 1.0454528256639095e-05, + "loss": 1.4928, + "step": 38600, + "train_loss_gtc": 0.032156982421875, + "train_loss_gtm": 0.008788909912109375, + "train_loss_lm": 1.45203125 + }, + { + "epoch": 11.893054701905347, + "grad_norm": 1.0371503829956055, + "learning_rate": 1.0258016357870703e-05, + "loss": 1.4918, + "step": 38700, + "train_loss_gtc": 0.03337646484375, + "train_loss_gtm": 0.007540702819824219, + "train_loss_lm": 1.450390625 + }, + { + "epoch": 11.92378610940381, + "grad_norm": 0.7227888703346252, + "learning_rate": 1.0063157606284001e-05, + "loss": 1.4903, + "step": 38800, + "train_loss_gtc": 0.032996826171875, + "train_loss_gtm": 0.005477218627929687, + "train_loss_lm": 1.452578125 + }, + { + "epoch": 11.954517516902275, + "grad_norm": 0.44045162200927734, + "learning_rate": 9.869960107417924e-06, + "loss": 1.4931, + "step": 38900, + "train_loss_gtc": 0.034642333984375, + "train_loss_gtm": 0.009967632293701172, + "train_loss_lm": 1.4534375 + }, + { + "epoch": 11.985248924400738, + "grad_norm": 0.36739978194236755, + "learning_rate": 9.678431897708279e-06, + "loss": 1.4923, + "step": 39000, + "train_loss_gtc": 0.03304931640625, + "train_loss_gtm": 0.007914905548095702, + "train_loss_lm": 1.45109375 + }, + { + "epoch": 12.0, + "eval_loss": 1.6339843273162842, + "eval_runtime": 3.8887, + "eval_samples_per_second": 255.872, + "eval_steps_per_second": 2.829, + "step": 39048, + "train_loss_gtc": 0.032511393229166664, + "train_loss_gtm": 0.011383334795633951, + "train_loss_lm": 1.4518229166666667, + "val_loss_gtc": 0.067724609375, + "val_loss_gtm": 0.07337799072265624, + "val_loss_lm": 1.47890625 + }, + { + "epoch": 12.0159803318992, + "grad_norm": 0.5276215672492981, + "learning_rate": 9.48858094415348e-06, + "loss": 1.4867, + "step": 39100, + "train_loss_gtc": 0.031123234675480768, + "train_loss_gtm": 0.007110412304217999, + "train_loss_lm": 1.4439603365384615 + }, + { + "epoch": 12.046711739397665, + "grad_norm": 1.3187389373779297, + "learning_rate": 9.300415143983122e-06, + "loss": 1.4823, + "step": 39200, + "train_loss_gtc": 0.03217041015625, + "train_loss_gtm": 0.007877159118652343, + "train_loss_lm": 1.44421875 + }, + { + "epoch": 12.077443146896128, + "grad_norm": 0.37951648235321045, + "learning_rate": 9.113942324329445e-06, + "loss": 1.4868, + "step": 39300, + "train_loss_gtc": 0.032154541015625, + "train_loss_gtm": 0.006891098022460938, + "train_loss_lm": 1.446171875 + }, + { + "epoch": 12.10817455439459, + "grad_norm": 0.6352601051330566, + "learning_rate": 8.929170241901807e-06, + "loss": 1.4818, + "step": 39400, + "train_loss_gtc": 0.032747802734375, + "train_loss_gtm": 0.007182502746582031, + "train_loss_lm": 1.445390625 + }, + { + "epoch": 12.138905961893055, + "grad_norm": 0.46073710918426514, + "learning_rate": 8.746106582663994e-06, + "loss": 1.4839, + "step": 39500, + "train_loss_gtc": 0.03167236328125, + "train_loss_gtm": 0.009096622467041016, + "train_loss_lm": 1.447734375 + }, + { + "epoch": 12.169637369391518, + "grad_norm": 0.3877211809158325, + "learning_rate": 8.56475896151454e-06, + "loss": 1.4845, + "step": 39600, + "train_loss_gtc": 0.032230224609375, + "train_loss_gtm": 0.005583648681640625, + "train_loss_lm": 1.445 + }, + { + "epoch": 12.200368776889981, + "grad_norm": 0.5160537362098694, + "learning_rate": 8.385134921969923e-06, + "loss": 1.4865, + "step": 39700, + "train_loss_gtc": 0.032567138671875, + "train_loss_gtm": 0.012664890289306641, + "train_loss_lm": 1.44546875 + }, + { + "epoch": 12.231100184388445, + "grad_norm": 0.34780940413475037, + "learning_rate": 8.207241935850812e-06, + "loss": 1.4859, + "step": 39800, + "train_loss_gtc": 0.031810302734375, + "train_loss_gtm": 0.00482290267944336, + "train_loss_lm": 1.4471875 + }, + { + "epoch": 12.261831591886908, + "grad_norm": 0.39777079224586487, + "learning_rate": 8.031087402971232e-06, + "loss": 1.488, + "step": 39900, + "train_loss_gtc": 0.0323828125, + "train_loss_gtm": 0.015415172576904296, + "train_loss_lm": 1.444921875 + }, + { + "epoch": 12.292562999385371, + "grad_norm": 0.5161352753639221, + "learning_rate": 7.856678650830806e-06, + "loss": 1.4832, + "step": 40000, + "train_loss_gtc": 0.03137939453125, + "train_loss_gtm": 0.0043726348876953125, + "train_loss_lm": 1.44625 + }, + { + "epoch": 12.323294406883836, + "grad_norm": 0.3717089295387268, + "learning_rate": 7.684022934309926e-06, + "loss": 1.4859, + "step": 40100, + "train_loss_gtc": 0.032230224609375, + "train_loss_gtm": 0.008196029663085937, + "train_loss_lm": 1.44453125 + }, + { + "epoch": 12.354025814382299, + "grad_norm": 0.4823426902294159, + "learning_rate": 7.513127435367923e-06, + "loss": 1.4862, + "step": 40200, + "train_loss_gtc": 0.032799072265625, + "train_loss_gtm": 0.008565444946289063, + "train_loss_lm": 1.446953125 + }, + { + "epoch": 12.384757221880761, + "grad_norm": 0.3817342221736908, + "learning_rate": 7.343999262744389e-06, + "loss": 1.4889, + "step": 40300, + "train_loss_gtc": 0.033624267578125, + "train_loss_gtm": 0.00685495376586914, + "train_loss_lm": 1.445234375 + }, + { + "epoch": 12.415488629379226, + "grad_norm": 0.38065531849861145, + "learning_rate": 7.176645451663433e-06, + "loss": 1.4908, + "step": 40400, + "train_loss_gtc": 0.034915771484375, + "train_loss_gtm": 0.011385536193847657, + "train_loss_lm": 1.443984375 + }, + { + "epoch": 12.446220036877689, + "grad_norm": 0.39833277463912964, + "learning_rate": 7.011072963541088e-06, + "loss": 1.4832, + "step": 40500, + "train_loss_gtc": 0.031995849609375, + "train_loss_gtm": 0.006886463165283203, + "train_loss_lm": 1.44546875 + }, + { + "epoch": 12.476951444376152, + "grad_norm": 0.3548543453216553, + "learning_rate": 6.847288685695663e-06, + "loss": 1.4845, + "step": 40600, + "train_loss_gtc": 0.031795654296875, + "train_loss_gtm": 0.010219860076904296, + "train_loss_lm": 1.446171875 + }, + { + "epoch": 12.507682851874616, + "grad_norm": 0.46865567564964294, + "learning_rate": 6.6852994310613035e-06, + "loss": 1.4804, + "step": 40700, + "train_loss_gtc": 0.03116455078125, + "train_loss_gtm": 0.0027751541137695313, + "train_loss_lm": 1.444921875 + }, + { + "epoch": 12.538414259373079, + "grad_norm": 0.3493591547012329, + "learning_rate": 6.525111937904565e-06, + "loss": 1.4867, + "step": 40800, + "train_loss_gtc": 0.03113525390625, + "train_loss_gtm": 0.006133708953857422, + "train_loss_lm": 1.446484375 + }, + { + "epoch": 12.569145666871542, + "grad_norm": 0.3806462287902832, + "learning_rate": 6.366732869544167e-06, + "loss": 1.4847, + "step": 40900, + "train_loss_gtc": 0.032784423828125, + "train_loss_gtm": 0.009026336669921874, + "train_loss_lm": 1.444609375 + }, + { + "epoch": 12.599877074370006, + "grad_norm": 0.3359711170196533, + "learning_rate": 6.210168814073775e-06, + "loss": 1.4844, + "step": 41000, + "train_loss_gtc": 0.033193359375, + "train_loss_gtm": 0.013145980834960937, + "train_loss_lm": 1.4425 + }, + { + "epoch": 12.63060848186847, + "grad_norm": 0.3647012412548065, + "learning_rate": 6.0554262840879505e-06, + "loss": 1.4819, + "step": 41100, + "train_loss_gtc": 0.03174072265625, + "train_loss_gtm": 0.0059863471984863284, + "train_loss_lm": 1.445078125 + }, + { + "epoch": 12.661339889366934, + "grad_norm": 0.3800066411495209, + "learning_rate": 5.902511716411286e-06, + "loss": 1.4832, + "step": 41200, + "train_loss_gtc": 0.03176025390625, + "train_loss_gtm": 0.004956302642822266, + "train_loss_lm": 1.445703125 + }, + { + "epoch": 12.692071296865397, + "grad_norm": 6.271182060241699, + "learning_rate": 5.75143147183061e-06, + "loss": 1.4843, + "step": 41300, + "train_loss_gtc": 0.032977294921875, + "train_loss_gtm": 0.008317089080810547, + "train_loss_lm": 1.445390625 + }, + { + "epoch": 12.72280270436386, + "grad_norm": 1.3521143198013306, + "learning_rate": 5.602191834830445e-06, + "loss": 1.4785, + "step": 41400, + "train_loss_gtc": 0.030087890625, + "train_loss_gtm": 0.0036014556884765626, + "train_loss_lm": 1.4446875 + }, + { + "epoch": 12.753534111862324, + "grad_norm": 0.38900676369667053, + "learning_rate": 5.454799013331546e-06, + "loss": 1.4838, + "step": 41500, + "train_loss_gtc": 0.031859130859375, + "train_loss_gtm": 0.003786640167236328, + "train_loss_lm": 1.444453125 + }, + { + "epoch": 12.784265519360787, + "grad_norm": 0.36797913908958435, + "learning_rate": 5.309259138432693e-06, + "loss": 1.4843, + "step": 41600, + "train_loss_gtc": 0.031395263671875, + "train_loss_gtm": 0.005061054229736328, + "train_loss_lm": 1.444609375 + }, + { + "epoch": 12.81499692685925, + "grad_norm": 2.0185465812683105, + "learning_rate": 5.165578264155646e-06, + "loss": 1.4854, + "step": 41700, + "train_loss_gtc": 0.03158447265625, + "train_loss_gtm": 0.007223720550537109, + "train_loss_lm": 1.444609375 + }, + { + "epoch": 12.845728334357714, + "grad_norm": 0.37788382172584534, + "learning_rate": 5.023762367193336e-06, + "loss": 1.4802, + "step": 41800, + "train_loss_gtc": 0.031046142578125, + "train_loss_gtm": 0.0037957191467285155, + "train_loss_lm": 1.4475 + }, + { + "epoch": 12.876459741856177, + "grad_norm": 0.31019526720046997, + "learning_rate": 4.883817346661234e-06, + "loss": 1.4895, + "step": 41900, + "train_loss_gtc": 0.033118896484375, + "train_loss_gtm": 0.00923778533935547, + "train_loss_lm": 1.445546875 + }, + { + "epoch": 12.90719114935464, + "grad_norm": 0.3986211121082306, + "learning_rate": 4.745749023851964e-06, + "loss": 1.483, + "step": 42000, + "train_loss_gtc": 0.03188232421875, + "train_loss_gtm": 0.008430919647216796, + "train_loss_lm": 1.44296875 + }, + { + "epoch": 12.937922556853104, + "grad_norm": 0.3529811501502991, + "learning_rate": 4.609563141993156e-06, + "loss": 1.4812, + "step": 42100, + "train_loss_gtc": 0.030782470703125, + "train_loss_gtm": 0.0027103614807128906, + "train_loss_lm": 1.442265625 + }, + { + "epoch": 12.968653964351567, + "grad_norm": 0.3418220579624176, + "learning_rate": 4.475265366008547e-06, + "loss": 1.4829, + "step": 42200, + "train_loss_gtc": 0.03141357421875, + "train_loss_gtm": 0.007238006591796875, + "train_loss_lm": 1.44453125 + }, + { + "epoch": 12.99938537185003, + "grad_norm": 0.385499507188797, + "learning_rate": 4.342861282282362e-06, + "loss": 1.4841, + "step": 42300, + "train_loss_gtc": 0.032645263671875, + "train_loss_gtm": 0.0034380340576171875, + "train_loss_lm": 1.444921875 + }, + { + "epoch": 13.0, + "eval_loss": 1.618749976158142, + "eval_runtime": 3.9049, + "eval_samples_per_second": 254.805, + "eval_steps_per_second": 2.817, + "step": 42302, + "train_loss_gtc": 0.043212890625, + "train_loss_gtm": 0.05727386474609375, + "train_loss_lm": 1.453125, + "val_loss_gtc": 0.06478271484375, + "val_loss_gtm": 0.07197847366333007, + "val_loss_lm": 1.47578125 + }, + { + "epoch": 13.030116779348495, + "grad_norm": 0.4604727327823639, + "learning_rate": 4.212356398426892e-06, + "loss": 1.481, + "step": 42400, + "train_loss_gtc": 0.03175447425063776, + "train_loss_gtm": 0.006234383096500319, + "train_loss_lm": 1.4418845663265305 + }, + { + "epoch": 13.060848186846957, + "grad_norm": 0.41722559928894043, + "learning_rate": 4.0837561430534135e-06, + "loss": 1.4805, + "step": 42500, + "train_loss_gtc": 0.03138427734375, + "train_loss_gtm": 0.006003303527832031, + "train_loss_lm": 1.4434375 + }, + { + "epoch": 13.09157959434542, + "grad_norm": 0.3385833501815796, + "learning_rate": 3.957065865546406e-06, + "loss": 1.4773, + "step": 42600, + "train_loss_gtc": 0.032398681640625, + "train_loss_gtm": 0.005317020416259766, + "train_loss_lm": 1.4409375 + }, + { + "epoch": 13.122311001843885, + "grad_norm": 0.35626187920570374, + "learning_rate": 3.832290835840974e-06, + "loss": 1.4767, + "step": 42700, + "train_loss_gtc": 0.03093505859375, + "train_loss_gtm": 0.0037181663513183596, + "train_loss_lm": 1.440625 + }, + { + "epoch": 13.153042409342348, + "grad_norm": 0.3810971975326538, + "learning_rate": 3.7094362442036845e-06, + "loss": 1.4776, + "step": 42800, + "train_loss_gtc": 0.03213134765625, + "train_loss_gtm": 0.004517803192138672, + "train_loss_lm": 1.441328125 + }, + { + "epoch": 13.18377381684081, + "grad_norm": 0.36459001898765564, + "learning_rate": 3.588507201016633e-06, + "loss": 1.4797, + "step": 42900, + "train_loss_gtc": 0.031527099609375, + "train_loss_gtm": 0.009501018524169923, + "train_loss_lm": 1.440546875 + }, + { + "epoch": 13.214505224339275, + "grad_norm": 0.505306601524353, + "learning_rate": 3.469508736564897e-06, + "loss": 1.4807, + "step": 43000, + "train_loss_gtc": 0.0320068359375, + "train_loss_gtm": 0.006435070037841797, + "train_loss_lm": 1.4425 + }, + { + "epoch": 13.245236631837738, + "grad_norm": 1.4232553243637085, + "learning_rate": 3.3524458008272475e-06, + "loss": 1.4775, + "step": 43100, + "train_loss_gtc": 0.03030517578125, + "train_loss_gtm": 0.004405345916748047, + "train_loss_lm": 1.44171875 + }, + { + "epoch": 13.275968039336203, + "grad_norm": 0.37980103492736816, + "learning_rate": 3.2373232632703197e-06, + "loss": 1.4816, + "step": 43200, + "train_loss_gtc": 0.0322021484375, + "train_loss_gtm": 0.0026582717895507813, + "train_loss_lm": 1.4415625 + }, + { + "epoch": 13.306699446834665, + "grad_norm": 2.2766940593719482, + "learning_rate": 3.1241459126459706e-06, + "loss": 1.4808, + "step": 43300, + "train_loss_gtc": 0.03172119140625, + "train_loss_gtm": 0.008477497100830077, + "train_loss_lm": 1.44171875 + }, + { + "epoch": 13.337430854333128, + "grad_norm": 0.427611768245697, + "learning_rate": 3.01291845679213e-06, + "loss": 1.4783, + "step": 43400, + "train_loss_gtc": 0.03244873046875, + "train_loss_gtm": 0.0040134239196777345, + "train_loss_lm": 1.440390625 + }, + { + "epoch": 13.368162261831593, + "grad_norm": 0.30551981925964355, + "learning_rate": 2.9036455224369765e-06, + "loss": 1.4762, + "step": 43500, + "train_loss_gtc": 0.032469482421875, + "train_loss_gtm": 0.003238506317138672, + "train_loss_lm": 1.439453125 + }, + { + "epoch": 13.398893669330056, + "grad_norm": 0.42366525530815125, + "learning_rate": 2.7963316550064455e-06, + "loss": 1.4821, + "step": 43600, + "train_loss_gtc": 0.033074951171875, + "train_loss_gtm": 0.010958194732666016, + "train_loss_lm": 1.439765625 + }, + { + "epoch": 13.429625076828518, + "grad_norm": 0.33650246262550354, + "learning_rate": 2.6909813184351873e-06, + "loss": 1.4795, + "step": 43700, + "train_loss_gtc": 0.032664794921875, + "train_loss_gtm": 0.0049641036987304685, + "train_loss_lm": 1.4409375 + }, + { + "epoch": 13.460356484326983, + "grad_norm": 0.34106993675231934, + "learning_rate": 2.5875988949808472e-06, + "loss": 1.4846, + "step": 43800, + "train_loss_gtc": 0.03381591796875, + "train_loss_gtm": 0.013602008819580078, + "train_loss_lm": 1.441796875 + }, + { + "epoch": 13.491087891825446, + "grad_norm": 0.3917493224143982, + "learning_rate": 2.486188685041807e-06, + "loss": 1.4821, + "step": 43900, + "train_loss_gtc": 0.031900634765625, + "train_loss_gtm": 0.008092212677001952, + "train_loss_lm": 1.440859375 + }, + { + "epoch": 13.521819299323909, + "grad_norm": 0.3950476050376892, + "learning_rate": 2.386754906978278e-06, + "loss": 1.4819, + "step": 44000, + "train_loss_gtc": 0.03089111328125, + "train_loss_gtm": 0.008052177429199218, + "train_loss_lm": 1.4409375 + }, + { + "epoch": 13.552550706822373, + "grad_norm": 1.4824786186218262, + "learning_rate": 2.2893016969368575e-06, + "loss": 1.4889, + "step": 44100, + "train_loss_gtc": 0.03397705078125, + "train_loss_gtm": 0.019720077514648438, + "train_loss_lm": 1.441640625 + }, + { + "epoch": 13.583282114320836, + "grad_norm": 0.4232785999774933, + "learning_rate": 2.1938331086784335e-06, + "loss": 1.4796, + "step": 44200, + "train_loss_gtc": 0.030982666015625, + "train_loss_gtm": 0.0070129776000976566, + "train_loss_lm": 1.44140625 + }, + { + "epoch": 13.614013521819299, + "grad_norm": 0.3128654956817627, + "learning_rate": 2.1003531134096255e-06, + "loss": 1.4759, + "step": 44300, + "train_loss_gtc": 0.03089111328125, + "train_loss_gtm": 0.005992927551269531, + "train_loss_lm": 1.43984375 + }, + { + "epoch": 13.644744929317763, + "grad_norm": 0.3076239824295044, + "learning_rate": 2.0088655996175097e-06, + "loss": 1.4805, + "step": 44400, + "train_loss_gtc": 0.031224365234375, + "train_loss_gtm": 0.006224002838134766, + "train_loss_lm": 1.4428125 + }, + { + "epoch": 13.675476336816226, + "grad_norm": 0.3875581622123718, + "learning_rate": 1.9193743729079507e-06, + "loss": 1.4787, + "step": 44500, + "train_loss_gtc": 0.030994873046875, + "train_loss_gtm": 0.005257759094238281, + "train_loss_lm": 1.44125 + }, + { + "epoch": 13.706207744314689, + "grad_norm": 0.3164869546890259, + "learning_rate": 1.8318831558472582e-06, + "loss": 1.4788, + "step": 44600, + "train_loss_gtc": 0.032825927734375, + "train_loss_gtm": 0.008005275726318359, + "train_loss_lm": 1.4421875 + }, + { + "epoch": 13.736939151813154, + "grad_norm": 0.40070638060569763, + "learning_rate": 1.7463955878073424e-06, + "loss": 1.4785, + "step": 44700, + "train_loss_gtc": 0.031497802734375, + "train_loss_gtm": 0.0062798881530761715, + "train_loss_lm": 1.44125 + }, + { + "epoch": 13.767670559311616, + "grad_norm": 0.39651504158973694, + "learning_rate": 1.662915224814321e-06, + "loss": 1.4769, + "step": 44800, + "train_loss_gtc": 0.031278076171875, + "train_loss_gtm": 0.00752462387084961, + "train_loss_lm": 1.44140625 + }, + { + "epoch": 13.79840196681008, + "grad_norm": 0.3495580554008484, + "learning_rate": 1.5814455394006167e-06, + "loss": 1.4801, + "step": 44900, + "train_loss_gtc": 0.03222900390625, + "train_loss_gtm": 0.008823738098144532, + "train_loss_lm": 1.4409375 + }, + { + "epoch": 13.829133374308544, + "grad_norm": 0.33641329407691956, + "learning_rate": 1.501989920460517e-06, + "loss": 1.4793, + "step": 45000, + "train_loss_gtc": 0.03152587890625, + "train_loss_gtm": 0.005212993621826172, + "train_loss_lm": 1.441484375 + }, + { + "epoch": 13.859864781807007, + "grad_norm": 0.4456912577152252, + "learning_rate": 1.4245516731091646e-06, + "loss": 1.4772, + "step": 45100, + "train_loss_gtc": 0.03094482421875, + "train_loss_gtm": 0.002681427001953125, + "train_loss_lm": 1.43921875 + }, + { + "epoch": 13.89059618930547, + "grad_norm": 0.34826260805130005, + "learning_rate": 1.349134018545134e-06, + "loss": 1.4777, + "step": 45200, + "train_loss_gtc": 0.032086181640625, + "train_loss_gtm": 0.006848697662353516, + "train_loss_lm": 1.43953125 + }, + { + "epoch": 13.921327596803934, + "grad_norm": 0.3667930066585541, + "learning_rate": 1.2757400939163833e-06, + "loss": 1.4779, + "step": 45300, + "train_loss_gtc": 0.032056884765625, + "train_loss_gtm": 0.005229644775390625, + "train_loss_lm": 1.44171875 + }, + { + "epoch": 13.952059004302397, + "grad_norm": 0.2943558096885681, + "learning_rate": 1.2043729521897752e-06, + "loss": 1.4775, + "step": 45400, + "train_loss_gtc": 0.03116455078125, + "train_loss_gtm": 0.005698661804199218, + "train_loss_lm": 1.440625 + }, + { + "epoch": 13.98279041180086, + "grad_norm": 0.3945413827896118, + "learning_rate": 1.1350355620241226e-06, + "loss": 1.4789, + "step": 45500, + "train_loss_gtc": 0.03146484375, + "train_loss_gtm": 0.005878944396972657, + "train_loss_lm": 1.442421875 + }, + { + "epoch": 14.0, + "eval_loss": 1.60546875, + "eval_runtime": 3.9133, + "eval_samples_per_second": 254.264, + "eval_steps_per_second": 2.811, + "step": 45556, + "train_loss_gtc": 0.030979701450892856, + "train_loss_gtm": 0.002463647297450474, + "train_loss_lm": 1.4439174107142858, + "val_loss_gtc": 0.06396484375, + "val_loss_gtm": 0.06325559616088867, + "val_loss_lm": 1.47578125 + }, + { + "epoch": 14.013521819299324, + "grad_norm": 0.37518543004989624, + "learning_rate": 1.0677308076466385e-06, + "loss": 1.478, + "step": 45600, + "train_loss_gtc": 0.03331687233664773, + "train_loss_gtm": 0.010442083532159979, + "train_loss_lm": 1.4401633522727273 + }, + { + "epoch": 14.044253226797787, + "grad_norm": 0.3532905876636505, + "learning_rate": 1.002461488733003e-06, + "loss": 1.4728, + "step": 45700, + "train_loss_gtc": 0.0307568359375, + "train_loss_gtm": 0.0067650794982910155, + "train_loss_lm": 1.436328125 + }, + { + "epoch": 14.07498463429625, + "grad_norm": 0.40793976187705994, + "learning_rate": 9.392303202908848e-07, + "loss": 1.473, + "step": 45800, + "train_loss_gtc": 0.02984375, + "train_loss_gtm": 0.002398052215576172, + "train_loss_lm": 1.439375 + }, + { + "epoch": 14.105716041794714, + "grad_norm": 0.3508872985839844, + "learning_rate": 8.780399325470313e-07, + "loss": 1.4732, + "step": 45900, + "train_loss_gtc": 0.031231689453125, + "train_loss_gtm": 0.004491233825683593, + "train_loss_lm": 1.43890625 + }, + { + "epoch": 14.136447449293177, + "grad_norm": 1.9087783098220825, + "learning_rate": 8.188928708378229e-07, + "loss": 1.4757, + "step": 46000, + "train_loss_gtc": 0.030797119140625, + "train_loss_gtm": 0.0036650848388671874, + "train_loss_lm": 1.437265625 + }, + { + "epoch": 14.167178856791642, + "grad_norm": 0.5606856942176819, + "learning_rate": 7.61791595503425e-07, + "loss": 1.4788, + "step": 46100, + "train_loss_gtc": 0.03298828125, + "train_loss_gtm": 0.008520011901855468, + "train_loss_lm": 1.440625 + }, + { + "epoch": 14.197910264290105, + "grad_norm": 0.5114173293113708, + "learning_rate": 7.067384817854184e-07, + "loss": 1.4751, + "step": 46200, + "train_loss_gtc": 0.031270751953125, + "train_loss_gtm": 0.006043624877929687, + "train_loss_lm": 1.438828125 + }, + { + "epoch": 14.228641671788568, + "grad_norm": 0.3046382963657379, + "learning_rate": 6.537358197280241e-07, + "loss": 1.4759, + "step": 46300, + "train_loss_gtc": 0.029881591796875, + "train_loss_gtm": 0.00199981689453125, + "train_loss_lm": 1.44015625 + }, + { + "epoch": 14.259373079287032, + "grad_norm": 0.3361985683441162, + "learning_rate": 6.027858140828235e-07, + "loss": 1.48, + "step": 46400, + "train_loss_gtc": 0.032630615234375, + "train_loss_gtm": 0.00764068603515625, + "train_loss_lm": 1.438125 + }, + { + "epoch": 14.290104486785495, + "grad_norm": 0.332077294588089, + "learning_rate": 5.538905842170649e-07, + "loss": 1.4752, + "step": 46500, + "train_loss_gtc": 0.03033203125, + "train_loss_gtm": 0.008144855499267578, + "train_loss_lm": 1.4384375 + }, + { + "epoch": 14.320835894283958, + "grad_norm": 0.3548417389392853, + "learning_rate": 5.070521640254788e-07, + "loss": 1.4765, + "step": 46600, + "train_loss_gtc": 0.031827392578125, + "train_loss_gtm": 0.005391826629638672, + "train_loss_lm": 1.438671875 + }, + { + "epoch": 14.351567301782422, + "grad_norm": 0.35536205768585205, + "learning_rate": 4.622725018457008e-07, + "loss": 1.4791, + "step": 46700, + "train_loss_gtc": 0.031151123046875, + "train_loss_gtm": 0.005690097808837891, + "train_loss_lm": 1.439765625 + }, + { + "epoch": 14.382298709280885, + "grad_norm": 0.36706480383872986, + "learning_rate": 4.1955346037721445e-07, + "loss": 1.4791, + "step": 46800, + "train_loss_gtc": 0.032288818359375, + "train_loss_gtm": 0.011700859069824219, + "train_loss_lm": 1.438515625 + }, + { + "epoch": 14.413030116779348, + "grad_norm": 0.3751659393310547, + "learning_rate": 3.7889681660386866e-07, + "loss": 1.4776, + "step": 46900, + "train_loss_gtc": 0.03252685546875, + "train_loss_gtm": 0.009815158843994141, + "train_loss_lm": 1.438125 + }, + { + "epoch": 14.443761524277813, + "grad_norm": 0.32809144258499146, + "learning_rate": 3.403042617199592e-07, + "loss": 1.4792, + "step": 47000, + "train_loss_gtc": 0.032452392578125, + "train_loss_gtm": 0.0022745895385742187, + "train_loss_lm": 1.440546875 + }, + { + "epoch": 14.474492931776275, + "grad_norm": 0.3384574055671692, + "learning_rate": 3.037774010598793e-07, + "loss": 1.4798, + "step": 47100, + "train_loss_gtc": 0.03126708984375, + "train_loss_gtm": 0.00357696533203125, + "train_loss_lm": 1.44046875 + }, + { + "epoch": 14.505224339274738, + "grad_norm": 0.3188628554344177, + "learning_rate": 2.6931775403135074e-07, + "loss": 1.4742, + "step": 47200, + "train_loss_gtc": 0.030872802734375, + "train_loss_gtm": 0.0019446945190429688, + "train_loss_lm": 1.439609375 + }, + { + "epoch": 14.535955746773203, + "grad_norm": 0.6094369292259216, + "learning_rate": 2.369267540522191e-07, + "loss": 1.4732, + "step": 47300, + "train_loss_gtc": 0.031270751953125, + "train_loss_gtm": 0.0019117927551269532, + "train_loss_lm": 1.44015625 + }, + { + "epoch": 14.566687154271666, + "grad_norm": 0.3574078381061554, + "learning_rate": 2.0660574849081237e-07, + "loss": 1.477, + "step": 47400, + "train_loss_gtc": 0.030413818359375, + "train_loss_gtm": 0.0030179214477539062, + "train_loss_lm": 1.44078125 + }, + { + "epoch": 14.597418561770128, + "grad_norm": 0.34775152802467346, + "learning_rate": 1.783559986099137e-07, + "loss": 1.4796, + "step": 47500, + "train_loss_gtc": 0.031693115234375, + "train_loss_gtm": 0.0033152008056640623, + "train_loss_lm": 1.4409375 + }, + { + "epoch": 14.628149969268593, + "grad_norm": 0.3424850106239319, + "learning_rate": 1.521786795142921e-07, + "loss": 1.4761, + "step": 47600, + "train_loss_gtc": 0.029925537109375, + "train_loss_gtm": 0.0017087364196777344, + "train_loss_lm": 1.43875 + }, + { + "epoch": 14.658881376767056, + "grad_norm": 0.31570205092430115, + "learning_rate": 1.2807488010181945e-07, + "loss": 1.4798, + "step": 47700, + "train_loss_gtc": 0.032105712890625, + "train_loss_gtm": 0.0031629753112792968, + "train_loss_lm": 1.441171875 + }, + { + "epoch": 14.689612784265519, + "grad_norm": 0.3260713517665863, + "learning_rate": 1.0604560301816224e-07, + "loss": 1.4788, + "step": 47800, + "train_loss_gtc": 0.031483154296875, + "train_loss_gtm": 0.006717433929443359, + "train_loss_lm": 1.439140625 + }, + { + "epoch": 14.720344191763983, + "grad_norm": 0.3307570219039917, + "learning_rate": 8.609176461510938e-08, + "loss": 1.4739, + "step": 47900, + "train_loss_gtc": 0.03117919921875, + "train_loss_gtm": 0.005090217590332031, + "train_loss_lm": 1.438203125 + }, + { + "epoch": 14.751075599262446, + "grad_norm": 0.29519036412239075, + "learning_rate": 6.821419491241376e-08, + "loss": 1.472, + "step": 48000, + "train_loss_gtc": 0.030118408203125, + "train_loss_gtm": 0.002861900329589844, + "train_loss_lm": 1.440078125 + }, + { + "epoch": 14.781807006760909, + "grad_norm": 0.38826602697372437, + "learning_rate": 5.2413637563292205e-08, + "loss": 1.4736, + "step": 48100, + "train_loss_gtc": 0.030203857421875, + "train_loss_gtm": 0.004720573425292969, + "train_loss_lm": 1.43859375 + }, + { + "epoch": 14.812538414259373, + "grad_norm": 0.4058314263820648, + "learning_rate": 3.8690749823488967e-08, + "loss": 1.4767, + "step": 48200, + "train_loss_gtc": 0.03099365234375, + "train_loss_gtm": 0.005015640258789062, + "train_loss_lm": 1.4378125 + }, + { + "epoch": 14.843269821757836, + "grad_norm": 0.38652893900871277, + "learning_rate": 2.7046102523919927e-08, + "loss": 1.471, + "step": 48300, + "train_loss_gtc": 0.030880126953125, + "train_loss_gtm": 0.0031046104431152344, + "train_loss_lm": 1.439609375 + }, + { + "epoch": 14.8740012292563, + "grad_norm": 0.3828943967819214, + "learning_rate": 1.748018004694707e-08, + "loss": 1.4734, + "step": 48400, + "train_loss_gtc": 0.029947509765625, + "train_loss_gtm": 0.0023802757263183595, + "train_loss_lm": 1.43984375 + }, + { + "epoch": 14.904732636754764, + "grad_norm": 0.3241247832775116, + "learning_rate": 9.993380306222432e-09, + "loss": 1.4751, + "step": 48500, + "train_loss_gtc": 0.0314208984375, + "train_loss_gtm": 0.005055904388427734, + "train_loss_lm": 1.440234375 + }, + { + "epoch": 14.935464044253226, + "grad_norm": 0.3689234256744385, + "learning_rate": 4.586014730140198e-09, + "loss": 1.477, + "step": 48600, + "train_loss_gtc": 0.032603759765625, + "train_loss_gtm": 0.004544639587402343, + "train_loss_lm": 1.43734375 + }, + { + "epoch": 14.966195451751691, + "grad_norm": 0.376676082611084, + "learning_rate": 1.2583082488581976e-09, + "loss": 1.4825, + "step": 48700, + "train_loss_gtc": 0.03310546875, + "train_loss_gtm": 0.009465179443359374, + "train_loss_lm": 1.439765625 + }, + { + "epoch": 14.996926859250154, + "grad_norm": 0.3576776385307312, + "learning_rate": 1.0399284983142465e-11, + "loss": 1.4777, + "step": 48800, + "train_loss_gtc": 0.0317724609375, + "train_loss_gtm": 0.005117168426513672, + "train_loss_lm": 1.439921875 + }, + { + "epoch": 15.0, + "eval_loss": 1.6144530773162842, + "eval_runtime": 3.9176, + "eval_samples_per_second": 253.984, + "eval_steps_per_second": 2.808, + "step": 48810, + "train_loss_gtc": 0.03424072265625, + "train_loss_gtm": 0.010280990600585937, + "train_loss_lm": 1.44140625, + "val_loss_gtc": 0.0637451171875, + "val_loss_gtm": 0.06403846740722656, + "val_loss_lm": 1.475 + } + ], + "logging_steps": 100, + "max_steps": 48810, + "num_input_tokens_seen": 0, + "num_train_epochs": 15, + "save_steps": 3, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 48, + "trial_name": null, + "trial_params": null +}