{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 49.82051282051282,
  "eval_steps": 500,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.41025641025641024,
      "grad_norm": 8.151017189025879,
      "learning_rate": 0.0,
      "loss": 1.3291,
      "step": 1
    },
    {
      "epoch": 0.8205128205128205,
      "grad_norm": 8.3428955078125,
      "learning_rate": 1.1111111111111112e-05,
      "loss": 1.3498,
      "step": 2
    },
    {
      "epoch": 1.4102564102564101,
      "grad_norm": 3.494961738586426,
      "learning_rate": 2.2222222222222223e-05,
      "loss": 1.3324,
      "step": 3
    },
    {
      "epoch": 1.8205128205128205,
      "grad_norm": 3.1425204277038574,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 1.127,
      "step": 4
    },
    {
      "epoch": 2.41025641025641,
      "grad_norm": 2.282806396484375,
      "learning_rate": 4.4444444444444447e-05,
      "loss": 1.029,
      "step": 5
    },
    {
      "epoch": 2.8205128205128203,
      "grad_norm": 4.562756061553955,
      "learning_rate": 5.555555555555556e-05,
      "loss": 0.8966,
      "step": 6
    },
    {
      "epoch": 3.41025641025641,
      "grad_norm": 5.485057353973389,
      "learning_rate": 6.666666666666667e-05,
      "loss": 0.8497,
      "step": 7
    },
    {
      "epoch": 3.8205128205128203,
      "grad_norm": 1.8801462650299072,
      "learning_rate": 7.777777777777778e-05,
      "loss": 0.71,
      "step": 8
    },
    {
      "epoch": 4.410256410256411,
      "grad_norm": 1.7765756845474243,
      "learning_rate": 8.888888888888889e-05,
      "loss": 0.6853,
      "step": 9
    },
    {
      "epoch": 4.82051282051282,
      "grad_norm": 2.199324131011963,
      "learning_rate": 0.0001,
      "loss": 0.535,
      "step": 10
    },
    {
      "epoch": 5.410256410256411,
      "grad_norm": 2.8563392162323,
      "learning_rate": 0.00011111111111111112,
      "loss": 0.5501,
      "step": 11
    },
    {
      "epoch": 5.82051282051282,
      "grad_norm": 2.2397141456604004,
      "learning_rate": 0.00012222222222222224,
      "loss": 0.3953,
      "step": 12
    },
    {
      "epoch": 6.410256410256411,
      "grad_norm": 1.4788132905960083,
      "learning_rate": 0.00013333333333333334,
      "loss": 0.3417,
      "step": 13
    },
    {
      "epoch": 6.82051282051282,
      "grad_norm": 1.7762037515640259,
      "learning_rate": 0.00014444444444444444,
      "loss": 0.2219,
      "step": 14
    },
    {
      "epoch": 7.410256410256411,
      "grad_norm": 0.7586674690246582,
      "learning_rate": 0.00015555555555555556,
      "loss": 0.2104,
      "step": 15
    },
    {
      "epoch": 7.82051282051282,
      "grad_norm": 0.9284120202064514,
      "learning_rate": 0.0001666666666666667,
      "loss": 0.1662,
      "step": 16
    },
    {
      "epoch": 8.41025641025641,
      "grad_norm": 0.6511984467506409,
      "learning_rate": 0.00017777777777777779,
      "loss": 0.1547,
      "step": 17
    },
    {
      "epoch": 8.820512820512821,
      "grad_norm": 1.1308850049972534,
      "learning_rate": 0.00018888888888888888,
      "loss": 0.1444,
      "step": 18
    },
    {
      "epoch": 9.41025641025641,
      "grad_norm": 0.662821352481842,
      "learning_rate": 0.0002,
      "loss": 0.123,
      "step": 19
    },
    {
      "epoch": 9.820512820512821,
      "grad_norm": 0.5322834253311157,
      "learning_rate": 0.00019999854312354064,
      "loss": 0.1036,
      "step": 20
    },
    {
      "epoch": 10.41025641025641,
      "grad_norm": 0.5911012887954712,
      "learning_rate": 0.00019999417253661235,
      "loss": 0.0969,
      "step": 21
    },
    {
      "epoch": 10.820512820512821,
      "grad_norm": 0.628558874130249,
      "learning_rate": 0.00019998688836656323,
      "loss": 0.0857,
      "step": 22
    },
    {
      "epoch": 11.41025641025641,
      "grad_norm": 0.7193434834480286,
      "learning_rate": 0.00019997669082563597,
      "loss": 0.0748,
      "step": 23
    },
    {
      "epoch": 11.820512820512821,
      "grad_norm": 0.3524335026741028,
      "learning_rate": 0.00019996358021096176,
      "loss": 0.066,
      "step": 24
    },
    {
      "epoch": 12.41025641025641,
      "grad_norm": 0.47225716710090637,
      "learning_rate": 0.00019994755690455152,
      "loss": 0.0613,
      "step": 25
    },
    {
      "epoch": 12.820512820512821,
      "grad_norm": 0.4532913267612457,
      "learning_rate": 0.00019992862137328474,
      "loss": 0.0408,
      "step": 26
    },
    {
      "epoch": 13.41025641025641,
      "grad_norm": 0.2988104224205017,
      "learning_rate": 0.00019990677416889608,
      "loss": 0.0353,
      "step": 27
    },
    {
      "epoch": 13.820512820512821,
      "grad_norm": 0.31864798069000244,
      "learning_rate": 0.0001998820159279591,
      "loss": 0.033,
      "step": 28
    },
    {
      "epoch": 14.41025641025641,
      "grad_norm": 0.29291290044784546,
      "learning_rate": 0.0001998543473718677,
      "loss": 0.0235,
      "step": 29
    },
    {
      "epoch": 14.820512820512821,
      "grad_norm": 0.24096333980560303,
      "learning_rate": 0.00019982376930681531,
      "loss": 0.0194,
      "step": 30
    },
    {
      "epoch": 15.41025641025641,
      "grad_norm": 0.2400427609682083,
      "learning_rate": 0.00019979028262377118,
      "loss": 0.0177,
      "step": 31
    },
    {
      "epoch": 15.820512820512821,
      "grad_norm": 0.23485173285007477,
      "learning_rate": 0.00019975388829845448,
      "loss": 0.0132,
      "step": 32
    },
    {
      "epoch": 16.41025641025641,
      "grad_norm": 0.4795994758605957,
      "learning_rate": 0.00019971458739130598,
      "loss": 0.0123,
      "step": 33
    },
    {
      "epoch": 16.82051282051282,
      "grad_norm": 0.3436650335788727,
      "learning_rate": 0.00019967238104745696,
      "loss": 0.0077,
      "step": 34
    },
    {
      "epoch": 17.41025641025641,
      "grad_norm": 0.24164724349975586,
      "learning_rate": 0.000199627270496696,
      "loss": 0.0083,
      "step": 35
    },
    {
      "epoch": 17.82051282051282,
      "grad_norm": 0.11744043976068497,
      "learning_rate": 0.0001995792570534331,
      "loss": 0.0053,
      "step": 36
    },
    {
      "epoch": 18.41025641025641,
      "grad_norm": 0.2771929204463959,
      "learning_rate": 0.0001995283421166614,
      "loss": 0.0076,
      "step": 37
    },
    {
      "epoch": 18.82051282051282,
      "grad_norm": 0.14852669835090637,
      "learning_rate": 0.00019947452716991633,
      "loss": 0.0042,
      "step": 38
    },
    {
      "epoch": 19.41025641025641,
      "grad_norm": 1.1028482913970947,
      "learning_rate": 0.00019941781378123244,
      "loss": 0.0114,
      "step": 39
    },
    {
      "epoch": 19.82051282051282,
      "grad_norm": 0.23756887018680573,
      "learning_rate": 0.00019935820360309777,
      "loss": 0.0043,
      "step": 40
    },
    {
      "epoch": 20.41025641025641,
      "grad_norm": 0.8769266605377197,
      "learning_rate": 0.00019929569837240564,
      "loss": 0.0047,
      "step": 41
    },
    {
      "epoch": 20.82051282051282,
      "grad_norm": 0.531132698059082,
      "learning_rate": 0.00019923029991040402,
      "loss": 0.0063,
      "step": 42
    },
    {
      "epoch": 21.41025641025641,
      "grad_norm": 1.1996066570281982,
      "learning_rate": 0.00019916201012264254,
      "loss": 0.0143,
      "step": 43
    },
    {
      "epoch": 21.82051282051282,
      "grad_norm": 0.6255332827568054,
      "learning_rate": 0.0001990908309989168,
      "loss": 0.0168,
      "step": 44
    },
    {
      "epoch": 22.41025641025641,
      "grad_norm": 49.508148193359375,
      "learning_rate": 0.00019901676461321068,
      "loss": 0.0621,
      "step": 45
    },
    {
      "epoch": 22.82051282051282,
      "grad_norm": 8.113191604614258,
      "learning_rate": 0.00019893981312363562,
      "loss": 0.1062,
      "step": 46
    },
    {
      "epoch": 23.41025641025641,
      "grad_norm": 2.3446950912475586,
      "learning_rate": 0.00019885997877236788,
      "loss": 0.066,
      "step": 47
    },
    {
      "epoch": 23.82051282051282,
      "grad_norm": 153.14146423339844,
      "learning_rate": 0.00019877726388558325,
      "loss": 0.0612,
      "step": 48
    },
    {
      "epoch": 24.41025641025641,
      "grad_norm": 43.04759216308594,
      "learning_rate": 0.00019869167087338907,
      "loss": 0.135,
      "step": 49
    },
    {
      "epoch": 24.82051282051282,
      "grad_norm": 51.32644271850586,
      "learning_rate": 0.00019860320222975431,
      "loss": 0.1375,
      "step": 50
    },
    {
      "epoch": 25.41025641025641,
      "grad_norm": 1.9464935064315796,
      "learning_rate": 0.00019851186053243666,
      "loss": 0.3427,
      "step": 51
    },
    {
      "epoch": 25.82051282051282,
      "grad_norm": 381.23974609375,
      "learning_rate": 0.00019841764844290744,
      "loss": 2.1722,
      "step": 52
    },
    {
      "epoch": 26.41025641025641,
      "grad_norm": 119.89301300048828,
      "learning_rate": 0.00019832056870627417,
      "loss": 2.4659,
      "step": 53
    },
    {
      "epoch": 26.82051282051282,
      "grad_norm": 48.73936080932617,
      "learning_rate": 0.00019822062415120054,
      "loss": 0.8509,
      "step": 54
    },
    {
      "epoch": 27.41025641025641,
      "grad_norm": 19.564029693603516,
      "learning_rate": 0.0001981178176898239,
      "loss": 0.4258,
      "step": 55
    },
    {
      "epoch": 27.82051282051282,
      "grad_norm": 33.161495208740234,
      "learning_rate": 0.00019801215231767056,
      "loss": 0.2414,
      "step": 56
    },
    {
      "epoch": 28.41025641025641,
      "grad_norm": 5.548079013824463,
      "learning_rate": 0.00019790363111356837,
      "loss": 0.1882,
      "step": 57
    },
    {
      "epoch": 28.82051282051282,
      "grad_norm": 4.21547794342041,
      "learning_rate": 0.00019779225723955707,
      "loss": 0.1264,
      "step": 58
    },
    {
      "epoch": 29.41025641025641,
      "grad_norm": 2.7072598934173584,
      "learning_rate": 0.00019767803394079615,
      "loss": 0.2181,
      "step": 59
    },
    {
      "epoch": 29.82051282051282,
      "grad_norm": 7.378205299377441,
      "learning_rate": 0.0001975609645454704,
      "loss": 0.1593,
      "step": 60
    },
    {
      "epoch": 30.41025641025641,
      "grad_norm": 9.17626667022705,
      "learning_rate": 0.00019744105246469263,
      "loss": 0.3464,
      "step": 61
    },
    {
      "epoch": 30.82051282051282,
      "grad_norm": 27.878585815429688,
      "learning_rate": 0.00019731830119240463,
      "loss": 0.4882,
      "step": 62
    },
    {
      "epoch": 31.41025641025641,
      "grad_norm": 15.55352783203125,
      "learning_rate": 0.0001971927143052752,
      "loss": 0.7857,
      "step": 63
    },
    {
      "epoch": 31.82051282051282,
      "grad_norm": 16.477920532226562,
      "learning_rate": 0.00019706429546259593,
      "loss": 0.663,
      "step": 64
    },
    {
      "epoch": 32.41025641025641,
      "grad_norm": 13.829732894897461,
      "learning_rate": 0.00019693304840617457,
      "loss": 0.5652,
      "step": 65
    },
    {
      "epoch": 32.82051282051282,
      "grad_norm": 1.885118842124939,
      "learning_rate": 0.00019679897696022608,
      "loss": 0.2583,
      "step": 66
    },
    {
      "epoch": 33.41025641025641,
      "grad_norm": 1.9031124114990234,
      "learning_rate": 0.00019666208503126112,
      "loss": 0.2566,
      "step": 67
    },
    {
      "epoch": 33.82051282051282,
      "grad_norm": 1.2540283203125,
      "learning_rate": 0.0001965223766079723,
      "loss": 0.1855,
      "step": 68
    },
    {
      "epoch": 34.41025641025641,
      "grad_norm": 0.9428790807723999,
      "learning_rate": 0.00019637985576111778,
      "loss": 0.1633,
      "step": 69
    },
    {
      "epoch": 34.82051282051282,
      "grad_norm": 0.8358070254325867,
      "learning_rate": 0.00019623452664340306,
      "loss": 0.1277,
      "step": 70
    },
    {
      "epoch": 35.41025641025641,
      "grad_norm": 0.9116950631141663,
      "learning_rate": 0.0001960863934893594,
      "loss": 0.1124,
      "step": 71
    },
    {
      "epoch": 35.82051282051282,
      "grad_norm": 1.235021948814392,
      "learning_rate": 0.00019593546061522093,
      "loss": 0.0928,
      "step": 72
    },
    {
      "epoch": 36.41025641025641,
      "grad_norm": 0.7440080046653748,
      "learning_rate": 0.00019578173241879872,
      "loss": 0.0839,
      "step": 73
    },
    {
      "epoch": 36.82051282051282,
      "grad_norm": 0.5238239765167236,
      "learning_rate": 0.00019562521337935257,
      "loss": 0.0589,
      "step": 74
    },
    {
      "epoch": 37.41025641025641,
      "grad_norm": 0.637015700340271,
      "learning_rate": 0.00019546590805746052,
      "loss": 0.0538,
      "step": 75
    },
    {
      "epoch": 37.82051282051282,
      "grad_norm": 0.3730023205280304,
      "learning_rate": 0.0001953038210948861,
      "loss": 0.0379,
      "step": 76
    },
    {
      "epoch": 38.41025641025641,
      "grad_norm": 0.39598342776298523,
      "learning_rate": 0.00019513895721444286,
      "loss": 0.0314,
      "step": 77
    },
    {
      "epoch": 38.82051282051282,
      "grad_norm": 0.26019713282585144,
      "learning_rate": 0.00019497132121985695,
      "loss": 0.0247,
      "step": 78
    },
    {
      "epoch": 39.41025641025641,
      "grad_norm": 0.27270156145095825,
      "learning_rate": 0.00019480091799562704,
      "loss": 0.0219,
      "step": 79
    },
    {
      "epoch": 39.82051282051282,
      "grad_norm": 0.31213200092315674,
      "learning_rate": 0.0001946277525068821,
      "loss": 0.0177,
      "step": 80
    },
    {
      "epoch": 40.41025641025641,
      "grad_norm": 0.3065904676914215,
      "learning_rate": 0.00019445182979923654,
      "loss": 0.0167,
      "step": 81
    },
    {
      "epoch": 40.82051282051282,
      "grad_norm": 0.25565171241760254,
      "learning_rate": 0.00019427315499864344,
      "loss": 0.0132,
      "step": 82
    },
    {
      "epoch": 41.41025641025641,
      "grad_norm": 0.16997747123241425,
      "learning_rate": 0.000194091733311245,
      "loss": 0.0106,
      "step": 83
    },
    {
      "epoch": 41.82051282051282,
      "grad_norm": 0.13165056705474854,
      "learning_rate": 0.0001939075700232209,
      "loss": 0.0108,
      "step": 84
    },
    {
      "epoch": 42.41025641025641,
      "grad_norm": 0.10982735455036163,
      "learning_rate": 0.00019372067050063438,
      "loss": 0.0096,
      "step": 85
    },
    {
      "epoch": 42.82051282051282,
      "grad_norm": 0.10672740638256073,
      "learning_rate": 0.00019353104018927567,
      "loss": 0.0083,
      "step": 86
    },
    {
      "epoch": 43.41025641025641,
      "grad_norm": 0.1570005714893341,
      "learning_rate": 0.0001933386846145036,
      "loss": 0.0066,
      "step": 87
    },
    {
      "epoch": 43.82051282051282,
      "grad_norm": 0.1381327509880066,
      "learning_rate": 0.00019314360938108425,
      "loss": 0.008,
      "step": 88
    },
    {
      "epoch": 44.41025641025641,
      "grad_norm": 0.13799023628234863,
      "learning_rate": 0.00019294582017302797,
      "loss": 0.0075,
      "step": 89
    },
    {
      "epoch": 44.82051282051282,
      "grad_norm": 0.07857757061719894,
      "learning_rate": 0.00019274532275342354,
      "loss": 0.0058,
      "step": 90
    },
    {
      "epoch": 45.41025641025641,
      "grad_norm": 0.40940356254577637,
      "learning_rate": 0.00019254212296427044,
      "loss": 0.0078,
      "step": 91
    },
    {
      "epoch": 45.82051282051282,
      "grad_norm": 0.13838538527488708,
      "learning_rate": 0.0001923362267263084,
      "loss": 0.0063,
      "step": 92
    },
    {
      "epoch": 46.41025641025641,
      "grad_norm": 0.1280914694070816,
      "learning_rate": 0.0001921276400388451,
      "loss": 0.0051,
      "step": 93
    },
    {
      "epoch": 46.82051282051282,
      "grad_norm": 0.1300235092639923,
      "learning_rate": 0.00019191636897958122,
      "loss": 0.0045,
      "step": 94
    },
    {
      "epoch": 47.41025641025641,
      "grad_norm": 0.05682254955172539,
      "learning_rate": 0.00019170241970443343,
      "loss": 0.0045,
      "step": 95
    },
    {
      "epoch": 47.82051282051282,
      "grad_norm": 0.06927549839019775,
      "learning_rate": 0.00019148579844735497,
      "loss": 0.0032,
      "step": 96
    },
    {
      "epoch": 48.41025641025641,
      "grad_norm": 0.09624794870615005,
      "learning_rate": 0.00019126651152015403,
      "loss": 0.0041,
      "step": 97
    },
    {
      "epoch": 48.82051282051282,
      "grad_norm": 0.0919504463672638,
      "learning_rate": 0.00019104456531230984,
      "loss": 0.0032,
      "step": 98
    },
    {
      "epoch": 49.41025641025641,
      "grad_norm": 0.20492327213287354,
      "learning_rate": 0.00019081996629078657,
      "loss": 0.0039,
      "step": 99
    },
    {
      "epoch": 49.82051282051282,
      "grad_norm": 0.042908795177936554,
      "learning_rate": 0.0001905927209998447,
      "loss": 0.002,
      "step": 100
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 600,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 300,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 7.187460722471731e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}