{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.1,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "grad_norm": 22.986753463745117,
      "learning_rate": 0.00198,
      "loss": 1.7003,
      "step": 10
    },
    {
      "epoch": 0.0,
      "grad_norm": 8.073963165283203,
      "learning_rate": 0.00196,
      "loss": 1.9785,
      "step": 20
    },
    {
      "epoch": 0.01,
      "grad_norm": 7.6197099685668945,
      "learning_rate": 0.0019399999999999999,
      "loss": 1.9353,
      "step": 30
    },
    {
      "epoch": 0.01,
      "grad_norm": 6.2731146812438965,
      "learning_rate": 0.00192,
      "loss": 1.5962,
      "step": 40
    },
    {
      "epoch": 0.01,
      "grad_norm": 5.425559043884277,
      "learning_rate": 0.0019,
      "loss": 1.389,
      "step": 50
    },
    {
      "epoch": 0.01,
      "grad_norm": 4.872774600982666,
      "learning_rate": 0.00188,
      "loss": 1.4156,
      "step": 60
    },
    {
      "epoch": 0.01,
      "grad_norm": 5.199490070343018,
      "learning_rate": 0.00186,
      "loss": 1.6583,
      "step": 70
    },
    {
      "epoch": 0.02,
      "grad_norm": 4.413191318511963,
      "learning_rate": 0.00184,
      "loss": 1.4334,
      "step": 80
    },
    {
      "epoch": 0.02,
      "grad_norm": 5.90674352645874,
      "learning_rate": 0.00182,
      "loss": 1.6046,
      "step": 90
    },
    {
      "epoch": 0.02,
      "grad_norm": 6.410930633544922,
      "learning_rate": 0.0018000000000000002,
      "loss": 1.5504,
      "step": 100
    },
    {
      "epoch": 0.02,
      "grad_norm": 3.529223680496216,
      "learning_rate": 0.0017800000000000001,
      "loss": 1.6463,
      "step": 110
    },
    {
      "epoch": 0.02,
      "grad_norm": 4.781284332275391,
      "learning_rate": 0.00176,
      "loss": 1.6136,
      "step": 120
    },
    {
      "epoch": 0.03,
      "grad_norm": 5.6382951736450195,
      "learning_rate": 0.00174,
      "loss": 1.5105,
      "step": 130
    },
    {
      "epoch": 0.03,
      "grad_norm": 4.392839431762695,
      "learning_rate": 0.00172,
      "loss": 1.6061,
      "step": 140
    },
    {
      "epoch": 0.03,
      "grad_norm": 3.9011926651000977,
      "learning_rate": 0.0017,
      "loss": 1.6188,
      "step": 150
    },
    {
      "epoch": 0.03,
      "grad_norm": 4.002920627593994,
      "learning_rate": 0.00168,
      "loss": 1.4177,
      "step": 160
    },
    {
      "epoch": 0.03,
      "grad_norm": 4.34838342666626,
      "learning_rate": 0.00166,
      "loss": 1.5689,
      "step": 170
    },
    {
      "epoch": 0.04,
      "grad_norm": 8.142854690551758,
      "learning_rate": 0.00164,
      "loss": 1.5804,
      "step": 180
    },
    {
      "epoch": 0.04,
      "grad_norm": 5.837989330291748,
      "learning_rate": 0.0016200000000000001,
      "loss": 1.5981,
      "step": 190
    },
    {
      "epoch": 0.04,
      "grad_norm": 2.33852219581604,
      "learning_rate": 0.0016,
      "loss": 1.4347,
      "step": 200
    },
    {
      "epoch": 0.04,
      "grad_norm": 3.069826602935791,
      "learning_rate": 0.00158,
      "loss": 1.4809,
      "step": 210
    },
    {
      "epoch": 0.04,
      "grad_norm": 2.71095609664917,
      "learning_rate": 0.0015600000000000002,
      "loss": 1.388,
      "step": 220
    },
    {
      "epoch": 0.05,
      "grad_norm": 4.450407981872559,
      "learning_rate": 0.0015400000000000001,
      "loss": 1.603,
      "step": 230
    },
    {
      "epoch": 0.05,
      "grad_norm": 4.475738048553467,
      "learning_rate": 0.00152,
      "loss": 1.5731,
      "step": 240
    },
    {
      "epoch": 0.05,
      "grad_norm": 3.051819086074829,
      "learning_rate": 0.0015,
      "loss": 1.5133,
      "step": 250
    },
    {
      "epoch": 0.05,
      "grad_norm": 5.014269828796387,
      "learning_rate": 0.00148,
      "loss": 1.5458,
      "step": 260
    },
    {
      "epoch": 0.05,
      "grad_norm": 2.558957815170288,
      "learning_rate": 0.00146,
      "loss": 1.4918,
      "step": 270
    },
    {
      "epoch": 0.06,
      "grad_norm": 4.6234660148620605,
      "learning_rate": 0.0014399999999999999,
      "loss": 1.5247,
      "step": 280
    },
    {
      "epoch": 0.06,
      "grad_norm": 2.9923095703125,
      "learning_rate": 0.00142,
      "loss": 1.6671,
      "step": 290
    },
    {
      "epoch": 0.06,
      "grad_norm": 7.883978366851807,
      "learning_rate": 0.0014,
      "loss": 1.5732,
      "step": 300
    },
    {
      "epoch": 0.06,
      "grad_norm": 3.3218066692352295,
      "learning_rate": 0.00138,
      "loss": 1.6297,
      "step": 310
    },
    {
      "epoch": 0.06,
      "grad_norm": 9.045559883117676,
      "learning_rate": 0.00136,
      "loss": 1.6581,
      "step": 320
    },
    {
      "epoch": 0.07,
      "grad_norm": 2.832301139831543,
      "learning_rate": 0.00134,
      "loss": 1.6966,
      "step": 330
    },
    {
      "epoch": 0.07,
      "grad_norm": 3.6719107627868652,
      "learning_rate": 0.00132,
      "loss": 1.5904,
      "step": 340
    },
    {
      "epoch": 0.07,
      "grad_norm": 5.4335455894470215,
      "learning_rate": 0.0013000000000000002,
      "loss": 1.6643,
      "step": 350
    },
    {
      "epoch": 0.07,
      "grad_norm": 3.2848339080810547,
      "learning_rate": 0.00128,
      "loss": 1.4174,
      "step": 360
    },
    {
      "epoch": 0.07,
      "grad_norm": 2.8206841945648193,
      "learning_rate": 0.00126,
      "loss": 1.7362,
      "step": 370
    },
    {
      "epoch": 0.08,
      "grad_norm": 3.389599084854126,
      "learning_rate": 0.00124,
      "loss": 1.6058,
      "step": 380
    },
    {
      "epoch": 0.08,
      "grad_norm": 4.887266159057617,
      "learning_rate": 0.00122,
      "loss": 1.4604,
      "step": 390
    },
    {
      "epoch": 0.08,
      "grad_norm": 2.9653384685516357,
      "learning_rate": 0.0012,
      "loss": 1.5152,
      "step": 400
    },
    {
      "epoch": 0.08,
      "grad_norm": 2.5362136363983154,
      "learning_rate": 0.00118,
      "loss": 1.469,
      "step": 410
    },
    {
      "epoch": 0.08,
      "grad_norm": 2.7318670749664307,
      "learning_rate": 0.00116,
      "loss": 1.4136,
      "step": 420
    },
    {
      "epoch": 0.09,
      "grad_norm": 3.6364078521728516,
      "learning_rate": 0.00114,
      "loss": 1.6937,
      "step": 430
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.9428081512451172,
      "learning_rate": 0.0011200000000000001,
      "loss": 1.4825,
      "step": 440
    },
    {
      "epoch": 0.09,
      "grad_norm": 2.1813700199127197,
      "learning_rate": 0.0011,
      "loss": 1.4593,
      "step": 450
    },
    {
      "epoch": 0.09,
      "grad_norm": 4.612652778625488,
      "learning_rate": 0.00108,
      "loss": 1.389,
      "step": 460
    },
    {
      "epoch": 0.09,
      "grad_norm": 2.5145719051361084,
      "learning_rate": 0.0010600000000000002,
      "loss": 1.3896,
      "step": 470
    },
    {
      "epoch": 0.1,
      "grad_norm": 2.4980382919311523,
      "learning_rate": 0.0010400000000000001,
      "loss": 1.3725,
      "step": 480
    },
    {
      "epoch": 0.1,
      "grad_norm": 2.6995227336883545,
      "learning_rate": 0.00102,
      "loss": 1.4769,
      "step": 490
    },
    {
      "epoch": 0.1,
      "grad_norm": 2.1483154296875,
      "learning_rate": 0.001,
      "loss": 1.5983,
      "step": 500
    }
  ],
  "logging_steps": 10,
  "max_steps": 1000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "total_flos": 4.0788232684018176e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}