| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.6149993409779886, |
| "eval_steps": 100, |
| "global_step": 3100, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.08, |
| "grad_norm": 6.90625, |
| "learning_rate": 3e-06, |
| "loss": 3.4225, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.08, |
| "eval_loss": 3.0902585983276367, |
| "eval_runtime": 125.4074, |
| "eval_samples_per_second": 67.221, |
| "eval_steps_per_second": 33.61, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.17, |
| "grad_norm": 3.3125, |
| "learning_rate": 6e-06, |
| "loss": 2.6313, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.17, |
| "eval_loss": 1.4370466470718384, |
| "eval_runtime": 124.5179, |
| "eval_samples_per_second": 67.701, |
| "eval_steps_per_second": 33.851, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 1.4453125, |
| "learning_rate": 9e-06, |
| "loss": 1.2926, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.25, |
| "eval_loss": 1.0395174026489258, |
| "eval_runtime": 125.1005, |
| "eval_samples_per_second": 67.386, |
| "eval_steps_per_second": 33.693, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 1.6015625, |
| "learning_rate": 1.2e-05, |
| "loss": 1.1492, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.34, |
| "eval_loss": 1.0025979280471802, |
| "eval_runtime": 124.2401, |
| "eval_samples_per_second": 67.852, |
| "eval_steps_per_second": 33.926, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 1.2734375, |
| "learning_rate": 1.5e-05, |
| "loss": 1.1062, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.42, |
| "eval_loss": 0.9820207357406616, |
| "eval_runtime": 124.4083, |
| "eval_samples_per_second": 67.761, |
| "eval_steps_per_second": 33.88, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.51, |
| "grad_norm": 1.3984375, |
| "learning_rate": 1.4960378963658215e-05, |
| "loss": 1.0761, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.51, |
| "eval_loss": 0.9677473902702332, |
| "eval_runtime": 124.3968, |
| "eval_samples_per_second": 67.767, |
| "eval_steps_per_second": 33.884, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.59, |
| "grad_norm": 1.328125, |
| "learning_rate": 1.484193447503841e-05, |
| "loss": 1.056, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.59, |
| "eval_loss": 0.9569339156150818, |
| "eval_runtime": 124.228, |
| "eval_samples_per_second": 67.859, |
| "eval_steps_per_second": 33.93, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.67, |
| "grad_norm": 1.359375, |
| "learning_rate": 1.4645917972377404e-05, |
| "loss": 1.0446, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.67, |
| "eval_loss": 0.9495565891265869, |
| "eval_runtime": 124.2985, |
| "eval_samples_per_second": 67.821, |
| "eval_steps_per_second": 33.91, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.76, |
| "grad_norm": 1.3125, |
| "learning_rate": 1.4374400489535342e-05, |
| "loss": 1.0399, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.76, |
| "eval_loss": 0.9445509910583496, |
| "eval_runtime": 124.152, |
| "eval_samples_per_second": 67.901, |
| "eval_steps_per_second": 33.95, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.84, |
| "grad_norm": 1.2578125, |
| "learning_rate": 1.403025077426025e-05, |
| "loss": 1.0252, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.84, |
| "eval_loss": 0.9403882026672363, |
| "eval_runtime": 123.8904, |
| "eval_samples_per_second": 68.044, |
| "eval_steps_per_second": 34.022, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.93, |
| "grad_norm": 1.296875, |
| "learning_rate": 1.3617104978119044e-05, |
| "loss": 1.0284, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.93, |
| "eval_loss": 0.9368470907211304, |
| "eval_runtime": 123.9167, |
| "eval_samples_per_second": 68.03, |
| "eval_steps_per_second": 34.015, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.01, |
| "grad_norm": 1.375, |
| "learning_rate": 1.3139328238339287e-05, |
| "loss": 1.0171, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.01, |
| "eval_loss": 0.9342640042304993, |
| "eval_runtime": 124.6542, |
| "eval_samples_per_second": 67.627, |
| "eval_steps_per_second": 33.814, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.1, |
| "grad_norm": 1.21875, |
| "learning_rate": 1.2601968557473e-05, |
| "loss": 1.0086, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.1, |
| "eval_loss": 0.9315484166145325, |
| "eval_runtime": 124.4874, |
| "eval_samples_per_second": 67.718, |
| "eval_steps_per_second": 33.859, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.18, |
| "grad_norm": 1.2265625, |
| "learning_rate": 1.2010703468171973e-05, |
| "loss": 1.0056, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.18, |
| "eval_loss": 0.9296947717666626, |
| "eval_runtime": 124.175, |
| "eval_samples_per_second": 67.888, |
| "eval_steps_per_second": 33.944, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.27, |
| "grad_norm": 1.25, |
| "learning_rate": 1.1371780046593758e-05, |
| "loss": 1.0083, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.27, |
| "eval_loss": 0.9280151128768921, |
| "eval_runtime": 123.9014, |
| "eval_samples_per_second": 68.038, |
| "eval_steps_per_second": 34.019, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.35, |
| "grad_norm": 1.2890625, |
| "learning_rate": 1.069194890823328e-05, |
| "loss": 1.0037, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.35, |
| "eval_loss": 0.926633358001709, |
| "eval_runtime": 123.7914, |
| "eval_samples_per_second": 68.098, |
| "eval_steps_per_second": 34.049, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.43, |
| "grad_norm": 1.1796875, |
| "learning_rate": 9.978392883554342e-06, |
| "loss": 0.9951, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.43, |
| "eval_loss": 0.9259628653526306, |
| "eval_runtime": 124.5775, |
| "eval_samples_per_second": 67.669, |
| "eval_steps_per_second": 33.834, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.52, |
| "grad_norm": 1.234375, |
| "learning_rate": 9.238651127006462e-06, |
| "loss": 0.9962, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.52, |
| "eval_loss": 0.9251705408096313, |
| "eval_runtime": 124.4301, |
| "eval_samples_per_second": 67.749, |
| "eval_steps_per_second": 33.874, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.6, |
| "grad_norm": 1.2109375, |
| "learning_rate": 8.48053946126157e-06, |
| "loss": 0.9907, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.6, |
| "eval_loss": 0.924351155757904, |
| "eval_runtime": 124.2821, |
| "eval_samples_per_second": 67.83, |
| "eval_steps_per_second": 33.915, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.69, |
| "grad_norm": 1.3046875, |
| "learning_rate": 7.712067798282222e-06, |
| "loss": 1.0003, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.69, |
| "eval_loss": 0.9239010810852051, |
| "eval_runtime": 123.3108, |
| "eval_samples_per_second": 68.364, |
| "eval_steps_per_second": 34.182, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.77, |
| "grad_norm": 1.3046875, |
| "learning_rate": 6.941355509718164e-06, |
| "loss": 0.9976, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.77, |
| "eval_loss": 0.9232047200202942, |
| "eval_runtime": 124.3131, |
| "eval_samples_per_second": 67.813, |
| "eval_steps_per_second": 33.906, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.86, |
| "grad_norm": 1.21875, |
| "learning_rate": 6.176545640794535e-06, |
| "loss": 0.9896, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.86, |
| "eval_loss": 0.923151969909668, |
| "eval_runtime": 123.8739, |
| "eval_samples_per_second": 68.053, |
| "eval_steps_per_second": 34.027, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.94, |
| "grad_norm": 1.203125, |
| "learning_rate": 5.4257188740743086e-06, |
| "loss": 0.9954, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.94, |
| "eval_loss": 0.9228904247283936, |
| "eval_runtime": 124.4202, |
| "eval_samples_per_second": 67.754, |
| "eval_steps_per_second": 33.877, |
| "step": 2300 |
| }, |
| { |
| "epoch": 2.02, |
| "grad_norm": 1.2421875, |
| "learning_rate": 4.696808152120318e-06, |
| "loss": 0.9982, |
| "step": 2400 |
| }, |
| { |
| "epoch": 2.02, |
| "eval_loss": 0.9227039813995361, |
| "eval_runtime": 124.3531, |
| "eval_samples_per_second": 67.791, |
| "eval_steps_per_second": 33.895, |
| "step": 2400 |
| }, |
| { |
| "epoch": 2.11, |
| "grad_norm": 1.2421875, |
| "learning_rate": 3.997514861120414e-06, |
| "loss": 0.9957, |
| "step": 2500 |
| }, |
| { |
| "epoch": 2.11, |
| "eval_loss": 0.9224779605865479, |
| "eval_runtime": 123.9488, |
| "eval_samples_per_second": 68.012, |
| "eval_steps_per_second": 34.006, |
| "step": 2500 |
| }, |
| { |
| "epoch": 2.19, |
| "grad_norm": 1.1875, |
| "learning_rate": 3.335227461046941e-06, |
| "loss": 0.9883, |
| "step": 2600 |
| }, |
| { |
| "epoch": 2.19, |
| "eval_loss": 0.922317624092102, |
| "eval_runtime": 124.5434, |
| "eval_samples_per_second": 67.687, |
| "eval_steps_per_second": 33.844, |
| "step": 2600 |
| }, |
| { |
| "epoch": 2.28, |
| "grad_norm": 1.2890625, |
| "learning_rate": 2.7169434220724335e-06, |
| "loss": 0.9849, |
| "step": 2700 |
| }, |
| { |
| "epoch": 2.28, |
| "eval_loss": 0.9223732352256775, |
| "eval_runtime": 124.8267, |
| "eval_samples_per_second": 67.534, |
| "eval_steps_per_second": 33.767, |
| "step": 2700 |
| }, |
| { |
| "epoch": 2.36, |
| "grad_norm": 1.2109375, |
| "learning_rate": 2.14919529203096e-06, |
| "loss": 0.9974, |
| "step": 2800 |
| }, |
| { |
| "epoch": 2.36, |
| "eval_loss": 0.9222919940948486, |
| "eval_runtime": 124.9102, |
| "eval_samples_per_second": 67.489, |
| "eval_steps_per_second": 33.744, |
| "step": 2800 |
| }, |
| { |
| "epoch": 2.45, |
| "grad_norm": 1.28125, |
| "learning_rate": 1.6379816760674141e-06, |
| "loss": 0.9854, |
| "step": 2900 |
| }, |
| { |
| "epoch": 2.45, |
| "eval_loss": 0.9222748279571533, |
| "eval_runtime": 124.029, |
| "eval_samples_per_second": 67.968, |
| "eval_steps_per_second": 33.984, |
| "step": 2900 |
| }, |
| { |
| "epoch": 2.53, |
| "grad_norm": 1.234375, |
| "learning_rate": 1.1887038577168646e-06, |
| "loss": 0.9831, |
| "step": 3000 |
| }, |
| { |
| "epoch": 2.53, |
| "eval_loss": 0.922382652759552, |
| "eval_runtime": 123.9646, |
| "eval_samples_per_second": 68.003, |
| "eval_steps_per_second": 34.002, |
| "step": 3000 |
| }, |
| { |
| "epoch": 2.61, |
| "grad_norm": 1.2265625, |
| "learning_rate": 8.061087310508917e-07, |
| "loss": 0.9958, |
| "step": 3100 |
| }, |
| { |
| "epoch": 2.61, |
| "eval_loss": 0.9223530292510986, |
| "eval_runtime": 124.2043, |
| "eval_samples_per_second": 67.872, |
| "eval_steps_per_second": 33.936, |
| "step": 3100 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 3555, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 100, |
| "total_flos": 2.106483852705915e+17, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|