| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.8, |
| "eval_steps": 500, |
| "global_step": 1000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 46.553125, |
| "epoch": 0.008, |
| "grad_norm": 0.05134107172489166, |
| "kl": 0.012939453125, |
| "learning_rate": 9.95e-07, |
| "loss": 0.0001, |
| "reward": 2.703125, |
| "reward_std": 0.11205126643180847, |
| "rewards/accuracy_reward": 1.7125, |
| "rewards/format_reward": 0.990625, |
| "step": 10 |
| }, |
| { |
| "completion_length": 49.34375, |
| "epoch": 0.016, |
| "grad_norm": 0.06966069340705872, |
| "kl": 0.01898193359375, |
| "learning_rate": 9.9e-07, |
| "loss": 0.0002, |
| "reward": 2.775, |
| "reward_std": 0.05, |
| "rewards/accuracy_reward": 1.78125, |
| "rewards/format_reward": 0.99375, |
| "step": 20 |
| }, |
| { |
| "completion_length": 44.63125, |
| "epoch": 0.024, |
| "grad_norm": 5.11226749420166, |
| "kl": 0.0212158203125, |
| "learning_rate": 9.849999999999999e-07, |
| "loss": 0.0002, |
| "reward": 2.546875, |
| "reward_std": 0.09568375647068024, |
| "rewards/accuracy_reward": 1.55625, |
| "rewards/format_reward": 0.990625, |
| "step": 30 |
| }, |
| { |
| "completion_length": 42.775, |
| "epoch": 0.032, |
| "grad_norm": 0.0820818841457367, |
| "kl": 0.042626953125, |
| "learning_rate": 9.8e-07, |
| "loss": 0.0004, |
| "reward": 2.775, |
| "reward_std": 0.03943375647068024, |
| "rewards/accuracy_reward": 1.775, |
| "rewards/format_reward": 1.0, |
| "step": 40 |
| }, |
| { |
| "completion_length": 44.275, |
| "epoch": 0.04, |
| "grad_norm": 0.06030546873807907, |
| "kl": 0.03828125, |
| "learning_rate": 9.75e-07, |
| "loss": 0.0004, |
| "reward": 2.74375, |
| "reward_std": 0.026933756470680238, |
| "rewards/accuracy_reward": 1.74375, |
| "rewards/format_reward": 1.0, |
| "step": 50 |
| }, |
| { |
| "completion_length": 50.990625, |
| "epoch": 0.048, |
| "grad_norm": 0.10005596280097961, |
| "kl": 0.03060302734375, |
| "learning_rate": 9.7e-07, |
| "loss": 0.0003, |
| "reward": 2.60625, |
| "reward_std": 0.10193375647068023, |
| "rewards/accuracy_reward": 1.60625, |
| "rewards/format_reward": 1.0, |
| "step": 60 |
| }, |
| { |
| "completion_length": 54.375, |
| "epoch": 0.056, |
| "grad_norm": 4.453707695007324, |
| "kl": 0.0556640625, |
| "learning_rate": 9.649999999999999e-07, |
| "loss": 0.0006, |
| "reward": 2.590625, |
| "reward_std": 0.08318375647068024, |
| "rewards/accuracy_reward": 1.59375, |
| "rewards/format_reward": 0.996875, |
| "step": 70 |
| }, |
| { |
| "completion_length": 48.815625, |
| "epoch": 0.064, |
| "grad_norm": 2.5629329681396484, |
| "kl": 0.040283203125, |
| "learning_rate": 9.6e-07, |
| "loss": 0.0004, |
| "reward": 2.765625, |
| "reward_std": 0.058183756470680234, |
| "rewards/accuracy_reward": 1.76875, |
| "rewards/format_reward": 0.996875, |
| "step": 80 |
| }, |
| { |
| "completion_length": 47.53125, |
| "epoch": 0.072, |
| "grad_norm": 0.08292120695114136, |
| "kl": 0.0712646484375, |
| "learning_rate": 9.55e-07, |
| "loss": 0.0007, |
| "reward": 2.825, |
| "reward_std": 0.05, |
| "rewards/accuracy_reward": 1.825, |
| "rewards/format_reward": 1.0, |
| "step": 90 |
| }, |
| { |
| "completion_length": 46.703125, |
| "epoch": 0.08, |
| "grad_norm": 2.7465286254882812, |
| "kl": 0.05367431640625, |
| "learning_rate": 9.499999999999999e-07, |
| "loss": 0.0005, |
| "reward": 2.71875, |
| "reward_std": 0.07693375647068024, |
| "rewards/accuracy_reward": 1.71875, |
| "rewards/format_reward": 1.0, |
| "step": 100 |
| }, |
| { |
| "completion_length": 46.225, |
| "epoch": 0.088, |
| "grad_norm": 2.1839213371276855, |
| "kl": 0.0655517578125, |
| "learning_rate": 9.45e-07, |
| "loss": 0.0007, |
| "reward": 2.609375, |
| "reward_std": 0.03125, |
| "rewards/accuracy_reward": 1.61875, |
| "rewards/format_reward": 0.990625, |
| "step": 110 |
| }, |
| { |
| "completion_length": 44.096875, |
| "epoch": 0.096, |
| "grad_norm": 0.07181887328624725, |
| "kl": 0.06865234375, |
| "learning_rate": 9.399999999999999e-07, |
| "loss": 0.0007, |
| "reward": 2.71875, |
| "reward_std": 0.0125, |
| "rewards/accuracy_reward": 1.71875, |
| "rewards/format_reward": 1.0, |
| "step": 120 |
| }, |
| { |
| "completion_length": 44.590625, |
| "epoch": 0.104, |
| "grad_norm": 0.09902142733335495, |
| "kl": 0.0936767578125, |
| "learning_rate": 9.35e-07, |
| "loss": 0.0009, |
| "reward": 2.56875, |
| "reward_std": 0.0625, |
| "rewards/accuracy_reward": 1.575, |
| "rewards/format_reward": 0.99375, |
| "step": 130 |
| }, |
| { |
| "completion_length": 43.81875, |
| "epoch": 0.112, |
| "grad_norm": 2.340815305709839, |
| "kl": 0.066015625, |
| "learning_rate": 9.3e-07, |
| "loss": 0.0007, |
| "reward": 2.75, |
| "reward_std": 0.025, |
| "rewards/accuracy_reward": 1.75, |
| "rewards/format_reward": 1.0, |
| "step": 140 |
| }, |
| { |
| "completion_length": 49.06875, |
| "epoch": 0.12, |
| "grad_norm": 2.58245849609375, |
| "kl": 0.0600341796875, |
| "learning_rate": 9.25e-07, |
| "loss": 0.0006, |
| "reward": 2.7125, |
| "reward_std": 0.125, |
| "rewards/accuracy_reward": 1.7125, |
| "rewards/format_reward": 1.0, |
| "step": 150 |
| }, |
| { |
| "completion_length": 52.84375, |
| "epoch": 0.128, |
| "grad_norm": 0.06839890778064728, |
| "kl": 0.0785400390625, |
| "learning_rate": 9.2e-07, |
| "loss": 0.0008, |
| "reward": 2.7, |
| "reward_std": 0.03943375647068024, |
| "rewards/accuracy_reward": 1.7, |
| "rewards/format_reward": 1.0, |
| "step": 160 |
| }, |
| { |
| "completion_length": 47.7, |
| "epoch": 0.136, |
| "grad_norm": 0.11428700387477875, |
| "kl": 0.06865234375, |
| "learning_rate": 9.15e-07, |
| "loss": 0.0007, |
| "reward": 2.75, |
| "reward_std": 0.025, |
| "rewards/accuracy_reward": 1.75, |
| "rewards/format_reward": 1.0, |
| "step": 170 |
| }, |
| { |
| "completion_length": 43.478125, |
| "epoch": 0.144, |
| "grad_norm": 2.188392400741577, |
| "kl": 0.062451171875, |
| "learning_rate": 9.1e-07, |
| "loss": 0.0006, |
| "reward": 2.615625, |
| "reward_std": 0.06875, |
| "rewards/accuracy_reward": 1.61875, |
| "rewards/format_reward": 0.996875, |
| "step": 180 |
| }, |
| { |
| "completion_length": 42.540625, |
| "epoch": 0.152, |
| "grad_norm": 3.399991512298584, |
| "kl": 0.076953125, |
| "learning_rate": 9.05e-07, |
| "loss": 0.0008, |
| "reward": 2.64375, |
| "reward_std": 0.09136751294136047, |
| "rewards/accuracy_reward": 1.64375, |
| "rewards/format_reward": 1.0, |
| "step": 190 |
| }, |
| { |
| "completion_length": 50.21875, |
| "epoch": 0.16, |
| "grad_norm": 0.10214658826589584, |
| "kl": 0.09365234375, |
| "learning_rate": 9e-07, |
| "loss": 0.0009, |
| "reward": 2.784375, |
| "reward_std": 0.05625, |
| "rewards/accuracy_reward": 1.7875, |
| "rewards/format_reward": 0.996875, |
| "step": 200 |
| }, |
| { |
| "completion_length": 54.35, |
| "epoch": 0.168, |
| "grad_norm": 0.08639144152402878, |
| "kl": 0.1749267578125, |
| "learning_rate": 8.95e-07, |
| "loss": 0.0017, |
| "reward": 2.7875, |
| "reward_std": 0.014433756470680237, |
| "rewards/accuracy_reward": 1.7875, |
| "rewards/format_reward": 1.0, |
| "step": 210 |
| }, |
| { |
| "completion_length": 56.29375, |
| "epoch": 0.176, |
| "grad_norm": 0.06954076141119003, |
| "kl": 0.119091796875, |
| "learning_rate": 8.9e-07, |
| "loss": 0.0012, |
| "reward": 2.75, |
| "reward_std": 0.025, |
| "rewards/accuracy_reward": 1.75625, |
| "rewards/format_reward": 0.99375, |
| "step": 220 |
| }, |
| { |
| "completion_length": 48.8875, |
| "epoch": 0.184, |
| "grad_norm": 0.06490013003349304, |
| "kl": 0.12080078125, |
| "learning_rate": 8.85e-07, |
| "loss": 0.0012, |
| "reward": 2.65, |
| "reward_std": 0.07886751294136048, |
| "rewards/accuracy_reward": 1.65, |
| "rewards/format_reward": 1.0, |
| "step": 230 |
| }, |
| { |
| "completion_length": 41.115625, |
| "epoch": 0.192, |
| "grad_norm": 0.12679292261600494, |
| "kl": 0.12470703125, |
| "learning_rate": 8.799999999999999e-07, |
| "loss": 0.0012, |
| "reward": 2.8125, |
| "reward_std": 0.07886751294136048, |
| "rewards/accuracy_reward": 1.81875, |
| "rewards/format_reward": 0.99375, |
| "step": 240 |
| }, |
| { |
| "completion_length": 40.4125, |
| "epoch": 0.2, |
| "grad_norm": 0.11438746750354767, |
| "kl": 30.11142578125, |
| "learning_rate": 8.75e-07, |
| "loss": 0.3012, |
| "reward": 2.725, |
| "reward_std": 0.05, |
| "rewards/accuracy_reward": 1.73125, |
| "rewards/format_reward": 0.99375, |
| "step": 250 |
| }, |
| { |
| "completion_length": 46.790625, |
| "epoch": 0.208, |
| "grad_norm": 2.282456159591675, |
| "kl": 0.10205078125, |
| "learning_rate": 8.699999999999999e-07, |
| "loss": 0.001, |
| "reward": 2.578125, |
| "reward_std": 0.06875, |
| "rewards/accuracy_reward": 1.58125, |
| "rewards/format_reward": 0.996875, |
| "step": 260 |
| }, |
| { |
| "completion_length": 52.253125, |
| "epoch": 0.216, |
| "grad_norm": 1.9098315238952637, |
| "kl": 0.105810546875, |
| "learning_rate": 8.65e-07, |
| "loss": 0.0011, |
| "reward": 2.8, |
| "reward_std": 0.07886751294136048, |
| "rewards/accuracy_reward": 1.8, |
| "rewards/format_reward": 1.0, |
| "step": 270 |
| }, |
| { |
| "completion_length": 49.828125, |
| "epoch": 0.224, |
| "grad_norm": 0.058336157351732254, |
| "kl": 0.0714599609375, |
| "learning_rate": 8.599999999999999e-07, |
| "loss": 0.0007, |
| "reward": 2.73125, |
| "reward_std": 0.0375, |
| "rewards/accuracy_reward": 1.73125, |
| "rewards/format_reward": 1.0, |
| "step": 280 |
| }, |
| { |
| "completion_length": 47.14375, |
| "epoch": 0.232, |
| "grad_norm": 0.07711385935544968, |
| "kl": 0.08037109375, |
| "learning_rate": 8.55e-07, |
| "loss": 0.0008, |
| "reward": 2.875, |
| "reward_std": 0.03943375647068024, |
| "rewards/accuracy_reward": 1.875, |
| "rewards/format_reward": 1.0, |
| "step": 290 |
| }, |
| { |
| "completion_length": 46.759375, |
| "epoch": 0.24, |
| "grad_norm": 0.059466563165187836, |
| "kl": 0.079248046875, |
| "learning_rate": 8.499999999999999e-07, |
| "loss": 0.0008, |
| "reward": 2.70625, |
| "reward_std": 0.051933756470680235, |
| "rewards/accuracy_reward": 1.70625, |
| "rewards/format_reward": 1.0, |
| "step": 300 |
| }, |
| { |
| "completion_length": 48.540625, |
| "epoch": 0.248, |
| "grad_norm": 3.2264294624328613, |
| "kl": 0.0768310546875, |
| "learning_rate": 8.45e-07, |
| "loss": 0.0008, |
| "reward": 2.7375, |
| "reward_std": 0.075, |
| "rewards/accuracy_reward": 1.7375, |
| "rewards/format_reward": 1.0, |
| "step": 310 |
| }, |
| { |
| "completion_length": 46.85, |
| "epoch": 0.256, |
| "grad_norm": 0.08373435586690903, |
| "kl": 0.088037109375, |
| "learning_rate": 8.399999999999999e-07, |
| "loss": 0.0009, |
| "reward": 2.728125, |
| "reward_std": 0.08318375647068024, |
| "rewards/accuracy_reward": 1.73125, |
| "rewards/format_reward": 0.996875, |
| "step": 320 |
| }, |
| { |
| "completion_length": 45.0125, |
| "epoch": 0.264, |
| "grad_norm": 0.08248328417539597, |
| "kl": 0.084375, |
| "learning_rate": 8.349999999999999e-07, |
| "loss": 0.0008, |
| "reward": 2.684375, |
| "reward_std": 0.04568375647068024, |
| "rewards/accuracy_reward": 1.6875, |
| "rewards/format_reward": 0.996875, |
| "step": 330 |
| }, |
| { |
| "completion_length": 47.1, |
| "epoch": 0.272, |
| "grad_norm": 0.08357389271259308, |
| "kl": 0.07880859375, |
| "learning_rate": 8.299999999999999e-07, |
| "loss": 0.0008, |
| "reward": 2.628125, |
| "reward_std": 0.03318375647068024, |
| "rewards/accuracy_reward": 1.63125, |
| "rewards/format_reward": 0.996875, |
| "step": 340 |
| }, |
| { |
| "completion_length": 48.95625, |
| "epoch": 0.28, |
| "grad_norm": 1.7901896238327026, |
| "kl": 0.084033203125, |
| "learning_rate": 8.249999999999999e-07, |
| "loss": 0.0008, |
| "reward": 2.609375, |
| "reward_std": 0.03125, |
| "rewards/accuracy_reward": 1.6125, |
| "rewards/format_reward": 0.996875, |
| "step": 350 |
| }, |
| { |
| "completion_length": 46.15, |
| "epoch": 0.288, |
| "grad_norm": 0.07559721171855927, |
| "kl": 0.14404296875, |
| "learning_rate": 8.199999999999999e-07, |
| "loss": 0.0014, |
| "reward": 2.778125, |
| "reward_std": 0.04375, |
| "rewards/accuracy_reward": 1.78125, |
| "rewards/format_reward": 0.996875, |
| "step": 360 |
| }, |
| { |
| "completion_length": 43.8375, |
| "epoch": 0.296, |
| "grad_norm": 3.8727450370788574, |
| "kl": 0.109521484375, |
| "learning_rate": 8.149999999999999e-07, |
| "loss": 0.0011, |
| "reward": 2.83125, |
| "reward_std": 0.0375, |
| "rewards/accuracy_reward": 1.83125, |
| "rewards/format_reward": 1.0, |
| "step": 370 |
| }, |
| { |
| "completion_length": 45.9625, |
| "epoch": 0.304, |
| "grad_norm": 0.05233932286500931, |
| "kl": 0.0930908203125, |
| "learning_rate": 8.1e-07, |
| "loss": 0.0009, |
| "reward": 2.796875, |
| "reward_std": 0.00625, |
| "rewards/accuracy_reward": 1.8, |
| "rewards/format_reward": 0.996875, |
| "step": 380 |
| }, |
| { |
| "completion_length": 49.55, |
| "epoch": 0.312, |
| "grad_norm": 4.457919120788574, |
| "kl": 0.0723876953125, |
| "learning_rate": 8.05e-07, |
| "loss": 0.0007, |
| "reward": 2.75, |
| "reward_std": 0.053867512941360475, |
| "rewards/accuracy_reward": 1.75, |
| "rewards/format_reward": 1.0, |
| "step": 390 |
| }, |
| { |
| "completion_length": 50.909375, |
| "epoch": 0.32, |
| "grad_norm": 0.050397127866744995, |
| "kl": 0.08388671875, |
| "learning_rate": 8e-07, |
| "loss": 0.0008, |
| "reward": 2.7625, |
| "reward_std": 0.025, |
| "rewards/accuracy_reward": 1.7625, |
| "rewards/format_reward": 1.0, |
| "step": 400 |
| }, |
| { |
| "completion_length": 49.165625, |
| "epoch": 0.328, |
| "grad_norm": 0.1388678401708603, |
| "kl": 0.084033203125, |
| "learning_rate": 7.95e-07, |
| "loss": 0.0008, |
| "reward": 2.6875, |
| "reward_std": 0.014433756470680237, |
| "rewards/accuracy_reward": 1.6875, |
| "rewards/format_reward": 1.0, |
| "step": 410 |
| }, |
| { |
| "completion_length": 48.190625, |
| "epoch": 0.336, |
| "grad_norm": 2.034395933151245, |
| "kl": 0.078125, |
| "learning_rate": 7.9e-07, |
| "loss": 0.0008, |
| "reward": 2.76875, |
| "reward_std": 0.0375, |
| "rewards/accuracy_reward": 1.76875, |
| "rewards/format_reward": 1.0, |
| "step": 420 |
| }, |
| { |
| "completion_length": 49.45, |
| "epoch": 0.344, |
| "grad_norm": 2.2621846199035645, |
| "kl": 0.075048828125, |
| "learning_rate": 7.85e-07, |
| "loss": 0.0008, |
| "reward": 2.634375, |
| "reward_std": 0.03125, |
| "rewards/accuracy_reward": 1.6375, |
| "rewards/format_reward": 0.996875, |
| "step": 430 |
| }, |
| { |
| "completion_length": 52.03125, |
| "epoch": 0.352, |
| "grad_norm": 2.9660024642944336, |
| "kl": 0.0776123046875, |
| "learning_rate": 7.799999999999999e-07, |
| "loss": 0.0008, |
| "reward": 2.7625, |
| "reward_std": 0.03943375647068024, |
| "rewards/accuracy_reward": 1.7625, |
| "rewards/format_reward": 1.0, |
| "step": 440 |
| }, |
| { |
| "completion_length": 52.496875, |
| "epoch": 0.36, |
| "grad_norm": 0.040182050317525864, |
| "kl": 0.0726806640625, |
| "learning_rate": 7.75e-07, |
| "loss": 0.0007, |
| "reward": 2.6875, |
| "reward_std": 0.025, |
| "rewards/accuracy_reward": 1.6875, |
| "rewards/format_reward": 1.0, |
| "step": 450 |
| }, |
| { |
| "completion_length": 51.725, |
| "epoch": 0.368, |
| "grad_norm": 0.06841447949409485, |
| "kl": 0.0802001953125, |
| "learning_rate": 7.699999999999999e-07, |
| "loss": 0.0008, |
| "reward": 2.8, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.8, |
| "rewards/format_reward": 1.0, |
| "step": 460 |
| }, |
| { |
| "completion_length": 48.14375, |
| "epoch": 0.376, |
| "grad_norm": 0.04733005911111832, |
| "kl": 0.0659912109375, |
| "learning_rate": 7.65e-07, |
| "loss": 0.0007, |
| "reward": 2.61875, |
| "reward_std": 0.0125, |
| "rewards/accuracy_reward": 1.61875, |
| "rewards/format_reward": 1.0, |
| "step": 470 |
| }, |
| { |
| "completion_length": 46.89375, |
| "epoch": 0.384, |
| "grad_norm": 2.7484917640686035, |
| "kl": 0.0697998046875, |
| "learning_rate": 7.599999999999999e-07, |
| "loss": 0.0007, |
| "reward": 2.74375, |
| "reward_std": 0.09136751294136047, |
| "rewards/accuracy_reward": 1.74375, |
| "rewards/format_reward": 1.0, |
| "step": 480 |
| }, |
| { |
| "completion_length": 48.48125, |
| "epoch": 0.392, |
| "grad_norm": 1.7968782186508179, |
| "kl": 0.0580078125, |
| "learning_rate": 7.55e-07, |
| "loss": 0.0006, |
| "reward": 2.7125, |
| "reward_std": 0.025, |
| "rewards/accuracy_reward": 1.7125, |
| "rewards/format_reward": 1.0, |
| "step": 490 |
| }, |
| { |
| "completion_length": 50.034375, |
| "epoch": 0.4, |
| "grad_norm": 0.08426347374916077, |
| "kl": 0.077099609375, |
| "learning_rate": 7.5e-07, |
| "loss": 0.0008, |
| "reward": 2.68125, |
| "reward_std": 0.04136751294136047, |
| "rewards/accuracy_reward": 1.68125, |
| "rewards/format_reward": 1.0, |
| "step": 500 |
| }, |
| { |
| "completion_length": 51.378125, |
| "epoch": 0.408, |
| "grad_norm": 0.040815118700265884, |
| "kl": 0.06416015625, |
| "learning_rate": 7.45e-07, |
| "loss": 0.0006, |
| "reward": 2.73125, |
| "reward_std": 0.026933756470680238, |
| "rewards/accuracy_reward": 1.73125, |
| "rewards/format_reward": 1.0, |
| "step": 510 |
| }, |
| { |
| "completion_length": 49.878125, |
| "epoch": 0.416, |
| "grad_norm": 0.06027600169181824, |
| "kl": 0.0675537109375, |
| "learning_rate": 7.4e-07, |
| "loss": 0.0007, |
| "reward": 2.671875, |
| "reward_std": 0.00625, |
| "rewards/accuracy_reward": 1.675, |
| "rewards/format_reward": 0.996875, |
| "step": 520 |
| }, |
| { |
| "completion_length": 47.921875, |
| "epoch": 0.424, |
| "grad_norm": 0.06604389101266861, |
| "kl": 0.07177734375, |
| "learning_rate": 7.35e-07, |
| "loss": 0.0007, |
| "reward": 2.675, |
| "reward_std": 0.08943375647068023, |
| "rewards/accuracy_reward": 1.68125, |
| "rewards/format_reward": 0.99375, |
| "step": 530 |
| }, |
| { |
| "completion_length": 41.890625, |
| "epoch": 0.432, |
| "grad_norm": 2.579275608062744, |
| "kl": 0.080859375, |
| "learning_rate": 7.3e-07, |
| "loss": 0.0008, |
| "reward": 2.75, |
| "reward_std": 0.03943375647068024, |
| "rewards/accuracy_reward": 1.75, |
| "rewards/format_reward": 1.0, |
| "step": 540 |
| }, |
| { |
| "completion_length": 44.046875, |
| "epoch": 0.44, |
| "grad_norm": 0.04179125651717186, |
| "kl": 0.076025390625, |
| "learning_rate": 7.249999999999999e-07, |
| "loss": 0.0008, |
| "reward": 2.5375, |
| "reward_std": 0.025, |
| "rewards/accuracy_reward": 1.5375, |
| "rewards/format_reward": 1.0, |
| "step": 550 |
| }, |
| { |
| "completion_length": 46.725, |
| "epoch": 0.448, |
| "grad_norm": 0.04865502566099167, |
| "kl": 0.075830078125, |
| "learning_rate": 7.2e-07, |
| "loss": 0.0008, |
| "reward": 2.66875, |
| "reward_std": 0.0125, |
| "rewards/accuracy_reward": 1.66875, |
| "rewards/format_reward": 1.0, |
| "step": 560 |
| }, |
| { |
| "completion_length": 48.0875, |
| "epoch": 0.456, |
| "grad_norm": 0.1781499981880188, |
| "kl": 92.47451171875, |
| "learning_rate": 7.149999999999999e-07, |
| "loss": 0.9243, |
| "reward": 2.8, |
| "reward_std": 0.025, |
| "rewards/accuracy_reward": 1.8, |
| "rewards/format_reward": 1.0, |
| "step": 570 |
| }, |
| { |
| "completion_length": 49.703125, |
| "epoch": 0.464, |
| "grad_norm": 0.05255131423473358, |
| "kl": 0.0656982421875, |
| "learning_rate": 7.1e-07, |
| "loss": 0.0007, |
| "reward": 2.6625, |
| "reward_std": 0.025, |
| "rewards/accuracy_reward": 1.6625, |
| "rewards/format_reward": 1.0, |
| "step": 580 |
| }, |
| { |
| "completion_length": 52.06875, |
| "epoch": 0.472, |
| "grad_norm": 0.1266418695449829, |
| "kl": 0.0781005859375, |
| "learning_rate": 7.049999999999999e-07, |
| "loss": 0.0008, |
| "reward": 2.75, |
| "reward_std": 0.025, |
| "rewards/accuracy_reward": 1.75, |
| "rewards/format_reward": 1.0, |
| "step": 590 |
| }, |
| { |
| "completion_length": 53.475, |
| "epoch": 0.48, |
| "grad_norm": 0.07561592757701874, |
| "kl": 0.0699951171875, |
| "learning_rate": 7e-07, |
| "loss": 0.0007, |
| "reward": 2.6875, |
| "reward_std": 0.053867512941360475, |
| "rewards/accuracy_reward": 1.6875, |
| "rewards/format_reward": 1.0, |
| "step": 600 |
| }, |
| { |
| "completion_length": 52.0625, |
| "epoch": 0.488, |
| "grad_norm": 0.04883831739425659, |
| "kl": 0.0799560546875, |
| "learning_rate": 6.949999999999999e-07, |
| "loss": 0.0008, |
| "reward": 2.65625, |
| "reward_std": 0.0125, |
| "rewards/accuracy_reward": 1.65625, |
| "rewards/format_reward": 1.0, |
| "step": 610 |
| }, |
| { |
| "completion_length": 49.54375, |
| "epoch": 0.496, |
| "grad_norm": 2.3243064880371094, |
| "kl": 0.0752685546875, |
| "learning_rate": 6.9e-07, |
| "loss": 0.0008, |
| "reward": 2.815625, |
| "reward_std": 0.058183756470680234, |
| "rewards/accuracy_reward": 1.81875, |
| "rewards/format_reward": 0.996875, |
| "step": 620 |
| }, |
| { |
| "completion_length": 48.690625, |
| "epoch": 0.504, |
| "grad_norm": 0.06750122457742691, |
| "kl": 0.06513671875, |
| "learning_rate": 6.85e-07, |
| "loss": 0.0007, |
| "reward": 2.84375, |
| "reward_std": 0.0375, |
| "rewards/accuracy_reward": 1.84375, |
| "rewards/format_reward": 1.0, |
| "step": 630 |
| }, |
| { |
| "completion_length": 49.271875, |
| "epoch": 0.512, |
| "grad_norm": 0.056099992245435715, |
| "kl": 0.0666259765625, |
| "learning_rate": 6.800000000000001e-07, |
| "loss": 0.0007, |
| "reward": 2.69375, |
| "reward_std": 0.0125, |
| "rewards/accuracy_reward": 1.69375, |
| "rewards/format_reward": 1.0, |
| "step": 640 |
| }, |
| { |
| "completion_length": 46.4375, |
| "epoch": 0.52, |
| "grad_norm": 0.0455087348818779, |
| "kl": 0.0546630859375, |
| "learning_rate": 6.75e-07, |
| "loss": 0.0005, |
| "reward": 2.75625, |
| "reward_std": 0.0125, |
| "rewards/accuracy_reward": 1.75625, |
| "rewards/format_reward": 1.0, |
| "step": 650 |
| }, |
| { |
| "completion_length": 46.496875, |
| "epoch": 0.528, |
| "grad_norm": 0.05418640747666359, |
| "kl": 0.0645263671875, |
| "learning_rate": 6.7e-07, |
| "loss": 0.0006, |
| "reward": 2.6875, |
| "reward_std": 0.014433756470680237, |
| "rewards/accuracy_reward": 1.6875, |
| "rewards/format_reward": 1.0, |
| "step": 660 |
| }, |
| { |
| "completion_length": 46.328125, |
| "epoch": 0.536, |
| "grad_norm": 4.0458455085754395, |
| "kl": 0.081103515625, |
| "learning_rate": 6.65e-07, |
| "loss": 0.0008, |
| "reward": 2.65625, |
| "reward_std": 0.08080126941204072, |
| "rewards/accuracy_reward": 1.65625, |
| "rewards/format_reward": 1.0, |
| "step": 670 |
| }, |
| { |
| "completion_length": 48.91875, |
| "epoch": 0.544, |
| "grad_norm": 0.04970540851354599, |
| "kl": 0.0717041015625, |
| "learning_rate": 6.6e-07, |
| "loss": 0.0007, |
| "reward": 2.7625, |
| "reward_std": 0.04330126941204071, |
| "rewards/accuracy_reward": 1.7625, |
| "rewards/format_reward": 1.0, |
| "step": 680 |
| }, |
| { |
| "completion_length": 49.01875, |
| "epoch": 0.552, |
| "grad_norm": 0.1746923178434372, |
| "kl": 0.073779296875, |
| "learning_rate": 6.55e-07, |
| "loss": 0.0007, |
| "reward": 2.75, |
| "reward_std": 0.025, |
| "rewards/accuracy_reward": 1.75, |
| "rewards/format_reward": 1.0, |
| "step": 690 |
| }, |
| { |
| "completion_length": 48.3125, |
| "epoch": 0.56, |
| "grad_norm": 0.051023293286561966, |
| "kl": 0.06783447265625, |
| "learning_rate": 6.5e-07, |
| "loss": 0.0007, |
| "reward": 2.7625, |
| "reward_std": 0.025, |
| "rewards/accuracy_reward": 1.7625, |
| "rewards/format_reward": 1.0, |
| "step": 700 |
| }, |
| { |
| "completion_length": 49.11875, |
| "epoch": 0.568, |
| "grad_norm": 0.07166194915771484, |
| "kl": 0.0619384765625, |
| "learning_rate": 6.45e-07, |
| "loss": 0.0006, |
| "reward": 2.7375, |
| "reward_std": 0.014433756470680237, |
| "rewards/accuracy_reward": 1.7375, |
| "rewards/format_reward": 1.0, |
| "step": 710 |
| }, |
| { |
| "completion_length": 51.103125, |
| "epoch": 0.576, |
| "grad_norm": 0.08520376682281494, |
| "kl": 0.0830078125, |
| "learning_rate": 6.4e-07, |
| "loss": 0.0008, |
| "reward": 2.7375, |
| "reward_std": 0.014433756470680237, |
| "rewards/accuracy_reward": 1.7375, |
| "rewards/format_reward": 1.0, |
| "step": 720 |
| }, |
| { |
| "completion_length": 49.615625, |
| "epoch": 0.584, |
| "grad_norm": 0.10399647802114487, |
| "kl": 0.0688232421875, |
| "learning_rate": 6.35e-07, |
| "loss": 0.0007, |
| "reward": 2.69375, |
| "reward_std": 0.0125, |
| "rewards/accuracy_reward": 1.69375, |
| "rewards/format_reward": 1.0, |
| "step": 730 |
| }, |
| { |
| "completion_length": 50.596875, |
| "epoch": 0.592, |
| "grad_norm": 0.06369677186012268, |
| "kl": 0.087890625, |
| "learning_rate": 6.3e-07, |
| "loss": 0.0009, |
| "reward": 2.61875, |
| "reward_std": 0.0375, |
| "rewards/accuracy_reward": 1.61875, |
| "rewards/format_reward": 1.0, |
| "step": 740 |
| }, |
| { |
| "completion_length": 50.56875, |
| "epoch": 0.6, |
| "grad_norm": 0.07198835164308548, |
| "kl": 0.10087890625, |
| "learning_rate": 6.249999999999999e-07, |
| "loss": 0.001, |
| "reward": 2.625, |
| "reward_std": 0.06443375647068024, |
| "rewards/accuracy_reward": 1.625, |
| "rewards/format_reward": 1.0, |
| "step": 750 |
| }, |
| { |
| "completion_length": 50.953125, |
| "epoch": 0.608, |
| "grad_norm": 0.04980659857392311, |
| "kl": 0.101806640625, |
| "learning_rate": 6.2e-07, |
| "loss": 0.001, |
| "reward": 2.721875, |
| "reward_std": 0.03125, |
| "rewards/accuracy_reward": 1.725, |
| "rewards/format_reward": 0.996875, |
| "step": 760 |
| }, |
| { |
| "completion_length": 46.609375, |
| "epoch": 0.616, |
| "grad_norm": 2.673631191253662, |
| "kl": 0.0730224609375, |
| "learning_rate": 6.149999999999999e-07, |
| "loss": 0.0007, |
| "reward": 2.6875, |
| "reward_std": 0.03943375647068024, |
| "rewards/accuracy_reward": 1.6875, |
| "rewards/format_reward": 1.0, |
| "step": 770 |
| }, |
| { |
| "completion_length": 46.16875, |
| "epoch": 0.624, |
| "grad_norm": 0.07191024720668793, |
| "kl": 0.07197265625, |
| "learning_rate": 6.1e-07, |
| "loss": 0.0007, |
| "reward": 2.6125, |
| "reward_std": 0.03943375647068024, |
| "rewards/accuracy_reward": 1.6125, |
| "rewards/format_reward": 1.0, |
| "step": 780 |
| }, |
| { |
| "completion_length": 47.346875, |
| "epoch": 0.632, |
| "grad_norm": 0.31487828493118286, |
| "kl": 0.0890625, |
| "learning_rate": 6.049999999999999e-07, |
| "loss": 0.0009, |
| "reward": 2.7, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.7, |
| "rewards/format_reward": 1.0, |
| "step": 790 |
| }, |
| { |
| "completion_length": 48.5125, |
| "epoch": 0.64, |
| "grad_norm": 0.04281134530901909, |
| "kl": 0.0651611328125, |
| "learning_rate": 6e-07, |
| "loss": 0.0007, |
| "reward": 2.65, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.65, |
| "rewards/format_reward": 1.0, |
| "step": 800 |
| }, |
| { |
| "completion_length": 47.9625, |
| "epoch": 0.648, |
| "grad_norm": 1.7782899141311646, |
| "kl": 0.0711669921875, |
| "learning_rate": 5.949999999999999e-07, |
| "loss": 0.0007, |
| "reward": 2.634375, |
| "reward_std": 0.04568375647068024, |
| "rewards/accuracy_reward": 1.6375, |
| "rewards/format_reward": 0.996875, |
| "step": 810 |
| }, |
| { |
| "completion_length": 47.7, |
| "epoch": 0.656, |
| "grad_norm": 0.9939271211624146, |
| "kl": 0.07099609375, |
| "learning_rate": 5.9e-07, |
| "loss": 0.0007, |
| "reward": 2.6625, |
| "reward_std": 0.025, |
| "rewards/accuracy_reward": 1.6625, |
| "rewards/format_reward": 1.0, |
| "step": 820 |
| }, |
| { |
| "completion_length": 45.790625, |
| "epoch": 0.664, |
| "grad_norm": 0.05890406668186188, |
| "kl": 0.0596923828125, |
| "learning_rate": 5.849999999999999e-07, |
| "loss": 0.0006, |
| "reward": 2.696875, |
| "reward_std": 0.00625, |
| "rewards/accuracy_reward": 1.7, |
| "rewards/format_reward": 0.996875, |
| "step": 830 |
| }, |
| { |
| "completion_length": 46.675, |
| "epoch": 0.672, |
| "grad_norm": 0.062360286712646484, |
| "kl": 0.071240234375, |
| "learning_rate": 5.8e-07, |
| "loss": 0.0007, |
| "reward": 2.5875, |
| "reward_std": 0.025, |
| "rewards/accuracy_reward": 1.5875, |
| "rewards/format_reward": 1.0, |
| "step": 840 |
| }, |
| { |
| "completion_length": 47.621875, |
| "epoch": 0.68, |
| "grad_norm": 2.2732224464416504, |
| "kl": 1.6703369140625, |
| "learning_rate": 5.749999999999999e-07, |
| "loss": 0.0167, |
| "reward": 2.64375, |
| "reward_std": 0.04136751294136047, |
| "rewards/accuracy_reward": 1.64375, |
| "rewards/format_reward": 1.0, |
| "step": 850 |
| }, |
| { |
| "completion_length": 50.35, |
| "epoch": 0.688, |
| "grad_norm": 2.1026487350463867, |
| "kl": 0.09111328125, |
| "learning_rate": 5.699999999999999e-07, |
| "loss": 0.0009, |
| "reward": 2.6625, |
| "reward_std": 0.053867512941360475, |
| "rewards/accuracy_reward": 1.6625, |
| "rewards/format_reward": 1.0, |
| "step": 860 |
| }, |
| { |
| "completion_length": 52.321875, |
| "epoch": 0.696, |
| "grad_norm": 3.0173561573028564, |
| "kl": 321.521728515625, |
| "learning_rate": 5.649999999999999e-07, |
| "loss": 3.2171, |
| "reward": 2.46875, |
| "reward_std": 0.04136751294136047, |
| "rewards/accuracy_reward": 1.46875, |
| "rewards/format_reward": 1.0, |
| "step": 870 |
| }, |
| { |
| "completion_length": 49.7625, |
| "epoch": 0.704, |
| "grad_norm": 0.06468257308006287, |
| "kl": 0.39306640625, |
| "learning_rate": 5.6e-07, |
| "loss": 0.0039, |
| "reward": 2.7125, |
| "reward_std": 0.06443375647068024, |
| "rewards/accuracy_reward": 1.7125, |
| "rewards/format_reward": 1.0, |
| "step": 880 |
| }, |
| { |
| "completion_length": 48.240625, |
| "epoch": 0.712, |
| "grad_norm": 0.07906866073608398, |
| "kl": 0.12939453125, |
| "learning_rate": 5.55e-07, |
| "loss": 0.0013, |
| "reward": 2.68125, |
| "reward_std": 0.026933756470680238, |
| "rewards/accuracy_reward": 1.68125, |
| "rewards/format_reward": 1.0, |
| "step": 890 |
| }, |
| { |
| "completion_length": 49.03125, |
| "epoch": 0.72, |
| "grad_norm": 0.07313551008701324, |
| "kl": 0.18505859375, |
| "learning_rate": 5.5e-07, |
| "loss": 0.0019, |
| "reward": 2.69375, |
| "reward_std": 0.026933756470680238, |
| "rewards/accuracy_reward": 1.69375, |
| "rewards/format_reward": 1.0, |
| "step": 900 |
| }, |
| { |
| "completion_length": 49.60625, |
| "epoch": 0.728, |
| "grad_norm": 4.0763630867004395, |
| "kl": 2.13671875, |
| "learning_rate": 5.45e-07, |
| "loss": 0.0213, |
| "reward": 2.753125, |
| "reward_std": 0.058183756470680234, |
| "rewards/accuracy_reward": 1.75625, |
| "rewards/format_reward": 0.996875, |
| "step": 910 |
| }, |
| { |
| "completion_length": 49.83125, |
| "epoch": 0.736, |
| "grad_norm": 4.245804786682129, |
| "kl": 0.094384765625, |
| "learning_rate": 5.4e-07, |
| "loss": 0.0009, |
| "reward": 2.75625, |
| "reward_std": 0.04136751294136047, |
| "rewards/accuracy_reward": 1.75625, |
| "rewards/format_reward": 1.0, |
| "step": 920 |
| }, |
| { |
| "completion_length": 50.996875, |
| "epoch": 0.744, |
| "grad_norm": 39.86748504638672, |
| "kl": 3.122021484375, |
| "learning_rate": 5.35e-07, |
| "loss": 0.0313, |
| "reward": 2.8125, |
| "reward_std": 0.03943375647068024, |
| "rewards/accuracy_reward": 1.8125, |
| "rewards/format_reward": 1.0, |
| "step": 930 |
| }, |
| { |
| "completion_length": 49.59375, |
| "epoch": 0.752, |
| "grad_norm": 0.05779128894209862, |
| "kl": 0.0998046875, |
| "learning_rate": 5.3e-07, |
| "loss": 0.001, |
| "reward": 2.79375, |
| "reward_std": 0.051933756470680235, |
| "rewards/accuracy_reward": 1.79375, |
| "rewards/format_reward": 1.0, |
| "step": 940 |
| }, |
| { |
| "completion_length": 47.74375, |
| "epoch": 0.76, |
| "grad_norm": 0.056427907198667526, |
| "kl": 0.445361328125, |
| "learning_rate": 5.25e-07, |
| "loss": 0.0045, |
| "reward": 2.76875, |
| "reward_std": 0.0125, |
| "rewards/accuracy_reward": 1.76875, |
| "rewards/format_reward": 1.0, |
| "step": 950 |
| }, |
| { |
| "completion_length": 48.50625, |
| "epoch": 0.768, |
| "grad_norm": 66.90420532226562, |
| "kl": 2.232958984375, |
| "learning_rate": 5.2e-07, |
| "loss": 0.0223, |
| "reward": 2.80625, |
| "reward_std": 0.0375, |
| "rewards/accuracy_reward": 1.80625, |
| "rewards/format_reward": 1.0, |
| "step": 960 |
| }, |
| { |
| "completion_length": 51.01875, |
| "epoch": 0.776, |
| "grad_norm": 0.09945037215948105, |
| "kl": 0.09228515625, |
| "learning_rate": 5.149999999999999e-07, |
| "loss": 0.0009, |
| "reward": 2.775, |
| "reward_std": 0.0, |
| "rewards/accuracy_reward": 1.775, |
| "rewards/format_reward": 1.0, |
| "step": 970 |
| }, |
| { |
| "completion_length": 50.090625, |
| "epoch": 0.784, |
| "grad_norm": 0.07448896020650864, |
| "kl": 0.15419921875, |
| "learning_rate": 5.1e-07, |
| "loss": 0.0015, |
| "reward": 2.76875, |
| "reward_std": 0.0375, |
| "rewards/accuracy_reward": 1.76875, |
| "rewards/format_reward": 1.0, |
| "step": 980 |
| }, |
| { |
| "completion_length": 49.23125, |
| "epoch": 0.792, |
| "grad_norm": 2.0038902759552, |
| "kl": 0.098828125, |
| "learning_rate": 5.049999999999999e-07, |
| "loss": 0.001, |
| "reward": 2.64375, |
| "reward_std": 0.0125, |
| "rewards/accuracy_reward": 1.64375, |
| "rewards/format_reward": 1.0, |
| "step": 990 |
| }, |
| { |
| "completion_length": 49.790625, |
| "epoch": 0.8, |
| "grad_norm": 0.03871207684278488, |
| "kl": 0.31083984375, |
| "learning_rate": 5e-07, |
| "loss": 0.0031, |
| "reward": 2.7625, |
| "reward_std": 0.025, |
| "rewards/accuracy_reward": 1.7625, |
| "rewards/format_reward": 1.0, |
| "step": 1000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 2000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|