{ "best_global_step": 800, "best_metric": 209.9661102294922, "best_model_checkpoint": "final-model-dpo-ad-1ep/checkpoint-800", "epoch": 1.0, "eval_steps": 800, "global_step": 3230, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03096634345545685, "grad_norm": 4.53125, "learning_rate": 9.976838348954221e-06, "loss": 5.5573, "step": 100 }, { "epoch": 0.0619326869109137, "grad_norm": 4.15625, "learning_rate": 9.906634890087323e-06, "loss": 0.9191, "step": 200 }, { "epoch": 0.09289903036637055, "grad_norm": 4.96875, "learning_rate": 9.790050865156384e-06, "loss": 1.0287, "step": 300 }, { "epoch": 0.1238653738218274, "grad_norm": 11.625, "learning_rate": 9.628188298907782e-06, "loss": 1.0742, "step": 400 }, { "epoch": 0.15483171727728426, "grad_norm": 8.6875, "learning_rate": 9.422577217034351e-06, "loss": 1.2919, "step": 500 }, { "epoch": 0.1857980607327411, "grad_norm": 16.75, "learning_rate": 9.175161183420499e-06, "loss": 0.9978, "step": 600 }, { "epoch": 0.21676440418819795, "grad_norm": 5.1875, "learning_rate": 8.888278928367003e-06, "loss": 1.0586, "step": 700 }, { "epoch": 0.2477307476436548, "grad_norm": 15.4375, "learning_rate": 8.564642241456986e-06, "loss": 1.4034, "step": 800 }, { "epoch": 0.2477307476436548, "eval_logits/chosen": NaN, "eval_logits/rejected": NaN, "eval_logps/chosen": -6010.39697265625, "eval_logps/rejected": -4464.38623046875, "eval_loss": 209.9661102294922, "eval_rewards/accuracies": 0.3250929117202759, "eval_rewards/chosen": -555.6686401367188, "eval_rewards/margins": -142.71261596679688, "eval_rewards/rejected": -412.95599365234375, "eval_runtime": 1394.4098, "eval_samples_per_second": 13.895, "eval_steps_per_second": 13.895, "step": 800 }, { "epoch": 0.27869709109911167, "grad_norm": 13.875, "learning_rate": 8.207310338033391e-06, "loss": 1.5456, "step": 900 }, { "epoch": 0.3096634345545685, "grad_norm": 40.25, "learning_rate": 7.819660941592014e-06, "loss": 1.1894, "step": 1000 }, { "epoch": 0.34062977801002536, "grad_norm": 40.25, "learning_rate": 7.405358355437272e-06, "loss": 1.4514, "step": 1100 }, { "epoch": 0.3715961214654822, "grad_norm": 10.375, "learning_rate": 6.968318825407323e-06, "loss": 1.1933, "step": 1200 }, { "epoch": 0.40256246492093906, "grad_norm": 28.125, "learning_rate": 6.512673521081566e-06, "loss": 1.3781, "step": 1300 }, { "epoch": 0.4335288083763959, "grad_norm": 10.75, "learning_rate": 6.042729485395221e-06, "loss": 1.219, "step": 1400 }, { "epoch": 0.46449515183185275, "grad_norm": 13.4375, "learning_rate": 5.562928921789507e-06, "loss": 1.3677, "step": 1500 }, { "epoch": 0.4954614952873096, "grad_norm": 20.5, "learning_rate": 5.077807203740619e-06, "loss": 2.2353, "step": 1600 }, { "epoch": 0.4954614952873096, "eval_logits/chosen": NaN, "eval_logits/rejected": NaN, "eval_logps/chosen": -6157.478515625, "eval_logps/rejected": -4588.53955078125, "eval_loss": 214.6779022216797, "eval_rewards/accuracies": 0.3266928195953369, "eval_rewards/chosen": -570.3768310546875, "eval_rewards/margins": -145.00547790527344, "eval_rewards/rejected": -425.37127685546875, "eval_runtime": 2581.7706, "eval_samples_per_second": 7.505, "eval_steps_per_second": 7.505, "step": 1600 }, { "epoch": 0.5264278387427664, "grad_norm": 7.25, "learning_rate": 4.591950003587562e-06, "loss": 1.754, "step": 1700 }, { "epoch": 0.5573941821982233, "grad_norm": 10.6875, "learning_rate": 4.109949945903833e-06, "loss": 1.6524, "step": 1800 }, { "epoch": 0.5883605256536801, "grad_norm": 12.9375, "learning_rate": 3.636363195152255e-06, "loss": 1.2557, "step": 1900 }, { "epoch": 0.619326869109137, "grad_norm": 27.5, "learning_rate": 3.1756663879834735e-06, "loss": 1.2763, "step": 2000 }, { "epoch": 0.6502932125645938, "grad_norm": 18.25, "learning_rate": 2.732214317280802e-06, "loss": 1.7662, "step": 2100 }, { "epoch": 0.6812595560200507, "grad_norm": 24.25, "learning_rate": 2.3101987679481918e-06, "loss": 1.7935, "step": 2200 }, { "epoch": 0.7122258994755075, "grad_norm": 22.375, "learning_rate": 1.913608893551036e-06, "loss": 1.6459, "step": 2300 }, { "epoch": 0.7431922429309644, "grad_norm": 32.75, "learning_rate": 1.5461935083544755e-06, "loss": 1.4079, "step": 2400 }, { "epoch": 0.7431922429309644, "eval_logits/chosen": NaN, "eval_logits/rejected": NaN, "eval_logps/chosen": -6187.34716796875, "eval_logps/rejected": -4600.21826171875, "eval_loss": 216.61477661132812, "eval_rewards/accuracies": 0.3254541754722595, "eval_rewards/chosen": -573.3637084960938, "eval_rewards/margins": -146.82444763183594, "eval_rewards/rejected": -426.5391845703125, "eval_runtime": 2856.8785, "eval_samples_per_second": 6.782, "eval_steps_per_second": 6.782, "step": 2400 }, { "epoch": 0.7741585863864212, "grad_norm": 16.375, "learning_rate": 1.2114256511983274e-06, "loss": 1.2724, "step": 2500 }, { "epoch": 0.8051249298418781, "grad_norm": 12.6875, "learning_rate": 9.124697561729073e-07, "loss": 1.5263, "step": 2600 }, { "epoch": 0.836091273297335, "grad_norm": 12.75, "learning_rate": 6.521517404190009e-07, "loss": 1.6869, "step": 2700 }, { "epoch": 0.8670576167527918, "grad_norm": 11.625, "learning_rate": 4.3293229180065233e-07, "loss": 1.4792, "step": 2800 }, { "epoch": 0.8980239602082487, "grad_norm": 9.0, "learning_rate": 2.5688360895234796e-07, "loss": 1.3264, "step": 2900 }, { "epoch": 0.9289903036637055, "grad_norm": 25.625, "learning_rate": 1.256698135681289e-07, "loss": 1.3937, "step": 3000 }, { "epoch": 0.9599566471191624, "grad_norm": 21.25, "learning_rate": 4.05312200878627e-08, "loss": 1.6048, "step": 3100 }, { "epoch": 0.9909229905746192, "grad_norm": 36.0, "learning_rate": 2.272611473388975e-09, "loss": 1.2954, "step": 3200 }, { "epoch": 0.9909229905746192, "eval_logits/chosen": NaN, "eval_logits/rejected": NaN, "eval_logps/chosen": -6186.20751953125, "eval_logps/rejected": -4598.9970703125, "eval_loss": 216.60252380371094, "eval_rewards/accuracies": 0.3249380588531494, "eval_rewards/chosen": -573.2498168945312, "eval_rewards/margins": -146.83267211914062, "eval_rewards/rejected": -426.41705322265625, "eval_runtime": 1367.7925, "eval_samples_per_second": 14.166, "eval_steps_per_second": 14.166, "step": 3200 } ], "logging_steps": 100, "max_steps": 3230, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 800, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }