diff --git "a/dpo_Qwen2.5-14B-Instruct/trainer_state.json" "b/dpo_Qwen2.5-14B-Instruct/trainer_state.json" new file mode 100644--- /dev/null +++ "b/dpo_Qwen2.5-14B-Instruct/trainer_state.json" @@ -0,0 +1,3097 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.282051282051282, + "eval_steps": 100, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00641025641025641, + "grad_norm": 112.49995152389637, + "learning_rate": 1e-08, + "logits/chosen": -1.2448434829711914, + "logits/rejected": -1.2056102752685547, + "logps/chosen": -354.68695068359375, + "logps/rejected": -337.0610656738281, + "loss": 2.5996, + "rewards/accuracies": 0.625, + "rewards/chosen": 3.5995607376098633, + "rewards/margins": -0.22218748927116394, + "rewards/rejected": 3.8217480182647705, + "step": 2 + }, + { + "epoch": 0.01282051282051282, + "grad_norm": 119.1548147385064, + "learning_rate": 2e-08, + "logits/chosen": -0.9590847492218018, + "logits/rejected": -0.8616329431533813, + "logps/chosen": -315.861572265625, + "logps/rejected": -434.3745422363281, + "loss": 1.9575, + "rewards/accuracies": 0.375, + "rewards/chosen": 1.9579038619995117, + "rewards/margins": -3.184068441390991, + "rewards/rejected": 5.141972064971924, + "step": 4 + }, + { + "epoch": 0.019230769230769232, + "grad_norm": 122.87860751187283, + "learning_rate": 3e-08, + "logits/chosen": -1.0793626308441162, + "logits/rejected": -1.0623524188995361, + "logps/chosen": -311.1429443359375, + "logps/rejected": -367.40118408203125, + "loss": 2.6614, + "rewards/accuracies": 0.375, + "rewards/chosen": 2.7262353897094727, + "rewards/margins": -1.0591639280319214, + "rewards/rejected": 3.7853991985321045, + "step": 6 + }, + { + "epoch": 0.02564102564102564, + "grad_norm": 130.14004749083819, + "learning_rate": 4e-08, + "logits/chosen": -0.996293842792511, + "logits/rejected": -1.0375988483428955, + "logps/chosen": -319.14990234375, + "logps/rejected": -332.11102294921875, + "loss": 1.7324, + "rewards/accuracies": 0.4375, + "rewards/chosen": 3.158529758453369, + "rewards/margins": -1.2365788221359253, + "rewards/rejected": 4.395108222961426, + "step": 8 + }, + { + "epoch": 0.03205128205128205, + "grad_norm": 123.36678612830033, + "learning_rate": 5e-08, + "logits/chosen": -1.163619041442871, + "logits/rejected": -1.1582694053649902, + "logps/chosen": -284.8478088378906, + "logps/rejected": -322.7598876953125, + "loss": 2.6948, + "rewards/accuracies": 0.375, + "rewards/chosen": 2.851339340209961, + "rewards/margins": -3.2639975547790527, + "rewards/rejected": 6.115336894989014, + "step": 10 + }, + { + "epoch": 0.038461538461538464, + "grad_norm": 100.45692289390142, + "learning_rate": 6e-08, + "logits/chosen": -1.2759606838226318, + "logits/rejected": -1.2590994834899902, + "logps/chosen": -284.07769775390625, + "logps/rejected": -321.27484130859375, + "loss": 1.9801, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.7482399940490723, + "rewards/margins": 1.01253342628479, + "rewards/rejected": 2.735706329345703, + "step": 12 + }, + { + "epoch": 0.04487179487179487, + "grad_norm": 96.4108624137188, + "learning_rate": 7e-08, + "logits/chosen": -1.1259269714355469, + "logits/rejected": -0.9157360196113586, + "logps/chosen": -351.7616882324219, + "logps/rejected": -363.45458984375, + "loss": 1.8719, + "rewards/accuracies": 0.375, + "rewards/chosen": 2.427593231201172, + "rewards/margins": -3.161393642425537, + "rewards/rejected": 5.588986396789551, + "step": 14 + }, + { + "epoch": 0.05128205128205128, + "grad_norm": 95.42355115319369, + "learning_rate": 8e-08, + "logits/chosen": -1.1730051040649414, + "logits/rejected": -1.125391960144043, + "logps/chosen": -295.6844177246094, + "logps/rejected": -248.12355041503906, + "loss": 2.5148, + "rewards/accuracies": 0.4375, + "rewards/chosen": 2.9050228595733643, + "rewards/margins": -0.5626314282417297, + "rewards/rejected": 3.467654228210449, + "step": 16 + }, + { + "epoch": 0.057692307692307696, + "grad_norm": 97.63560279065712, + "learning_rate": 9e-08, + "logits/chosen": -1.183789610862732, + "logits/rejected": -1.1477504968643188, + "logps/chosen": -304.2013854980469, + "logps/rejected": -324.23773193359375, + "loss": 2.1499, + "rewards/accuracies": 0.4375, + "rewards/chosen": 2.8154826164245605, + "rewards/margins": -1.4445247650146484, + "rewards/rejected": 4.260007381439209, + "step": 18 + }, + { + "epoch": 0.0641025641025641, + "grad_norm": 117.8852112781559, + "learning_rate": 1e-07, + "logits/chosen": -1.063579797744751, + "logits/rejected": -1.0517311096191406, + "logps/chosen": -344.0760192871094, + "logps/rejected": -372.3055114746094, + "loss": 2.0421, + "rewards/accuracies": 0.375, + "rewards/chosen": 3.2084708213806152, + "rewards/margins": -0.8433233499526978, + "rewards/rejected": 4.051794052124023, + "step": 20 + }, + { + "epoch": 0.07051282051282051, + "grad_norm": 76.99267413550359, + "learning_rate": 1.0999999999999999e-07, + "logits/chosen": -1.0992228984832764, + "logits/rejected": -0.9774513244628906, + "logps/chosen": -273.5168762207031, + "logps/rejected": -375.86981201171875, + "loss": 1.8554, + "rewards/accuracies": 0.375, + "rewards/chosen": 1.8230633735656738, + "rewards/margins": -1.5747861862182617, + "rewards/rejected": 3.3978495597839355, + "step": 22 + }, + { + "epoch": 0.07692307692307693, + "grad_norm": 155.9042986237964, + "learning_rate": 1.2e-07, + "logits/chosen": -1.0372235774993896, + "logits/rejected": -0.8894882202148438, + "logps/chosen": -237.50111389160156, + "logps/rejected": -286.1100158691406, + "loss": 2.5155, + "rewards/accuracies": 0.625, + "rewards/chosen": 2.2997632026672363, + "rewards/margins": -0.5999266505241394, + "rewards/rejected": 2.8996896743774414, + "step": 24 + }, + { + "epoch": 0.08333333333333333, + "grad_norm": 115.55369479155001, + "learning_rate": 1.3e-07, + "logits/chosen": -1.2320899963378906, + "logits/rejected": -0.9306809902191162, + "logps/chosen": -365.75048828125, + "logps/rejected": -370.4112243652344, + "loss": 2.3311, + "rewards/accuracies": 0.4375, + "rewards/chosen": 2.4405670166015625, + "rewards/margins": -3.624695062637329, + "rewards/rejected": 6.065262317657471, + "step": 26 + }, + { + "epoch": 0.08974358974358974, + "grad_norm": 111.23075573081465, + "learning_rate": 1.4e-07, + "logits/chosen": -1.0849733352661133, + "logits/rejected": -1.0493545532226562, + "logps/chosen": -265.06085205078125, + "logps/rejected": -336.2076721191406, + "loss": 1.738, + "rewards/accuracies": 0.5, + "rewards/chosen": 2.724541664123535, + "rewards/margins": -0.48108214139938354, + "rewards/rejected": 3.2056236267089844, + "step": 28 + }, + { + "epoch": 0.09615384615384616, + "grad_norm": 84.93735572889129, + "learning_rate": 1.5e-07, + "logits/chosen": -1.0241395235061646, + "logits/rejected": -1.0278414487838745, + "logps/chosen": -291.67333984375, + "logps/rejected": -280.37554931640625, + "loss": 1.6791, + "rewards/accuracies": 0.625, + "rewards/chosen": 2.423109531402588, + "rewards/margins": -0.48134735226631165, + "rewards/rejected": 2.904456615447998, + "step": 30 + }, + { + "epoch": 0.10256410256410256, + "grad_norm": 122.82605744564269, + "learning_rate": 1.6e-07, + "logits/chosen": -1.2750225067138672, + "logits/rejected": -1.1413352489471436, + "logps/chosen": -313.16650390625, + "logps/rejected": -330.56103515625, + "loss": 1.9623, + "rewards/accuracies": 0.625, + "rewards/chosen": 3.3830716609954834, + "rewards/margins": -0.23222044110298157, + "rewards/rejected": 3.6152920722961426, + "step": 32 + }, + { + "epoch": 0.10897435897435898, + "grad_norm": 130.41349138492058, + "learning_rate": 1.7000000000000001e-07, + "logits/chosen": -1.285585641860962, + "logits/rejected": -1.3130979537963867, + "logps/chosen": -341.670654296875, + "logps/rejected": -377.56591796875, + "loss": 1.6415, + "rewards/accuracies": 0.75, + "rewards/chosen": 3.555304527282715, + "rewards/margins": 0.6710004210472107, + "rewards/rejected": 2.8843040466308594, + "step": 34 + }, + { + "epoch": 0.11538461538461539, + "grad_norm": 110.48141078039077, + "learning_rate": 1.8e-07, + "logits/chosen": -1.191585898399353, + "logits/rejected": -1.0690052509307861, + "logps/chosen": -272.9148864746094, + "logps/rejected": -312.38714599609375, + "loss": 1.961, + "rewards/accuracies": 0.375, + "rewards/chosen": 3.6469974517822266, + "rewards/margins": -2.608886957168579, + "rewards/rejected": 6.255884170532227, + "step": 36 + }, + { + "epoch": 0.12179487179487179, + "grad_norm": 119.32596313319678, + "learning_rate": 1.8999999999999998e-07, + "logits/chosen": -1.0829436779022217, + "logits/rejected": -1.149304986000061, + "logps/chosen": -237.84658813476562, + "logps/rejected": -269.9066467285156, + "loss": 1.9882, + "rewards/accuracies": 0.375, + "rewards/chosen": 2.878349542617798, + "rewards/margins": -0.8613954186439514, + "rewards/rejected": 3.7397449016571045, + "step": 38 + }, + { + "epoch": 0.1282051282051282, + "grad_norm": 173.2340771350673, + "learning_rate": 2e-07, + "logits/chosen": -1.277957797050476, + "logits/rejected": -1.1888484954833984, + "logps/chosen": -269.4563903808594, + "logps/rejected": -308.91265869140625, + "loss": 2.5476, + "rewards/accuracies": 0.5, + "rewards/chosen": 3.6314220428466797, + "rewards/margins": -1.9564207792282104, + "rewards/rejected": 5.58784294128418, + "step": 40 + }, + { + "epoch": 0.1346153846153846, + "grad_norm": 135.60629685979305, + "learning_rate": 2.0999999999999997e-07, + "logits/chosen": -0.9286764860153198, + "logits/rejected": -0.9079974889755249, + "logps/chosen": -294.025634765625, + "logps/rejected": -319.5558166503906, + "loss": 1.9227, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.2886180877685547, + "rewards/margins": 0.5016406774520874, + "rewards/rejected": 1.7869774103164673, + "step": 42 + }, + { + "epoch": 0.14102564102564102, + "grad_norm": 195.10368309542935, + "learning_rate": 2.1999999999999998e-07, + "logits/chosen": -1.0710866451263428, + "logits/rejected": -1.0489989519119263, + "logps/chosen": -409.41650390625, + "logps/rejected": -380.30426025390625, + "loss": 2.5616, + "rewards/accuracies": 0.5625, + "rewards/chosen": 2.735966682434082, + "rewards/margins": -0.38184601068496704, + "rewards/rejected": 3.1178126335144043, + "step": 44 + }, + { + "epoch": 0.14743589743589744, + "grad_norm": 130.8198419075927, + "learning_rate": 2.3e-07, + "logits/chosen": -1.1227138042449951, + "logits/rejected": -1.1442391872406006, + "logps/chosen": -284.76885986328125, + "logps/rejected": -329.89532470703125, + "loss": 2.277, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.643310546875, + "rewards/margins": 0.5456356406211853, + "rewards/rejected": 2.09767484664917, + "step": 46 + }, + { + "epoch": 0.15384615384615385, + "grad_norm": 130.26006638586685, + "learning_rate": 2.4e-07, + "logits/chosen": -1.1510767936706543, + "logits/rejected": -1.1432616710662842, + "logps/chosen": -289.3182067871094, + "logps/rejected": -290.2795104980469, + "loss": 1.7135, + "rewards/accuracies": 0.5, + "rewards/chosen": 3.615015983581543, + "rewards/margins": -1.5631537437438965, + "rewards/rejected": 5.1781697273254395, + "step": 48 + }, + { + "epoch": 0.16025641025641027, + "grad_norm": 137.33901755874504, + "learning_rate": 2.5e-07, + "logits/chosen": -1.0494900941848755, + "logits/rejected": -1.1131045818328857, + "logps/chosen": -329.8565368652344, + "logps/rejected": -311.13909912109375, + "loss": 1.4951, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.6522865295410156, + "rewards/margins": 0.21213269233703613, + "rewards/rejected": 2.4401538372039795, + "step": 50 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 104.98354037290923, + "learning_rate": 2.6e-07, + "logits/chosen": -0.9214585423469543, + "logits/rejected": -0.8989849090576172, + "logps/chosen": -286.54913330078125, + "logps/rejected": -294.0342712402344, + "loss": 1.6991, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.9489517211914062, + "rewards/margins": 0.5319545269012451, + "rewards/rejected": 2.4169974327087402, + "step": 52 + }, + { + "epoch": 0.17307692307692307, + "grad_norm": 108.50076448756127, + "learning_rate": 2.7e-07, + "logits/chosen": -1.1428769826889038, + "logits/rejected": -1.1290037631988525, + "logps/chosen": -291.0087585449219, + "logps/rejected": -317.94281005859375, + "loss": 1.4717, + "rewards/accuracies": 0.5, + "rewards/chosen": 3.3230457305908203, + "rewards/margins": -0.4745556712150574, + "rewards/rejected": 3.7976016998291016, + "step": 54 + }, + { + "epoch": 0.1794871794871795, + "grad_norm": 103.94370143588714, + "learning_rate": 2.8e-07, + "logits/chosen": -1.0812921524047852, + "logits/rejected": -1.0234498977661133, + "logps/chosen": -376.9350280761719, + "logps/rejected": -397.9642333984375, + "loss": 1.455, + "rewards/accuracies": 0.5625, + "rewards/chosen": 2.3725953102111816, + "rewards/margins": -0.046460457146167755, + "rewards/rejected": 2.419055461883545, + "step": 56 + }, + { + "epoch": 0.1858974358974359, + "grad_norm": 62.726054551655814, + "learning_rate": 2.9e-07, + "logits/chosen": -1.073312520980835, + "logits/rejected": -0.9113187789916992, + "logps/chosen": -264.15667724609375, + "logps/rejected": -298.25274658203125, + "loss": 1.3886, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.8897204399108887, + "rewards/margins": -0.18457120656967163, + "rewards/rejected": 3.074291706085205, + "step": 58 + }, + { + "epoch": 0.19230769230769232, + "grad_norm": 103.93132540755165, + "learning_rate": 3e-07, + "logits/chosen": -1.2675256729125977, + "logits/rejected": -1.1400632858276367, + "logps/chosen": -399.09112548828125, + "logps/rejected": -398.646728515625, + "loss": 1.3299, + "rewards/accuracies": 0.625, + "rewards/chosen": 3.039437770843506, + "rewards/margins": -0.21770703792572021, + "rewards/rejected": 3.2571449279785156, + "step": 60 + }, + { + "epoch": 0.1987179487179487, + "grad_norm": 99.31530861515097, + "learning_rate": 3.1e-07, + "logits/chosen": -1.0217715501785278, + "logits/rejected": -0.959376335144043, + "logps/chosen": -298.750732421875, + "logps/rejected": -414.6912841796875, + "loss": 1.2063, + "rewards/accuracies": 0.5, + "rewards/chosen": 2.600154399871826, + "rewards/margins": 2.6170525550842285, + "rewards/rejected": -0.016898036003112793, + "step": 62 + }, + { + "epoch": 0.20512820512820512, + "grad_norm": 74.78192042440652, + "learning_rate": 3.2e-07, + "logits/chosen": -1.1241461038589478, + "logits/rejected": -1.1051523685455322, + "logps/chosen": -328.9538879394531, + "logps/rejected": -388.71807861328125, + "loss": 0.8977, + "rewards/accuracies": 0.5625, + "rewards/chosen": 3.5934743881225586, + "rewards/margins": -0.6333028078079224, + "rewards/rejected": 4.22677755355835, + "step": 64 + }, + { + "epoch": 0.21153846153846154, + "grad_norm": 78.90872165665947, + "learning_rate": 3.3e-07, + "logits/chosen": -1.1632623672485352, + "logits/rejected": -1.099599838256836, + "logps/chosen": -281.401123046875, + "logps/rejected": -301.1926574707031, + "loss": 1.235, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.605003833770752, + "rewards/margins": 1.4718358516693115, + "rewards/rejected": 1.1331682205200195, + "step": 66 + }, + { + "epoch": 0.21794871794871795, + "grad_norm": 83.30191689706389, + "learning_rate": 3.4000000000000003e-07, + "logits/chosen": -1.1661943197250366, + "logits/rejected": -1.0930298566818237, + "logps/chosen": -347.7239685058594, + "logps/rejected": -496.9989013671875, + "loss": 0.8906, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.7266435623168945, + "rewards/margins": 2.3288183212280273, + "rewards/rejected": 0.39782512187957764, + "step": 68 + }, + { + "epoch": 0.22435897435897437, + "grad_norm": 102.47680974291092, + "learning_rate": 3.5e-07, + "logits/chosen": -1.0875322818756104, + "logits/rejected": -0.8776979446411133, + "logps/chosen": -270.054931640625, + "logps/rejected": -316.69219970703125, + "loss": 1.1472, + "rewards/accuracies": 0.375, + "rewards/chosen": 2.149214267730713, + "rewards/margins": 1.7957603931427002, + "rewards/rejected": 0.3534536361694336, + "step": 70 + }, + { + "epoch": 0.23076923076923078, + "grad_norm": 97.97025119907397, + "learning_rate": 3.6e-07, + "logits/chosen": -1.174084186553955, + "logits/rejected": -1.1361169815063477, + "logps/chosen": -305.4373779296875, + "logps/rejected": -437.92315673828125, + "loss": 1.3013, + "rewards/accuracies": 0.375, + "rewards/chosen": 1.9955430030822754, + "rewards/margins": 0.16910505294799805, + "rewards/rejected": 1.8264379501342773, + "step": 72 + }, + { + "epoch": 0.23717948717948717, + "grad_norm": 100.62052512284637, + "learning_rate": 3.7e-07, + "logits/chosen": -1.1352994441986084, + "logits/rejected": -1.133829116821289, + "logps/chosen": -321.6334533691406, + "logps/rejected": -345.19525146484375, + "loss": 0.9001, + "rewards/accuracies": 0.625, + "rewards/chosen": 1.579514741897583, + "rewards/margins": 0.8097899556159973, + "rewards/rejected": 0.7697248458862305, + "step": 74 + }, + { + "epoch": 0.24358974358974358, + "grad_norm": 56.69877366701209, + "learning_rate": 3.7999999999999996e-07, + "logits/chosen": -1.2270028591156006, + "logits/rejected": -1.247793197631836, + "logps/chosen": -360.0379943847656, + "logps/rejected": -531.5338745117188, + "loss": 0.7812, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.967543840408325, + "rewards/margins": 5.847589492797852, + "rewards/rejected": -2.8800458908081055, + "step": 76 + }, + { + "epoch": 0.25, + "grad_norm": 113.90412408576785, + "learning_rate": 3.8999999999999997e-07, + "logits/chosen": -1.320005178451538, + "logits/rejected": -1.1764707565307617, + "logps/chosen": -263.6769104003906, + "logps/rejected": -476.15814208984375, + "loss": 1.1502, + "rewards/accuracies": 0.6875, + "rewards/chosen": 3.257598400115967, + "rewards/margins": 5.317660808563232, + "rewards/rejected": -2.0600616931915283, + "step": 78 + }, + { + "epoch": 0.2564102564102564, + "grad_norm": 106.64611056078613, + "learning_rate": 4e-07, + "logits/chosen": -1.148504376411438, + "logits/rejected": -1.1187819242477417, + "logps/chosen": -262.672119140625, + "logps/rejected": -323.44873046875, + "loss": 1.0057, + "rewards/accuracies": 0.375, + "rewards/chosen": 2.6537346839904785, + "rewards/margins": 0.2272154688835144, + "rewards/rejected": 2.4265193939208984, + "step": 80 + }, + { + "epoch": 0.26282051282051283, + "grad_norm": 98.6839252752944, + "learning_rate": 4.0999999999999994e-07, + "logits/chosen": -1.2318834066390991, + "logits/rejected": -1.1649481058120728, + "logps/chosen": -275.3931884765625, + "logps/rejected": -295.09283447265625, + "loss": 0.984, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.413456439971924, + "rewards/margins": 1.1995419263839722, + "rewards/rejected": 1.213914394378662, + "step": 82 + }, + { + "epoch": 0.2692307692307692, + "grad_norm": 76.55668058412084, + "learning_rate": 4.1999999999999995e-07, + "logits/chosen": -1.345212697982788, + "logits/rejected": -1.2046184539794922, + "logps/chosen": -342.62939453125, + "logps/rejected": -364.7247619628906, + "loss": 0.694, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.6433377265930176, + "rewards/margins": 0.919085681438446, + "rewards/rejected": 1.7242518663406372, + "step": 84 + }, + { + "epoch": 0.27564102564102566, + "grad_norm": 103.90243960644969, + "learning_rate": 4.2999999999999996e-07, + "logits/chosen": -1.072326421737671, + "logits/rejected": -0.9028797149658203, + "logps/chosen": -259.6208801269531, + "logps/rejected": -361.7303466796875, + "loss": 0.9374, + "rewards/accuracies": 0.5625, + "rewards/chosen": 2.208549976348877, + "rewards/margins": 2.0721185207366943, + "rewards/rejected": 0.13643111288547516, + "step": 86 + }, + { + "epoch": 0.28205128205128205, + "grad_norm": 60.43741134530524, + "learning_rate": 4.3999999999999997e-07, + "logits/chosen": -1.1322457790374756, + "logits/rejected": -1.1339733600616455, + "logps/chosen": -291.927978515625, + "logps/rejected": -385.3431396484375, + "loss": 0.8154, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.1503965854644775, + "rewards/margins": 2.4622902870178223, + "rewards/rejected": -0.3118935227394104, + "step": 88 + }, + { + "epoch": 0.28846153846153844, + "grad_norm": 64.96931007630867, + "learning_rate": 4.5e-07, + "logits/chosen": -1.3891561031341553, + "logits/rejected": -1.216461420059204, + "logps/chosen": -328.8116455078125, + "logps/rejected": -440.5103759765625, + "loss": 0.6651, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.9966187477111816, + "rewards/margins": 5.870615005493164, + "rewards/rejected": -2.8739962577819824, + "step": 90 + }, + { + "epoch": 0.2948717948717949, + "grad_norm": 66.44661160039124, + "learning_rate": 4.6e-07, + "logits/chosen": -1.085283875465393, + "logits/rejected": -1.065922737121582, + "logps/chosen": -228.49908447265625, + "logps/rejected": -317.1948547363281, + "loss": 0.6938, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.4189622402191162, + "rewards/margins": 1.909195899963379, + "rewards/rejected": -0.49023348093032837, + "step": 92 + }, + { + "epoch": 0.30128205128205127, + "grad_norm": 63.1207548618782, + "learning_rate": 4.6999999999999995e-07, + "logits/chosen": -1.367796540260315, + "logits/rejected": -1.3280856609344482, + "logps/chosen": -241.49716186523438, + "logps/rejected": -269.218505859375, + "loss": 0.7154, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.2394094467163086, + "rewards/margins": -0.29525822401046753, + "rewards/rejected": 1.5346674919128418, + "step": 94 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 63.45621982457679, + "learning_rate": 4.8e-07, + "logits/chosen": -1.2086436748504639, + "logits/rejected": -1.1243126392364502, + "logps/chosen": -325.5728454589844, + "logps/rejected": -541.4525756835938, + "loss": 0.6701, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.1171650886535645, + "rewards/margins": 6.678624629974365, + "rewards/rejected": -4.561459541320801, + "step": 96 + }, + { + "epoch": 0.3141025641025641, + "grad_norm": 46.682270342642695, + "learning_rate": 4.9e-07, + "logits/chosen": -1.1883878707885742, + "logits/rejected": -1.0791866779327393, + "logps/chosen": -268.7866516113281, + "logps/rejected": -445.59295654296875, + "loss": 0.5977, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7666086554527283, + "rewards/margins": 9.177267074584961, + "rewards/rejected": -8.410658836364746, + "step": 98 + }, + { + "epoch": 0.32051282051282054, + "grad_norm": 56.20155145754416, + "learning_rate": 5e-07, + "logits/chosen": -1.0538123846054077, + "logits/rejected": -1.057773470878601, + "logps/chosen": -335.8092956542969, + "logps/rejected": -443.2895812988281, + "loss": 0.6577, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.8577184677124023, + "rewards/margins": 10.441128730773926, + "rewards/rejected": -8.583410263061523, + "step": 100 + }, + { + "epoch": 0.32051282051282054, + "eval_logits/chosen": -0.8472009897232056, + "eval_logits/rejected": -0.8077085018157959, + "eval_logps/chosen": -445.84893798828125, + "eval_logps/rejected": -573.0791015625, + "eval_loss": 0.9123198986053467, + "eval_rewards/accuracies": 0.5384615659713745, + "eval_rewards/chosen": 0.8257265686988831, + "eval_rewards/margins": 13.480864524841309, + "eval_rewards/rejected": -12.655137062072754, + "eval_runtime": 18.6991, + "eval_samples_per_second": 5.348, + "eval_steps_per_second": 0.695, + "step": 100 + }, + { + "epoch": 0.3269230769230769, + "grad_norm": 87.55882265844058, + "learning_rate": 4.999820277698201e-07, + "logits/chosen": -1.3192846775054932, + "logits/rejected": -1.1711474657058716, + "logps/chosen": -277.801025390625, + "logps/rejected": -559.6638793945312, + "loss": 1.4, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.057104647159576416, + "rewards/margins": 19.48062515258789, + "rewards/rejected": -19.537729263305664, + "step": 102 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 71.98896944839848, + "learning_rate": 4.999281136632891e-07, + "logits/chosen": -1.2481763362884521, + "logits/rejected": -1.1536132097244263, + "logps/chosen": -310.8056640625, + "logps/rejected": -395.9885559082031, + "loss": 0.7369, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.07353533804416656, + "rewards/margins": 4.133586883544922, + "rewards/rejected": -4.060051918029785, + "step": 104 + }, + { + "epoch": 0.33974358974358976, + "grad_norm": 170.44433422910302, + "learning_rate": 4.998382654320609e-07, + "logits/chosen": -1.3416852951049805, + "logits/rejected": -1.3144190311431885, + "logps/chosen": -444.1352233886719, + "logps/rejected": -465.98077392578125, + "loss": 0.9978, + "rewards/accuracies": 0.625, + "rewards/chosen": 1.5513283014297485, + "rewards/margins": 0.332836776971817, + "rewards/rejected": 1.2184914350509644, + "step": 106 + }, + { + "epoch": 0.34615384615384615, + "grad_norm": 59.59385148315801, + "learning_rate": 4.997124959943201e-07, + "logits/chosen": -1.221919059753418, + "logits/rejected": -1.2385119199752808, + "logps/chosen": -314.23291015625, + "logps/rejected": -401.683349609375, + "loss": 0.6857, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2510930299758911, + "rewards/margins": 2.87782621383667, + "rewards/rejected": -2.6267330646514893, + "step": 108 + }, + { + "epoch": 0.3525641025641026, + "grad_norm": 54.29137105777559, + "learning_rate": 4.99550823432925e-07, + "logits/chosen": -1.2256851196289062, + "logits/rejected": -1.1725949048995972, + "logps/chosen": -363.8654479980469, + "logps/rejected": -442.36474609375, + "loss": 0.6015, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.9511426091194153, + "rewards/margins": 1.252941370010376, + "rewards/rejected": -0.3017987012863159, + "step": 110 + }, + { + "epoch": 0.358974358974359, + "grad_norm": 63.371646866325236, + "learning_rate": 4.993532709928075e-07, + "logits/chosen": -1.4472473859786987, + "logits/rejected": -1.3765482902526855, + "logps/chosen": -373.6371765136719, + "logps/rejected": -431.0395202636719, + "loss": 0.5987, + "rewards/accuracies": 0.625, + "rewards/chosen": 1.1647520065307617, + "rewards/margins": 6.814955711364746, + "rewards/rejected": -5.650203704833984, + "step": 112 + }, + { + "epoch": 0.36538461538461536, + "grad_norm": 141.70303288043664, + "learning_rate": 4.99119867077631e-07, + "logits/chosen": -1.0582250356674194, + "logits/rejected": -1.0068751573562622, + "logps/chosen": -293.22735595703125, + "logps/rejected": -369.929931640625, + "loss": 0.7043, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.053703784942627, + "rewards/margins": 3.7460641860961914, + "rewards/rejected": -5.799767971038818, + "step": 114 + }, + { + "epoch": 0.3717948717948718, + "grad_norm": 110.93696504596429, + "learning_rate": 4.988506452457066e-07, + "logits/chosen": -1.2521686553955078, + "logits/rejected": -1.235703945159912, + "logps/chosen": -252.42727661132812, + "logps/rejected": -303.4141845703125, + "loss": 0.6717, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.9578218460083008, + "rewards/margins": 1.5941026210784912, + "rewards/rejected": 0.3637194037437439, + "step": 116 + }, + { + "epoch": 0.3782051282051282, + "grad_norm": 58.86737023394013, + "learning_rate": 4.985456442051682e-07, + "logits/chosen": -1.446759819984436, + "logits/rejected": -1.440021276473999, + "logps/chosen": -265.3540344238281, + "logps/rejected": -357.1883544921875, + "loss": 0.501, + "rewards/accuracies": 0.625, + "rewards/chosen": 1.4770517349243164, + "rewards/margins": 3.54362154006958, + "rewards/rejected": -2.0665698051452637, + "step": 118 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 54.91445468793373, + "learning_rate": 4.98204907808407e-07, + "logits/chosen": -1.3273491859436035, + "logits/rejected": -1.1831560134887695, + "logps/chosen": -324.9310607910156, + "logps/rejected": -465.48211669921875, + "loss": 0.7541, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.7483316659927368, + "rewards/margins": 8.44149112701416, + "rewards/rejected": -6.6931586265563965, + "step": 120 + }, + { + "epoch": 0.391025641025641, + "grad_norm": 58.25367706617695, + "learning_rate": 4.978284850457668e-07, + "logits/chosen": -1.299121379852295, + "logits/rejected": -1.1402654647827148, + "logps/chosen": -319.9881591796875, + "logps/rejected": -553.1646118164062, + "loss": 0.4806, + "rewards/accuracies": 0.75, + "rewards/chosen": 2.202224016189575, + "rewards/margins": 11.036829948425293, + "rewards/rejected": -8.834606170654297, + "step": 122 + }, + { + "epoch": 0.3974358974358974, + "grad_norm": 55.556131668959004, + "learning_rate": 4.974164300384997e-07, + "logits/chosen": -1.161794900894165, + "logits/rejected": -1.124051809310913, + "logps/chosen": -294.56402587890625, + "logps/rejected": -394.50701904296875, + "loss": 0.5664, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.5111699104309082, + "rewards/margins": 6.14288330078125, + "rewards/rejected": -4.631712913513184, + "step": 124 + }, + { + "epoch": 0.40384615384615385, + "grad_norm": 59.11921291398244, + "learning_rate": 4.969688020309852e-07, + "logits/chosen": -1.1417524814605713, + "logits/rejected": -1.0565614700317383, + "logps/chosen": -300.486083984375, + "logps/rejected": -535.545166015625, + "loss": 0.4642, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.6234948635101318, + "rewards/margins": 12.211280822753906, + "rewards/rejected": -10.587785720825195, + "step": 126 + }, + { + "epoch": 0.41025641025641024, + "grad_norm": 44.2085778252882, + "learning_rate": 4.964856653822122e-07, + "logits/chosen": -1.213273525238037, + "logits/rejected": -1.188025951385498, + "logps/chosen": -332.03704833984375, + "logps/rejected": -440.1702880859375, + "loss": 0.5238, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.22303307056427, + "rewards/margins": 4.220616340637207, + "rewards/rejected": -2.997584104537964, + "step": 128 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 48.46961080121887, + "learning_rate": 4.959670895565248e-07, + "logits/chosen": -1.0044407844543457, + "logits/rejected": -0.9687132835388184, + "logps/chosen": -256.96185302734375, + "logps/rejected": -445.46697998046875, + "loss": 0.5512, + "rewards/accuracies": 0.6875, + "rewards/chosen": 2.041879892349243, + "rewards/margins": 10.02515697479248, + "rewards/rejected": -7.983275890350342, + "step": 130 + }, + { + "epoch": 0.4230769230769231, + "grad_norm": 52.52054201300236, + "learning_rate": 4.954131491136361e-07, + "logits/chosen": -1.3613969087600708, + "logits/rejected": -1.2766222953796387, + "logps/chosen": -291.88555908203125, + "logps/rejected": -362.47174072265625, + "loss": 0.5254, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.576281189918518, + "rewards/margins": 4.421535968780518, + "rewards/rejected": -2.845255136489868, + "step": 132 + }, + { + "epoch": 0.42948717948717946, + "grad_norm": 54.060098375685975, + "learning_rate": 4.948239236979073e-07, + "logits/chosen": -1.3446414470672607, + "logits/rejected": -1.358689546585083, + "logps/chosen": -286.98553466796875, + "logps/rejected": -345.2357177734375, + "loss": 0.4854, + "rewards/accuracies": 0.875, + "rewards/chosen": 2.9822263717651367, + "rewards/margins": 1.3513668775558472, + "rewards/rejected": 1.630859375, + "step": 134 + }, + { + "epoch": 0.4358974358974359, + "grad_norm": 53.859839745357036, + "learning_rate": 4.941994980268966e-07, + "logits/chosen": -1.180131435394287, + "logits/rejected": -1.1889455318450928, + "logps/chosen": -419.29571533203125, + "logps/rejected": -485.98516845703125, + "loss": 0.5619, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.3205814361572266, + "rewards/margins": 2.3364877700805664, + "rewards/rejected": -1.015906572341919, + "step": 136 + }, + { + "epoch": 0.4423076923076923, + "grad_norm": 62.12311667039662, + "learning_rate": 4.935399618791793e-07, + "logits/chosen": -1.1390939950942993, + "logits/rejected": -1.1100345849990845, + "logps/chosen": -335.9249267578125, + "logps/rejected": -442.02496337890625, + "loss": 0.6157, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.7030931711196899, + "rewards/margins": 6.646831512451172, + "rewards/rejected": -5.943737983703613, + "step": 138 + }, + { + "epoch": 0.44871794871794873, + "grad_norm": 54.28848759916782, + "learning_rate": 4.92845410081439e-07, + "logits/chosen": -1.2480337619781494, + "logits/rejected": -1.2012183666229248, + "logps/chosen": -362.7234191894531, + "logps/rejected": -497.97259521484375, + "loss": 0.4837, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.098800539970398, + "rewards/margins": 3.53757643699646, + "rewards/rejected": -2.4387755393981934, + "step": 140 + }, + { + "epoch": 0.4551282051282051, + "grad_norm": 54.67683660342895, + "learning_rate": 4.921159424948339e-07, + "logits/chosen": -1.2331373691558838, + "logits/rejected": -1.0444371700286865, + "logps/chosen": -223.99148559570312, + "logps/rejected": -548.15283203125, + "loss": 0.4669, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.8411800861358643, + "rewards/margins": 18.91892433166504, + "rewards/rejected": -17.07774543762207, + "step": 142 + }, + { + "epoch": 0.46153846153846156, + "grad_norm": 63.67915206511405, + "learning_rate": 4.91351664000639e-07, + "logits/chosen": -1.1953372955322266, + "logits/rejected": -1.1508889198303223, + "logps/chosen": -358.0416259765625, + "logps/rejected": -377.67059326171875, + "loss": 0.5963, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.9016644954681396, + "rewards/margins": 0.9963706731796265, + "rewards/rejected": 0.9052937626838684, + "step": 144 + }, + { + "epoch": 0.46794871794871795, + "grad_norm": 61.646794241160514, + "learning_rate": 4.905526844851666e-07, + "logits/chosen": -1.0901618003845215, + "logits/rejected": -1.0123677253723145, + "logps/chosen": -306.400146484375, + "logps/rejected": -437.4715576171875, + "loss": 0.5185, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.6404218673706055, + "rewards/margins": 3.7662460803985596, + "rewards/rejected": -3.125823974609375, + "step": 146 + }, + { + "epoch": 0.47435897435897434, + "grad_norm": 58.35814936994193, + "learning_rate": 4.897191188239667e-07, + "logits/chosen": -1.2903209924697876, + "logits/rejected": -1.175422191619873, + "logps/chosen": -342.691162109375, + "logps/rejected": -402.43377685546875, + "loss": 0.5536, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.6270394325256348, + "rewards/margins": 2.5350823402404785, + "rewards/rejected": -1.9080431461334229, + "step": 148 + }, + { + "epoch": 0.4807692307692308, + "grad_norm": 51.898035755372625, + "learning_rate": 4.888510868653107e-07, + "logits/chosen": -1.4782743453979492, + "logits/rejected": -1.3963589668273926, + "logps/chosen": -332.8338623046875, + "logps/rejected": -371.3902587890625, + "loss": 0.4178, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.5780057907104492, + "rewards/margins": 4.4161787033081055, + "rewards/rejected": -2.8381733894348145, + "step": 150 + }, + { + "epoch": 0.48717948717948717, + "grad_norm": 46.52156825890617, + "learning_rate": 4.879487134129599e-07, + "logits/chosen": -1.2283132076263428, + "logits/rejected": -1.1541554927825928, + "logps/chosen": -311.4658203125, + "logps/rejected": -475.25042724609375, + "loss": 0.5014, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.6838963031768799, + "rewards/margins": 14.21804141998291, + "rewards/rejected": -12.534146308898926, + "step": 152 + }, + { + "epoch": 0.4935897435897436, + "grad_norm": 51.39314526001204, + "learning_rate": 4.870121282082217e-07, + "logits/chosen": -1.3298598527908325, + "logits/rejected": -1.294224739074707, + "logps/chosen": -348.53387451171875, + "logps/rejected": -467.8633117675781, + "loss": 0.5807, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.7962983846664429, + "rewards/margins": 5.248057842254639, + "rewards/rejected": -4.451759338378906, + "step": 154 + }, + { + "epoch": 0.5, + "grad_norm": 47.340182198570666, + "learning_rate": 4.860414659112948e-07, + "logits/chosen": -1.1741009950637817, + "logits/rejected": -1.1159331798553467, + "logps/chosen": -349.2378845214844, + "logps/rejected": -463.61328125, + "loss": 0.5365, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.8840088844299316, + "rewards/margins": 10.934059143066406, + "rewards/rejected": -9.050048828125, + "step": 156 + }, + { + "epoch": 0.5064102564102564, + "grad_norm": 40.65853642523701, + "learning_rate": 4.850368660819092e-07, + "logits/chosen": -1.2367990016937256, + "logits/rejected": -1.1518332958221436, + "logps/chosen": -390.1644287109375, + "logps/rejected": -484.31890869140625, + "loss": 0.49, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7704789638519287, + "rewards/margins": 5.575088024139404, + "rewards/rejected": -3.8046092987060547, + "step": 158 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 48.96118095050601, + "learning_rate": 4.8399847315926e-07, + "logits/chosen": -1.2842177152633667, + "logits/rejected": -1.2662012577056885, + "logps/chosen": -308.41668701171875, + "logps/rejected": -404.96453857421875, + "loss": 0.522, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.9070611596107483, + "rewards/margins": 3.2024736404418945, + "rewards/rejected": -2.295412540435791, + "step": 160 + }, + { + "epoch": 0.5192307692307693, + "grad_norm": 43.246139480477396, + "learning_rate": 4.829264364412399e-07, + "logits/chosen": -1.103914737701416, + "logits/rejected": -1.1134490966796875, + "logps/chosen": -338.3736572265625, + "logps/rejected": -469.35406494140625, + "loss": 0.4839, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.8931735754013062, + "rewards/margins": 7.317771911621094, + "rewards/rejected": -5.4245991706848145, + "step": 162 + }, + { + "epoch": 0.5256410256410257, + "grad_norm": 39.01449878204908, + "learning_rate": 4.818209100629744e-07, + "logits/chosen": -1.2873077392578125, + "logits/rejected": -1.2998539209365845, + "logps/chosen": -294.57171630859375, + "logps/rejected": -475.5189208984375, + "loss": 0.4554, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5997680425643921, + "rewards/margins": 10.845602035522461, + "rewards/rejected": -10.245833396911621, + "step": 164 + }, + { + "epoch": 0.532051282051282, + "grad_norm": 216.75939668719425, + "learning_rate": 4.806820529746598e-07, + "logits/chosen": -1.198462724685669, + "logits/rejected": -1.1168766021728516, + "logps/chosen": -378.0664978027344, + "logps/rejected": -468.2367248535156, + "loss": 1.2182, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.0289223194122314, + "rewards/margins": 2.2482898235321045, + "rewards/rejected": -1.2193677425384521, + "step": 166 + }, + { + "epoch": 0.5384615384615384, + "grad_norm": 40.84109251300859, + "learning_rate": 4.795100289187098e-07, + "logits/chosen": -1.215508222579956, + "logits/rejected": -1.180190920829773, + "logps/chosen": -284.1643371582031, + "logps/rejected": -440.7747802734375, + "loss": 0.4631, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.7360546588897705, + "rewards/margins": 7.168399333953857, + "rewards/rejected": -5.432344913482666, + "step": 168 + }, + { + "epoch": 0.5448717948717948, + "grad_norm": 46.58046107250321, + "learning_rate": 4.783050064062134e-07, + "logits/chosen": -1.1620948314666748, + "logits/rejected": -1.0842164754867554, + "logps/chosen": -308.44415283203125, + "logps/rejected": -333.0160217285156, + "loss": 0.4811, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7258564233779907, + "rewards/margins": 3.4721713066101074, + "rewards/rejected": -2.746314525604248, + "step": 170 + }, + { + "epoch": 0.5512820512820513, + "grad_norm": 51.94300870725352, + "learning_rate": 4.770671586927063e-07, + "logits/chosen": -1.2438768148422241, + "logits/rejected": -1.2915990352630615, + "logps/chosen": -389.8465576171875, + "logps/rejected": -469.39202880859375, + "loss": 0.4802, + "rewards/accuracies": 0.625, + "rewards/chosen": 1.9014434814453125, + "rewards/margins": 4.070134162902832, + "rewards/rejected": -2.1686909198760986, + "step": 172 + }, + { + "epoch": 0.5576923076923077, + "grad_norm": 46.990777089734884, + "learning_rate": 4.7579666375326087e-07, + "logits/chosen": -1.1769006252288818, + "logits/rejected": -1.0622663497924805, + "logps/chosen": -325.16204833984375, + "logps/rejected": -553.401123046875, + "loss": 0.4059, + "rewards/accuracies": 0.9375, + "rewards/chosen": 2.232010841369629, + "rewards/margins": 11.683952331542969, + "rewards/rejected": -9.451940536499023, + "step": 174 + }, + { + "epoch": 0.5641025641025641, + "grad_norm": 79.70857719899739, + "learning_rate": 4.7449370425689694e-07, + "logits/chosen": -1.1880089044570923, + "logits/rejected": -1.2385491132736206, + "logps/chosen": -335.0114440917969, + "logps/rejected": -345.5968322753906, + "loss": 0.6178, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.2661206722259521, + "rewards/margins": 0.7478514909744263, + "rewards/rejected": 0.5182691216468811, + "step": 176 + }, + { + "epoch": 0.5705128205128205, + "grad_norm": 41.55608787003336, + "learning_rate": 4.731584675403184e-07, + "logits/chosen": -1.1960301399230957, + "logits/rejected": -1.1504721641540527, + "logps/chosen": -312.40185546875, + "logps/rejected": -475.05023193359375, + "loss": 0.4143, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.1944634914398193, + "rewards/margins": 6.927181243896484, + "rewards/rejected": -5.732717990875244, + "step": 178 + }, + { + "epoch": 0.5769230769230769, + "grad_norm": 71.1625121877707, + "learning_rate": 4.7179114558097814e-07, + "logits/chosen": -1.228834629058838, + "logits/rejected": -1.179775357246399, + "logps/chosen": -325.7759704589844, + "logps/rejected": -386.529296875, + "loss": 0.5992, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.4810247421264648, + "rewards/margins": 4.664861679077148, + "rewards/rejected": -3.1838366985321045, + "step": 180 + }, + { + "epoch": 0.5833333333333334, + "grad_norm": 45.10395458327989, + "learning_rate": 4.7039193496947616e-07, + "logits/chosen": -1.2397207021713257, + "logits/rejected": -1.1701984405517578, + "logps/chosen": -397.3980407714844, + "logps/rejected": -437.7143249511719, + "loss": 0.5121, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.743217408657074, + "rewards/margins": 3.8061106204986572, + "rewards/rejected": -3.0628929138183594, + "step": 182 + }, + { + "epoch": 0.5897435897435898, + "grad_norm": 46.60281934559473, + "learning_rate": 4.6896103688129377e-07, + "logits/chosen": -1.2048474550247192, + "logits/rejected": -1.200062870979309, + "logps/chosen": -295.4891357421875, + "logps/rejected": -335.4971923828125, + "loss": 0.5133, + "rewards/accuracies": 0.625, + "rewards/chosen": 1.0741400718688965, + "rewards/margins": 2.5473904609680176, + "rewards/rejected": -1.4732500314712524, + "step": 184 + }, + { + "epoch": 0.5961538461538461, + "grad_norm": 43.0203100954839, + "learning_rate": 4.674986570478695e-07, + "logits/chosen": -1.2524969577789307, + "logits/rejected": -1.223731279373169, + "logps/chosen": -280.24212646484375, + "logps/rejected": -460.49853515625, + "loss": 0.4253, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.2648104429244995, + "rewards/margins": 7.019309043884277, + "rewards/rejected": -5.754498481750488, + "step": 186 + }, + { + "epoch": 0.6025641025641025, + "grad_norm": 52.27836466870514, + "learning_rate": 4.660050057270191e-07, + "logits/chosen": -1.2979581356048584, + "logits/rejected": -1.3155229091644287, + "logps/chosen": -247.7891387939453, + "logps/rejected": -322.99786376953125, + "loss": 0.5666, + "rewards/accuracies": 0.5, + "rewards/chosen": 1.105317234992981, + "rewards/margins": 5.087085723876953, + "rewards/rejected": -3.9817683696746826, + "step": 188 + }, + { + "epoch": 0.6089743589743589, + "grad_norm": 47.8284661077608, + "learning_rate": 4.644802976727053e-07, + "logits/chosen": -1.3539597988128662, + "logits/rejected": -1.2253767251968384, + "logps/chosen": -253.56301879882812, + "logps/rejected": -388.1710205078125, + "loss": 0.4415, + "rewards/accuracies": 0.625, + "rewards/chosen": 1.8682527542114258, + "rewards/margins": 6.8694167137146, + "rewards/rejected": -5.001164436340332, + "step": 190 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 51.5879390444526, + "learning_rate": 4.6292475210416107e-07, + "logits/chosen": -1.363115906715393, + "logits/rejected": -1.3582110404968262, + "logps/chosen": -299.76873779296875, + "logps/rejected": -372.174560546875, + "loss": 0.5143, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.8180210590362549, + "rewards/margins": 1.5533020496368408, + "rewards/rejected": -0.7352810502052307, + "step": 192 + }, + { + "epoch": 0.6217948717948718, + "grad_norm": 48.03451594805343, + "learning_rate": 4.6133859267437047e-07, + "logits/chosen": -1.305558681488037, + "logits/rejected": -1.34196138381958, + "logps/chosen": -346.08489990234375, + "logps/rejected": -435.93231201171875, + "loss": 0.4272, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.9088157415390015, + "rewards/margins": 1.103931188583374, + "rewards/rejected": -0.19511568546295166, + "step": 194 + }, + { + "epoch": 0.6282051282051282, + "grad_norm": 43.1232396006547, + "learning_rate": 4.597220474379125e-07, + "logits/chosen": -1.3772120475769043, + "logits/rejected": -1.285635232925415, + "logps/chosen": -286.9246826171875, + "logps/rejected": -387.873291015625, + "loss": 0.4635, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.7378672361373901, + "rewards/margins": 1.8036994934082031, + "rewards/rejected": -1.0658321380615234, + "step": 196 + }, + { + "epoch": 0.6346153846153846, + "grad_norm": 41.03425880219558, + "learning_rate": 4.5807534881817183e-07, + "logits/chosen": -1.1889774799346924, + "logits/rejected": -1.2006278038024902, + "logps/chosen": -259.9949951171875, + "logps/rejected": -245.51040649414062, + "loss": 0.4392, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.8805737495422363, + "rewards/margins": 3.397979736328125, + "rewards/rejected": -1.5174059867858887, + "step": 198 + }, + { + "epoch": 0.6410256410256411, + "grad_norm": 51.35919090882913, + "learning_rate": 4.5639873357392157e-07, + "logits/chosen": -1.0301035642623901, + "logits/rejected": -0.9902200698852539, + "logps/chosen": -252.16293334960938, + "logps/rejected": -334.22283935546875, + "loss": 0.5029, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.6112852096557617, + "rewards/margins": 5.990167140960693, + "rewards/rejected": -4.378881931304932, + "step": 200 + }, + { + "epoch": 0.6410256410256411, + "eval_logits/chosen": -0.8330116868019104, + "eval_logits/rejected": -0.8191525340080261, + "eval_logps/chosen": -449.2957458496094, + "eval_logps/rejected": -489.9350280761719, + "eval_loss": 0.7941520810127258, + "eval_rewards/accuracies": 0.5384615659713745, + "eval_rewards/chosen": 0.4810504913330078, + "eval_rewards/margins": 4.821780204772949, + "eval_rewards/rejected": -4.340729236602783, + "eval_runtime": 16.8856, + "eval_samples_per_second": 5.922, + "eval_steps_per_second": 0.77, + "step": 200 + }, + { + "epoch": 0.6474358974358975, + "grad_norm": 57.2508551264232, + "learning_rate": 4.546924427652824e-07, + "logits/chosen": -1.179105281829834, + "logits/rejected": -1.2150788307189941, + "logps/chosen": -283.37115478515625, + "logps/rejected": -249.56002807617188, + "loss": 0.4656, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6425219178199768, + "rewards/margins": 1.205476999282837, + "rewards/rejected": -1.847999095916748, + "step": 202 + }, + { + "epoch": 0.6538461538461539, + "grad_norm": 42.1698810303972, + "learning_rate": 4.5295672171906365e-07, + "logits/chosen": -1.2583081722259521, + "logits/rejected": -1.2646164894104004, + "logps/chosen": -336.4263610839844, + "logps/rejected": -385.14324951171875, + "loss": 0.4783, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8167861700057983, + "rewards/margins": 1.7936593294143677, + "rewards/rejected": -0.9768733382225037, + "step": 204 + }, + { + "epoch": 0.6602564102564102, + "grad_norm": 38.54788269974552, + "learning_rate": 4.5119181999349065e-07, + "logits/chosen": -0.9713093042373657, + "logits/rejected": -0.9149163961410522, + "logps/chosen": -276.9900207519531, + "logps/rejected": -487.7098693847656, + "loss": 0.4595, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.417994737625122, + "rewards/margins": 10.213150978088379, + "rewards/rejected": -8.79515552520752, + "step": 206 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 48.23696788049309, + "learning_rate": 4.493979913423239e-07, + "logits/chosen": -1.167783260345459, + "logits/rejected": -1.1681760549545288, + "logps/chosen": -233.1728057861328, + "logps/rejected": -265.2602844238281, + "loss": 0.5401, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.7662079334259033, + "rewards/margins": 1.4114222526550293, + "rewards/rejected": 0.35478541254997253, + "step": 208 + }, + { + "epoch": 0.6730769230769231, + "grad_norm": 186.5227256498083, + "learning_rate": 4.4757549367837487e-07, + "logits/chosen": -1.1317946910858154, + "logits/rejected": -0.9495127201080322, + "logps/chosen": -400.63983154296875, + "logps/rejected": -557.700927734375, + "loss": 0.8193, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.180715084075928, + "rewards/margins": 9.595314979553223, + "rewards/rejected": -14.776029586791992, + "step": 210 + }, + { + "epoch": 0.6794871794871795, + "grad_norm": 42.36514995109615, + "learning_rate": 4.457245890364235e-07, + "logits/chosen": -1.2297334671020508, + "logits/rejected": -1.2197715044021606, + "logps/chosen": -301.80572509765625, + "logps/rejected": -400.6791687011719, + "loss": 0.4533, + "rewards/accuracies": 0.625, + "rewards/chosen": 1.1558496952056885, + "rewards/margins": 5.097892761230469, + "rewards/rejected": -3.9420433044433594, + "step": 212 + }, + { + "epoch": 0.6858974358974359, + "grad_norm": 37.44158378069016, + "learning_rate": 4.438455435355442e-07, + "logits/chosen": -0.8933911919593811, + "logits/rejected": -0.8581579923629761, + "logps/chosen": -329.072265625, + "logps/rejected": -345.00567626953125, + "loss": 0.5023, + "rewards/accuracies": 0.625, + "rewards/chosen": 1.1225879192352295, + "rewards/margins": 0.840599775314331, + "rewards/rejected": 0.2819880545139313, + "step": 214 + }, + { + "epoch": 0.6923076923076923, + "grad_norm": 33.73559534533908, + "learning_rate": 4.4193862734084277e-07, + "logits/chosen": -1.2856342792510986, + "logits/rejected": -1.2485191822052002, + "logps/chosen": -350.41644287109375, + "logps/rejected": -360.15631103515625, + "loss": 0.3993, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.281527042388916, + "rewards/margins": 2.0371432304382324, + "rewards/rejected": 0.24438396096229553, + "step": 216 + }, + { + "epoch": 0.6987179487179487, + "grad_norm": 41.68548258582068, + "learning_rate": 4.400041146246136e-07, + "logits/chosen": -1.2767317295074463, + "logits/rejected": -1.2496519088745117, + "logps/chosen": -347.34368896484375, + "logps/rejected": -388.32708740234375, + "loss": 0.4453, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.8583331108093262, + "rewards/margins": 4.0854620933532715, + "rewards/rejected": -2.2271294593811035, + "step": 218 + }, + { + "epoch": 0.7051282051282052, + "grad_norm": 51.697457108558865, + "learning_rate": 4.380422835269193e-07, + "logits/chosen": -1.3155988454818726, + "logits/rejected": -1.272523045539856, + "logps/chosen": -352.7498779296875, + "logps/rejected": -451.03948974609375, + "loss": 0.4957, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.475347876548767, + "rewards/margins": 2.454881191253662, + "rewards/rejected": -0.9795333743095398, + "step": 220 + }, + { + "epoch": 0.7115384615384616, + "grad_norm": 99.87392091649696, + "learning_rate": 4.360534161156003e-07, + "logits/chosen": -1.1554548740386963, + "logits/rejected": -1.1182796955108643, + "logps/chosen": -374.2333679199219, + "logps/rejected": -444.99700927734375, + "loss": 0.6808, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.0857256650924683, + "rewards/margins": 7.018498420715332, + "rewards/rejected": -5.932773113250732, + "step": 222 + }, + { + "epoch": 0.717948717948718, + "grad_norm": 57.77103784560175, + "learning_rate": 4.3403779834572e-07, + "logits/chosen": -1.3060379028320312, + "logits/rejected": -1.136815071105957, + "logps/chosen": -237.18177795410156, + "logps/rejected": -423.6202392578125, + "loss": 0.4473, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.2666206359863281, + "rewards/margins": 8.948336601257324, + "rewards/rejected": -7.681715965270996, + "step": 224 + }, + { + "epoch": 0.7243589743589743, + "grad_norm": 57.35757807250867, + "learning_rate": 4.3199572001845043e-07, + "logits/chosen": -1.4666709899902344, + "logits/rejected": -1.327986240386963, + "logps/chosen": -348.210693359375, + "logps/rejected": -449.1205139160156, + "loss": 0.5169, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.1110727787017822, + "rewards/margins": 9.026819229125977, + "rewards/rejected": -7.915746688842773, + "step": 226 + }, + { + "epoch": 0.7307692307692307, + "grad_norm": 48.08399963307796, + "learning_rate": 4.299274747394055e-07, + "logits/chosen": -1.062417984008789, + "logits/rejected": -1.0438898801803589, + "logps/chosen": -312.5767822265625, + "logps/rejected": -444.9324035644531, + "loss": 0.4905, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.892605185508728, + "rewards/margins": 5.7781081199646, + "rewards/rejected": -4.885502815246582, + "step": 228 + }, + { + "epoch": 0.7371794871794872, + "grad_norm": 68.70986151380923, + "learning_rate": 4.278333598764271e-07, + "logits/chosen": -1.2313703298568726, + "logits/rejected": -1.1486096382141113, + "logps/chosen": -306.9612121582031, + "logps/rejected": -402.64892578125, + "loss": 0.5344, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5822381377220154, + "rewards/margins": 3.605410099029541, + "rewards/rejected": -3.023171901702881, + "step": 230 + }, + { + "epoch": 0.7435897435897436, + "grad_norm": 53.50931679712491, + "learning_rate": 4.257136765168299e-07, + "logits/chosen": -1.20849609375, + "logits/rejected": -1.153712272644043, + "logps/chosen": -341.30194091796875, + "logps/rejected": -498.90576171875, + "loss": 0.5337, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.291115403175354, + "rewards/margins": 5.4868340492248535, + "rewards/rejected": -4.195718765258789, + "step": 232 + }, + { + "epoch": 0.75, + "grad_norm": 38.70138934757289, + "learning_rate": 4.235687294241119e-07, + "logits/chosen": -1.0791088342666626, + "logits/rejected": -1.0456739664077759, + "logps/chosen": -347.1581726074219, + "logps/rejected": -530.249755859375, + "loss": 0.399, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.560847520828247, + "rewards/margins": 10.248472213745117, + "rewards/rejected": -8.68762493133545, + "step": 234 + }, + { + "epoch": 0.7564102564102564, + "grad_norm": 29.74889356687727, + "learning_rate": 4.2139882699413613e-07, + "logits/chosen": -1.2497999668121338, + "logits/rejected": -1.2147784233093262, + "logps/chosen": -400.81439208984375, + "logps/rejected": -478.99176025390625, + "loss": 0.4291, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.4088551998138428, + "rewards/margins": 5.4775800704956055, + "rewards/rejected": -4.068724632263184, + "step": 236 + }, + { + "epoch": 0.7628205128205128, + "grad_norm": 40.33274202001432, + "learning_rate": 4.1920428121079e-07, + "logits/chosen": -1.1209681034088135, + "logits/rejected": -1.0711822509765625, + "logps/chosen": -277.2111511230469, + "logps/rejected": -529.2734375, + "loss": 0.4629, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.4290432929992676, + "rewards/margins": 10.097843170166016, + "rewards/rejected": -8.66879940032959, + "step": 238 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 35.074202094554686, + "learning_rate": 4.169854076011292e-07, + "logits/chosen": -1.4265623092651367, + "logits/rejected": -1.3700249195098877, + "logps/chosen": -278.44677734375, + "logps/rejected": -357.0823669433594, + "loss": 0.4875, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.688441514968872, + "rewards/margins": 4.847907543182373, + "rewards/rejected": -3.15946626663208, + "step": 240 + }, + { + "epoch": 0.7756410256410257, + "grad_norm": 40.05052425648403, + "learning_rate": 4.1474252519001185e-07, + "logits/chosen": -1.2000222206115723, + "logits/rejected": -1.1607084274291992, + "logps/chosen": -263.72052001953125, + "logps/rejected": -368.5860595703125, + "loss": 0.4883, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.2200478315353394, + "rewards/margins": 4.447844505310059, + "rewards/rejected": -3.227797508239746, + "step": 242 + }, + { + "epoch": 0.782051282051282, + "grad_norm": 47.99182219689601, + "learning_rate": 4.124759564542295e-07, + "logits/chosen": -1.36995530128479, + "logits/rejected": -1.187804102897644, + "logps/chosen": -336.81610107421875, + "logps/rejected": -444.2264099121094, + "loss": 0.4799, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4630799889564514, + "rewards/margins": 8.040542602539062, + "rewards/rejected": -7.577462673187256, + "step": 244 + }, + { + "epoch": 0.7884615384615384, + "grad_norm": 63.043919678845825, + "learning_rate": 4.101860272761426e-07, + "logits/chosen": -1.0336024761199951, + "logits/rejected": -1.0574252605438232, + "logps/chosen": -364.8282165527344, + "logps/rejected": -400.51556396484375, + "loss": 0.557, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.7366421222686768, + "rewards/margins": 0.019814223051071167, + "rewards/rejected": 0.7168278694152832, + "step": 246 + }, + { + "epoch": 0.7948717948717948, + "grad_norm": 38.10178559602611, + "learning_rate": 4.078730668968252e-07, + "logits/chosen": -1.2227962017059326, + "logits/rejected": -1.1656228303909302, + "logps/chosen": -366.4295654296875, + "logps/rejected": -450.58282470703125, + "loss": 0.4574, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.778436303138733, + "rewards/margins": 6.253500938415527, + "rewards/rejected": -4.475064277648926, + "step": 248 + }, + { + "epoch": 0.8012820512820513, + "grad_norm": 69.97512445870161, + "learning_rate": 4.055374078687282e-07, + "logits/chosen": -1.082871437072754, + "logits/rejected": -0.9831995368003845, + "logps/chosen": -295.03143310546875, + "logps/rejected": -583.1727294921875, + "loss": 0.5091, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5504722595214844, + "rewards/margins": 11.467432975769043, + "rewards/rejected": -10.916959762573242, + "step": 250 + }, + { + "epoch": 0.8076923076923077, + "grad_norm": 162.78667829256125, + "learning_rate": 4.0317938600786484e-07, + "logits/chosen": -1.135309100151062, + "logits/rejected": -1.0801632404327393, + "logps/chosen": -281.5797424316406, + "logps/rejected": -390.37554931640625, + "loss": 0.5132, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2766234874725342, + "rewards/margins": 6.887472629547119, + "rewards/rejected": -8.164095878601074, + "step": 252 + }, + { + "epoch": 0.8141025641025641, + "grad_norm": 40.14628374676188, + "learning_rate": 4.0079934034552843e-07, + "logits/chosen": -1.2007012367248535, + "logits/rejected": -1.0240919589996338, + "logps/chosen": -327.4549560546875, + "logps/rejected": -427.1672058105469, + "loss": 0.4855, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.9479133486747742, + "rewards/margins": 6.302669525146484, + "rewards/rejected": -5.354755401611328, + "step": 254 + }, + { + "epoch": 0.8205128205128205, + "grad_norm": 49.52748937789625, + "learning_rate": 3.983976130795467e-07, + "logits/chosen": -0.9544306397438049, + "logits/rejected": -0.8206841945648193, + "logps/chosen": -367.2049560546875, + "logps/rejected": -570.1994018554688, + "loss": 0.5036, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.1424559354782104, + "rewards/margins": 13.830140113830566, + "rewards/rejected": -12.68768310546875, + "step": 256 + }, + { + "epoch": 0.8269230769230769, + "grad_norm": 43.4163446151237, + "learning_rate": 3.959745495250818e-07, + "logits/chosen": -1.3475706577301025, + "logits/rejected": -1.2560746669769287, + "logps/chosen": -285.245849609375, + "logps/rejected": -497.93670654296875, + "loss": 0.394, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.2378212213516235, + "rewards/margins": 9.36059284210205, + "rewards/rejected": -8.122771263122559, + "step": 258 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 73.8757445310392, + "learning_rate": 3.935304980649813e-07, + "logits/chosen": -1.1998732089996338, + "logits/rejected": -1.1121838092803955, + "logps/chosen": -319.75421142578125, + "logps/rejected": -405.37738037109375, + "loss": 0.553, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.1936397552490234, + "rewards/margins": 6.627713203430176, + "rewards/rejected": -5.4340739250183105, + "step": 260 + }, + { + "epoch": 0.8397435897435898, + "grad_norm": 39.0321986974153, + "learning_rate": 3.9106581009968833e-07, + "logits/chosen": -1.2883763313293457, + "logits/rejected": -1.2416361570358276, + "logps/chosen": -327.863037109375, + "logps/rejected": -415.50421142578125, + "loss": 0.4384, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.5670347213745117, + "rewards/margins": 1.3804373741149902, + "rewards/rejected": 0.18659722805023193, + "step": 262 + }, + { + "epoch": 0.8461538461538461, + "grad_norm": 37.173255869527146, + "learning_rate": 3.885808399967185e-07, + "logits/chosen": -1.3988786935806274, + "logits/rejected": -1.3844860792160034, + "logps/chosen": -415.4576416015625, + "logps/rejected": -434.8994140625, + "loss": 0.3855, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.2823258638381958, + "rewards/margins": 1.6761102676391602, + "rewards/rejected": -0.39378437399864197, + "step": 264 + }, + { + "epoch": 0.8525641025641025, + "grad_norm": 61.46738933307515, + "learning_rate": 3.8607594503970926e-07, + "logits/chosen": -1.0272738933563232, + "logits/rejected": -0.990746021270752, + "logps/chosen": -274.5499267578125, + "logps/rejected": -338.88360595703125, + "loss": 0.5373, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.2696378231048584, + "rewards/margins": 3.3679614067077637, + "rewards/rejected": -2.0983235836029053, + "step": 266 + }, + { + "epoch": 0.8589743589743589, + "grad_norm": 41.171512816946695, + "learning_rate": 3.8355148537705047e-07, + "logits/chosen": -1.089869499206543, + "logits/rejected": -1.118644118309021, + "logps/chosen": -424.0999450683594, + "logps/rejected": -418.88470458984375, + "loss": 0.4486, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.2740938663482666, + "rewards/margins": 3.08381986618042, + "rewards/rejected": -1.8097259998321533, + "step": 268 + }, + { + "epoch": 0.8653846153846154, + "grad_norm": 44.2947402342335, + "learning_rate": 3.810078239701032e-07, + "logits/chosen": -1.2524669170379639, + "logits/rejected": -1.2814539670944214, + "logps/chosen": -349.0198669433594, + "logps/rejected": -340.69561767578125, + "loss": 0.5365, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.6394474506378174, + "rewards/margins": 0.5701440572738647, + "rewards/rejected": 0.06930342316627502, + "step": 270 + }, + { + "epoch": 0.8717948717948718, + "grad_norm": 38.68538904924371, + "learning_rate": 3.78445326541014e-07, + "logits/chosen": -0.9664976596832275, + "logits/rejected": -0.9965358972549438, + "logps/chosen": -241.77255249023438, + "logps/rejected": -353.2283935546875, + "loss": 0.3547, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.460767149925232, + "rewards/margins": 8.431722640991211, + "rewards/rejected": -6.970954895019531, + "step": 272 + }, + { + "epoch": 0.8782051282051282, + "grad_norm": 39.922548889921686, + "learning_rate": 3.758643615201319e-07, + "logits/chosen": -1.0898842811584473, + "logits/rejected": -1.0922670364379883, + "logps/chosen": -276.4876708984375, + "logps/rejected": -423.4794006347656, + "loss": 0.3842, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8416784405708313, + "rewards/margins": 7.02686882019043, + "rewards/rejected": -6.185190200805664, + "step": 274 + }, + { + "epoch": 0.8846153846153846, + "grad_norm": 34.621715120082015, + "learning_rate": 3.7326529999303633e-07, + "logits/chosen": -1.0924513339996338, + "logits/rejected": -1.1039786338806152, + "logps/chosen": -360.8890380859375, + "logps/rejected": -414.1465148925781, + "loss": 0.4771, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.8111865520477295, + "rewards/margins": 4.153421878814697, + "rewards/rejected": -3.342235565185547, + "step": 276 + }, + { + "epoch": 0.8910256410256411, + "grad_norm": 52.14279289160271, + "learning_rate": 3.7064851564718353e-07, + "logits/chosen": -1.2747771739959717, + "logits/rejected": -1.1346726417541504, + "logps/chosen": -312.43463134765625, + "logps/rejected": -488.4071044921875, + "loss": 0.4599, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.7376309633255005, + "rewards/margins": 10.913593292236328, + "rewards/rejected": -10.175962448120117, + "step": 278 + }, + { + "epoch": 0.8974358974358975, + "grad_norm": 41.96313198375718, + "learning_rate": 3.6801438471817826e-07, + "logits/chosen": -1.0474282503128052, + "logits/rejected": -1.0190608501434326, + "logps/chosen": -259.101318359375, + "logps/rejected": -354.8406982421875, + "loss": 0.4132, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.7173309922218323, + "rewards/margins": 6.0161590576171875, + "rewards/rejected": -5.298828125, + "step": 280 + }, + { + "epoch": 0.9038461538461539, + "grad_norm": 43.17696149305258, + "learning_rate": 3.6536328593567943e-07, + "logits/chosen": -1.0989081859588623, + "logits/rejected": -1.0352321863174438, + "logps/chosen": -272.3264465332031, + "logps/rejected": -378.7626953125, + "loss": 0.4524, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7457245588302612, + "rewards/margins": 4.060770034790039, + "rewards/rejected": -3.315045118331909, + "step": 282 + }, + { + "epoch": 0.9102564102564102, + "grad_norm": 96.45436486468698, + "learning_rate": 3.626956004689476e-07, + "logits/chosen": -1.0897564888000488, + "logits/rejected": -1.0287668704986572, + "logps/chosen": -344.04449462890625, + "logps/rejected": -423.11126708984375, + "loss": 0.421, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.09141358733177185, + "rewards/margins": 4.10267448425293, + "rewards/rejected": -4.011261463165283, + "step": 284 + }, + { + "epoch": 0.9166666666666666, + "grad_norm": 41.686764484355685, + "learning_rate": 3.600117118720407e-07, + "logits/chosen": -1.1550977230072021, + "logits/rejected": -1.1425789594650269, + "logps/chosen": -279.7099609375, + "logps/rejected": -370.15618896484375, + "loss": 0.5009, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.6953952312469482, + "rewards/margins": 5.950080871582031, + "rewards/rejected": -4.254685401916504, + "step": 286 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 31.132209200677924, + "learning_rate": 3.5731200602866787e-07, + "logits/chosen": -1.3340139389038086, + "logits/rejected": -1.283268928527832, + "logps/chosen": -341.46246337890625, + "logps/rejected": -407.7066650390625, + "loss": 0.4074, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06659600138664246, + "rewards/margins": 2.822984457015991, + "rewards/rejected": -2.7563881874084473, + "step": 288 + }, + { + "epoch": 0.9294871794871795, + "grad_norm": 54.27334364101727, + "learning_rate": 3.545968710967078e-07, + "logits/chosen": -1.0896086692810059, + "logits/rejected": -1.0489652156829834, + "logps/chosen": -170.53924560546875, + "logps/rejected": -285.4649353027344, + "loss": 0.5121, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.838397741317749, + "rewards/margins": 4.948480606079102, + "rewards/rejected": -4.110082626342773, + "step": 290 + }, + { + "epoch": 0.9358974358974359, + "grad_norm": 76.59619006606502, + "learning_rate": 3.518666974524002e-07, + "logits/chosen": -1.2549858093261719, + "logits/rejected": -1.2538197040557861, + "logps/chosen": -334.96136474609375, + "logps/rejected": -434.1978759765625, + "loss": 0.5223, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.2638781070709229, + "rewards/margins": 4.786343574523926, + "rewards/rejected": -3.522465705871582, + "step": 292 + }, + { + "epoch": 0.9423076923076923, + "grad_norm": 133.78311730376316, + "learning_rate": 3.491218776342185e-07, + "logits/chosen": -1.2152208089828491, + "logits/rejected": -1.1914293766021729, + "logps/chosen": -374.551513671875, + "logps/rejected": -472.5165710449219, + "loss": 0.5035, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.3415427207946777, + "rewards/margins": 5.4076337814331055, + "rewards/rejected": -7.749176979064941, + "step": 294 + }, + { + "epoch": 0.9487179487179487, + "grad_norm": 40.552483645365314, + "learning_rate": 3.463628062864312e-07, + "logits/chosen": -1.0271166563034058, + "logits/rejected": -1.0384609699249268, + "logps/chosen": -343.7560729980469, + "logps/rejected": -459.90338134765625, + "loss": 0.4179, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.3456639051437378, + "rewards/margins": 7.615706443786621, + "rewards/rejected": -6.27004337310791, + "step": 296 + }, + { + "epoch": 0.9551282051282052, + "grad_norm": 47.2413814634478, + "learning_rate": 3.4358988010236103e-07, + "logits/chosen": -1.1437180042266846, + "logits/rejected": -1.1581155061721802, + "logps/chosen": -411.29815673828125, + "logps/rejected": -408.15777587890625, + "loss": 0.4472, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.3157517910003662, + "rewards/margins": 3.5424256324768066, + "rewards/rejected": -2.2266736030578613, + "step": 298 + }, + { + "epoch": 0.9615384615384616, + "grad_norm": 45.7725086599915, + "learning_rate": 3.4080349776734924e-07, + "logits/chosen": -1.3444430828094482, + "logits/rejected": -1.28860604763031, + "logps/chosen": -361.0887451171875, + "logps/rejected": -429.32000732421875, + "loss": 0.4061, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.5946578979492188, + "rewards/margins": 3.071869373321533, + "rewards/rejected": -1.4772114753723145, + "step": 300 + }, + { + "epoch": 0.9615384615384616, + "eval_logits/chosen": -0.8096151351928711, + "eval_logits/rejected": -0.7950038909912109, + "eval_logps/chosen": -451.3063659667969, + "eval_logps/rejected": -508.31512451171875, + "eval_loss": 0.7446603178977966, + "eval_rewards/accuracies": 0.5384615659713745, + "eval_rewards/chosen": 0.27998173236846924, + "eval_rewards/margins": 6.458725452423096, + "eval_rewards/rejected": -6.178744316101074, + "eval_runtime": 15.6681, + "eval_samples_per_second": 6.382, + "eval_steps_per_second": 0.83, + "step": 300 + }, + { + "epoch": 0.967948717948718, + "grad_norm": 40.99834778427273, + "learning_rate": 3.3800405990143347e-07, + "logits/chosen": -1.4369513988494873, + "logits/rejected": -1.2584969997406006, + "logps/chosen": -312.66259765625, + "logps/rejected": -421.14788818359375, + "loss": 0.4197, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.3124394416809082, + "rewards/margins": 4.349525451660156, + "rewards/rejected": -3.03708553314209, + "step": 302 + }, + { + "epoch": 0.9743589743589743, + "grad_norm": 60.50808207902098, + "learning_rate": 3.3519196900174724e-07, + "logits/chosen": -1.1461973190307617, + "logits/rejected": -1.1371371746063232, + "logps/chosen": -283.01593017578125, + "logps/rejected": -396.82647705078125, + "loss": 0.4483, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.740763783454895, + "rewards/margins": 5.299368858337402, + "rewards/rejected": -3.5586047172546387, + "step": 304 + }, + { + "epoch": 0.9807692307692307, + "grad_norm": 33.45693878022023, + "learning_rate": 3.3236762938465003e-07, + "logits/chosen": -1.1708471775054932, + "logits/rejected": -1.147695541381836, + "logps/chosen": -321.81402587890625, + "logps/rejected": -411.64605712890625, + "loss": 0.3723, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.9521249532699585, + "rewards/margins": 2.2914817333221436, + "rewards/rejected": -1.3393568992614746, + "step": 306 + }, + { + "epoch": 0.9871794871794872, + "grad_norm": 42.812956948839926, + "learning_rate": 3.2953144712759537e-07, + "logits/chosen": -1.2811245918273926, + "logits/rejected": -1.2404944896697998, + "logps/chosen": -341.8775329589844, + "logps/rejected": -527.7686767578125, + "loss": 0.4197, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.17797891795635223, + "rewards/margins": 10.833751678466797, + "rewards/rejected": -10.65577220916748, + "step": 308 + }, + { + "epoch": 0.9935897435897436, + "grad_norm": 52.96225977983095, + "learning_rate": 3.266838300107464e-07, + "logits/chosen": -1.3686065673828125, + "logits/rejected": -1.4041410684585571, + "logps/chosen": -291.25531005859375, + "logps/rejected": -328.734130859375, + "loss": 0.4868, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.1023082733154297, + "rewards/margins": 1.7836296558380127, + "rewards/rejected": -0.6813213229179382, + "step": 310 + }, + { + "epoch": 1.0, + "grad_norm": 43.66474290909873, + "learning_rate": 3.2382518745834516e-07, + "logits/chosen": -1.2051764726638794, + "logits/rejected": -1.0639588832855225, + "logps/chosen": -289.6334228515625, + "logps/rejected": -334.64666748046875, + "loss": 0.6303, + "rewards/accuracies": 0.625, + "rewards/chosen": 1.24570894241333, + "rewards/margins": 1.4176597595214844, + "rewards/rejected": -0.1719507873058319, + "step": 312 + }, + { + "epoch": 1.0064102564102564, + "grad_norm": 36.86675950827572, + "learning_rate": 3.209559304798474e-07, + "logits/chosen": -1.0935571193695068, + "logits/rejected": -0.9197241067886353, + "logps/chosen": -238.86378479003906, + "logps/rejected": -356.7698974609375, + "loss": 0.2869, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.2590787410736084, + "rewards/margins": 5.775833606719971, + "rewards/rejected": -4.516754627227783, + "step": 314 + }, + { + "epoch": 1.0128205128205128, + "grad_norm": 36.554222603370235, + "learning_rate": 3.1807647161082797e-07, + "logits/chosen": -1.3647656440734863, + "logits/rejected": -1.3482074737548828, + "logps/chosen": -333.5447082519531, + "logps/rejected": -462.37841796875, + "loss": 0.322, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.09278309345245361, + "rewards/margins": 11.315669059753418, + "rewards/rejected": -11.222885131835938, + "step": 316 + }, + { + "epoch": 1.0192307692307692, + "grad_norm": 26.84501628081038, + "learning_rate": 3.1518722485366754e-07, + "logits/chosen": -1.2064971923828125, + "logits/rejected": -1.0343093872070312, + "logps/chosen": -370.1424560546875, + "logps/rejected": -446.01336669921875, + "loss": 0.2436, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5165338516235352, + "rewards/margins": 3.0939137935638428, + "rewards/rejected": -2.5773799419403076, + "step": 318 + }, + { + "epoch": 1.0256410256410255, + "grad_norm": 42.29424038669826, + "learning_rate": 3.122886056180284e-07, + "logits/chosen": -1.4304347038269043, + "logits/rejected": -1.360546350479126, + "logps/chosen": -371.29150390625, + "logps/rejected": -404.85052490234375, + "loss": 0.3502, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.0550906658172607, + "rewards/margins": 3.6349360942840576, + "rewards/rejected": -2.5798451900482178, + "step": 320 + }, + { + "epoch": 1.032051282051282, + "grad_norm": 21.606341502385636, + "learning_rate": 3.093810306611272e-07, + "logits/chosen": -1.241143822669983, + "logits/rejected": -1.1153712272644043, + "logps/chosen": -274.459716796875, + "logps/rejected": -434.70330810546875, + "loss": 0.2671, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.6185639500617981, + "rewards/margins": 8.699484825134277, + "rewards/rejected": -8.080921173095703, + "step": 322 + }, + { + "epoch": 1.0384615384615385, + "grad_norm": 30.97588918593489, + "learning_rate": 3.0646491802781514e-07, + "logits/chosen": -1.0533857345581055, + "logits/rejected": -1.1453428268432617, + "logps/chosen": -445.8006591796875, + "logps/rejected": -426.56231689453125, + "loss": 0.3039, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.9802995920181274, + "rewards/margins": 1.7898352146148682, + "rewards/rejected": -0.8095357418060303, + "step": 324 + }, + { + "epoch": 1.044871794871795, + "grad_norm": 48.1211063288669, + "learning_rate": 3.035406869904721e-07, + "logits/chosen": -1.2862086296081543, + "logits/rejected": -1.1623508930206299, + "logps/chosen": -439.55523681640625, + "logps/rejected": -539.0735473632812, + "loss": 0.3575, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.7564919590950012, + "rewards/margins": 3.1305952072143555, + "rewards/rejected": -2.37410306930542, + "step": 326 + }, + { + "epoch": 1.0512820512820513, + "grad_norm": 29.796987637514043, + "learning_rate": 3.006087579887244e-07, + "logits/chosen": -1.12770414352417, + "logits/rejected": -1.1531617641448975, + "logps/chosen": -383.5997314453125, + "logps/rejected": -674.30322265625, + "loss": 0.2827, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5838356018066406, + "rewards/margins": 12.498905181884766, + "rewards/rejected": -15.08273983001709, + "step": 328 + }, + { + "epoch": 1.0576923076923077, + "grad_norm": 30.06533719920392, + "learning_rate": 2.976695525689952e-07, + "logits/chosen": -1.1804733276367188, + "logits/rejected": -1.0108273029327393, + "logps/chosen": -276.89349365234375, + "logps/rejected": -487.67901611328125, + "loss": 0.227, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.2702795267105103, + "rewards/margins": 14.07183837890625, + "rewards/rejected": -12.801559448242188, + "step": 330 + }, + { + "epoch": 1.064102564102564, + "grad_norm": 27.841789975882595, + "learning_rate": 2.9472349332389523e-07, + "logits/chosen": -1.2643177509307861, + "logits/rejected": -1.2623913288116455, + "logps/chosen": -306.5585021972656, + "logps/rejected": -438.843994140625, + "loss": 0.2917, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.08365759253501892, + "rewards/margins": 5.136476993560791, + "rewards/rejected": -5.05281925201416, + "step": 332 + }, + { + "epoch": 1.0705128205128205, + "grad_norm": 51.76424451062161, + "learning_rate": 2.9177100383146364e-07, + "logits/chosen": -1.2345194816589355, + "logits/rejected": -1.2358707189559937, + "logps/chosen": -339.4765625, + "logps/rejected": -411.02496337890625, + "loss": 0.2568, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.040745496749878, + "rewards/margins": 6.698721408843994, + "rewards/rejected": -5.657975673675537, + "step": 334 + }, + { + "epoch": 1.0769230769230769, + "grad_norm": 31.35345871161808, + "learning_rate": 2.888125085942664e-07, + "logits/chosen": -1.426588535308838, + "logits/rejected": -1.373186707496643, + "logps/chosen": -356.6693115234375, + "logps/rejected": -432.9941711425781, + "loss": 0.2557, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.4465351104736328, + "rewards/margins": 6.831225872039795, + "rewards/rejected": -5.38469123840332, + "step": 336 + }, + { + "epoch": 1.0833333333333333, + "grad_norm": 34.366453819676366, + "learning_rate": 2.8584843297836277e-07, + "logits/chosen": -1.5102860927581787, + "logits/rejected": -1.461169719696045, + "logps/chosen": -351.3536071777344, + "logps/rejected": -431.31890869140625, + "loss": 0.3277, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3297797441482544, + "rewards/margins": 6.9336700439453125, + "rewards/rejected": -6.6038899421691895, + "step": 338 + }, + { + "epoch": 1.0897435897435896, + "grad_norm": 41.10937974388239, + "learning_rate": 2.828792031521464e-07, + "logits/chosen": -1.1086995601654053, + "logits/rejected": -1.062842845916748, + "logps/chosen": -238.65838623046875, + "logps/rejected": -356.7580871582031, + "loss": 0.3668, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.2526164054870605, + "rewards/margins": 6.96852970123291, + "rewards/rejected": -5.71591329574585, + "step": 340 + }, + { + "epoch": 1.0961538461538463, + "grad_norm": 27.99031443752752, + "learning_rate": 2.799052460250727e-07, + "logits/chosen": -1.262937307357788, + "logits/rejected": -1.2391104698181152, + "logps/chosen": -222.76937866210938, + "logps/rejected": -488.0319519042969, + "loss": 0.269, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5808814764022827, + "rewards/margins": 13.928915023803711, + "rewards/rejected": -12.348031997680664, + "step": 342 + }, + { + "epoch": 1.1025641025641026, + "grad_norm": 38.81780059160134, + "learning_rate": 2.7692698918627775e-07, + "logits/chosen": -1.288144826889038, + "logits/rejected": -1.2487950325012207, + "logps/chosen": -318.6227722167969, + "logps/rejected": -463.7633972167969, + "loss": 0.3641, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.2282376289367676, + "rewards/margins": 9.634057998657227, + "rewards/rejected": -8.405821800231934, + "step": 344 + }, + { + "epoch": 1.108974358974359, + "grad_norm": 27.694568920223812, + "learning_rate": 2.7394486084310126e-07, + "logits/chosen": -1.3042877912521362, + "logits/rejected": -1.27223539352417, + "logps/chosen": -425.68048095703125, + "logps/rejected": -526.4976196289062, + "loss": 0.2686, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40143609046936035, + "rewards/margins": 5.7390217781066895, + "rewards/rejected": -5.337584972381592, + "step": 346 + }, + { + "epoch": 1.1153846153846154, + "grad_norm": 32.92380903061096, + "learning_rate": 2.709592897595191e-07, + "logits/chosen": -1.3557491302490234, + "logits/rejected": -1.3039772510528564, + "logps/chosen": -304.20098876953125, + "logps/rejected": -436.463134765625, + "loss": 0.3106, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.8321805596351624, + "rewards/margins": 6.894351959228516, + "rewards/rejected": -6.062171936035156, + "step": 348 + }, + { + "epoch": 1.1217948717948718, + "grad_norm": 34.96483979580043, + "learning_rate": 2.6797070519449735e-07, + "logits/chosen": -1.1945276260375977, + "logits/rejected": -1.1270620822906494, + "logps/chosen": -384.7686462402344, + "logps/rejected": -426.8640441894531, + "loss": 0.3377, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.0590187311172485, + "rewards/margins": 3.7418627738952637, + "rewards/rejected": -2.6828436851501465, + "step": 350 + }, + { + "epoch": 1.1282051282051282, + "grad_norm": 41.1390435271783, + "learning_rate": 2.6497953684027346e-07, + "logits/chosen": -1.3725982904434204, + "logits/rejected": -1.2870327234268188, + "logps/chosen": -356.6553955078125, + "logps/rejected": -484.23419189453125, + "loss": 0.3285, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.49378708004951477, + "rewards/margins": 7.542490005493164, + "rewards/rejected": -7.048703193664551, + "step": 352 + }, + { + "epoch": 1.1346153846153846, + "grad_norm": 32.22297305865707, + "learning_rate": 2.6198621476057704e-07, + "logits/chosen": -1.2978630065917969, + "logits/rejected": -1.3309122323989868, + "logps/chosen": -295.10028076171875, + "logps/rejected": -371.45758056640625, + "loss": 0.3236, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.2298011779785156, + "rewards/margins": 3.266383409500122, + "rewards/rejected": -2.0365824699401855, + "step": 354 + }, + { + "epoch": 1.141025641025641, + "grad_norm": 26.020190516651358, + "learning_rate": 2.5899116932879534e-07, + "logits/chosen": -1.221419095993042, + "logits/rejected": -1.159762978553772, + "logps/chosen": -336.4085998535156, + "logps/rejected": -428.48248291015625, + "loss": 0.2479, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2515459954738617, + "rewards/margins": 6.693026542663574, + "rewards/rejected": -6.44148063659668, + "step": 356 + }, + { + "epoch": 1.1474358974358974, + "grad_norm": 28.21713959382367, + "learning_rate": 2.5599483116609544e-07, + "logits/chosen": -1.3974900245666504, + "logits/rejected": -1.4186005592346191, + "logps/chosen": -421.5855407714844, + "logps/rejected": -486.21038818359375, + "loss": 0.296, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.946840226650238, + "rewards/margins": 5.974102973937988, + "rewards/rejected": -5.0272626876831055, + "step": 358 + }, + { + "epoch": 1.1538461538461537, + "grad_norm": 33.14375110583809, + "learning_rate": 2.5299763107951076e-07, + "logits/chosen": -1.321262240409851, + "logits/rejected": -1.2191853523254395, + "logps/chosen": -285.807373046875, + "logps/rejected": -434.14349365234375, + "loss": 0.3047, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.639596939086914, + "rewards/margins": 9.497468948364258, + "rewards/rejected": -7.8578715324401855, + "step": 360 + }, + { + "epoch": 1.1602564102564104, + "grad_norm": 35.10447965371206, + "learning_rate": 2.5e-07, + "logits/chosen": -1.20210599899292, + "logits/rejected": -1.1646621227264404, + "logps/chosen": -387.5125732421875, + "logps/rejected": -394.4703369140625, + "loss": 0.3758, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.577614426612854, + "rewards/margins": 1.861274003982544, + "rewards/rejected": -3.4388885498046875, + "step": 362 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 26.0966251405006, + "learning_rate": 2.470023689204893e-07, + "logits/chosen": -1.2510864734649658, + "logits/rejected": -1.1139371395111084, + "logps/chosen": -406.1734619140625, + "logps/rejected": -524.915283203125, + "loss": 0.332, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.043712854385376, + "rewards/margins": 7.958093643188477, + "rewards/rejected": -6.914381504058838, + "step": 364 + }, + { + "epoch": 1.1730769230769231, + "grad_norm": 30.000260216001593, + "learning_rate": 2.440051688339046e-07, + "logits/chosen": -1.2575486898422241, + "logits/rejected": -1.1823022365570068, + "logps/chosen": -299.4271545410156, + "logps/rejected": -415.7213134765625, + "loss": 0.2472, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.0835425853729248, + "rewards/margins": 8.249517440795898, + "rewards/rejected": -7.1659746170043945, + "step": 366 + }, + { + "epoch": 1.1794871794871795, + "grad_norm": 31.00156504605049, + "learning_rate": 2.4100883067120474e-07, + "logits/chosen": -1.3217213153839111, + "logits/rejected": -1.3609166145324707, + "logps/chosen": -272.78875732421875, + "logps/rejected": -364.9103698730469, + "loss": 0.3003, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.5013625621795654, + "rewards/margins": 3.5059304237365723, + "rewards/rejected": -2.0045676231384277, + "step": 368 + }, + { + "epoch": 1.185897435897436, + "grad_norm": 28.150283867039814, + "learning_rate": 2.3801378523942296e-07, + "logits/chosen": -1.284231185913086, + "logits/rejected": -1.2206802368164062, + "logps/chosen": -256.67108154296875, + "logps/rejected": -352.75054931640625, + "loss": 0.2765, + "rewards/accuracies": 0.8125, + "rewards/chosen": 2.076909303665161, + "rewards/margins": 9.866533279418945, + "rewards/rejected": -7.789623737335205, + "step": 370 + }, + { + "epoch": 1.1923076923076923, + "grad_norm": 29.2415427501518, + "learning_rate": 2.3502046315972655e-07, + "logits/chosen": -1.0261542797088623, + "logits/rejected": -0.9523590803146362, + "logps/chosen": -279.00909423828125, + "logps/rejected": -356.99285888671875, + "loss": 0.294, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.2333482503890991, + "rewards/margins": 4.806268692016602, + "rewards/rejected": -3.572920083999634, + "step": 372 + }, + { + "epoch": 1.1987179487179487, + "grad_norm": 27.86537802836453, + "learning_rate": 2.3202929480550268e-07, + "logits/chosen": -1.3866647481918335, + "logits/rejected": -1.3047130107879639, + "logps/chosen": -270.6109313964844, + "logps/rejected": -497.84735107421875, + "loss": 0.2712, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.9244805574417114, + "rewards/margins": 11.156976699829102, + "rewards/rejected": -9.232497215270996, + "step": 374 + }, + { + "epoch": 1.205128205128205, + "grad_norm": 29.73267404121584, + "learning_rate": 2.2904071024048089e-07, + "logits/chosen": -1.2866673469543457, + "logits/rejected": -1.1302870512008667, + "logps/chosen": -452.99066162109375, + "logps/rejected": -576.70068359375, + "loss": 0.3115, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19872701168060303, + "rewards/margins": 7.480227470397949, + "rewards/rejected": -7.281500816345215, + "step": 376 + }, + { + "epoch": 1.2115384615384615, + "grad_norm": 30.678862049283406, + "learning_rate": 2.2605513915689874e-07, + "logits/chosen": -1.2622478008270264, + "logits/rejected": -1.1896402835845947, + "logps/chosen": -413.0767822265625, + "logps/rejected": -548.7984619140625, + "loss": 0.3323, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.1370818614959717, + "rewards/margins": 7.873663902282715, + "rewards/rejected": -6.736582279205322, + "step": 378 + }, + { + "epoch": 1.217948717948718, + "grad_norm": 30.580436182063924, + "learning_rate": 2.2307301081372222e-07, + "logits/chosen": -1.3156100511550903, + "logits/rejected": -1.2855215072631836, + "logps/chosen": -322.6431884765625, + "logps/rejected": -463.2252197265625, + "loss": 0.2377, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3195626735687256, + "rewards/margins": 9.315149307250977, + "rewards/rejected": -7.995587348937988, + "step": 380 + }, + { + "epoch": 1.2243589743589745, + "grad_norm": 29.83536758042239, + "learning_rate": 2.2009475397492734e-07, + "logits/chosen": -1.272627592086792, + "logits/rejected": -1.0530091524124146, + "logps/chosen": -397.1666259765625, + "logps/rejected": -545.9334716796875, + "loss": 0.2882, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3586530685424805, + "rewards/margins": 10.215682983398438, + "rewards/rejected": -12.574337005615234, + "step": 382 + }, + { + "epoch": 1.2307692307692308, + "grad_norm": 28.3903312811834, + "learning_rate": 2.1712079684785362e-07, + "logits/chosen": -1.153627872467041, + "logits/rejected": -1.0204051733016968, + "logps/chosen": -394.9867248535156, + "logps/rejected": -521.6412963867188, + "loss": 0.2781, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8138182163238525, + "rewards/margins": 9.487817764282227, + "rewards/rejected": -8.673999786376953, + "step": 384 + }, + { + "epoch": 1.2371794871794872, + "grad_norm": 26.483269120716308, + "learning_rate": 2.1415156702163734e-07, + "logits/chosen": -1.2825031280517578, + "logits/rejected": -1.2940688133239746, + "logps/chosen": -276.8829345703125, + "logps/rejected": -407.8200378417969, + "loss": 0.2424, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.4425716400146484, + "rewards/margins": 6.671435356140137, + "rewards/rejected": -5.228863716125488, + "step": 386 + }, + { + "epoch": 1.2435897435897436, + "grad_norm": 42.44741299462779, + "learning_rate": 2.1118749140573358e-07, + "logits/chosen": -1.248936414718628, + "logits/rejected": -1.1839042901992798, + "logps/chosen": -306.0098571777344, + "logps/rejected": -434.89569091796875, + "loss": 0.3255, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.0577373504638672, + "rewards/margins": 8.84072208404541, + "rewards/rejected": -7.782984733581543, + "step": 388 + }, + { + "epoch": 1.25, + "grad_norm": 29.592974048596698, + "learning_rate": 2.0822899616853639e-07, + "logits/chosen": -1.239725112915039, + "logits/rejected": -1.1840622425079346, + "logps/chosen": -198.22854614257812, + "logps/rejected": -290.4129638671875, + "loss": 0.2512, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.3176386058330536, + "rewards/margins": 6.706575393676758, + "rewards/rejected": -6.388936519622803, + "step": 390 + }, + { + "epoch": 1.2564102564102564, + "grad_norm": 29.288354747979916, + "learning_rate": 2.0527650667610474e-07, + "logits/chosen": -1.4007208347320557, + "logits/rejected": -1.3942312002182007, + "logps/chosen": -341.7938232421875, + "logps/rejected": -396.8577575683594, + "loss": 0.3134, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.6096081733703613, + "rewards/margins": 3.297388792037964, + "rewards/rejected": -2.6877806186676025, + "step": 392 + }, + { + "epoch": 1.2628205128205128, + "grad_norm": 35.68477015068931, + "learning_rate": 2.0233044743100485e-07, + "logits/chosen": -1.3853461742401123, + "logits/rejected": -1.394587516784668, + "logps/chosen": -224.809326171875, + "logps/rejected": -309.8160095214844, + "loss": 0.3056, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.3611345291137695, + "rewards/margins": 6.271059989929199, + "rewards/rejected": -4.90992546081543, + "step": 394 + }, + { + "epoch": 1.2692307692307692, + "grad_norm": 40.77728231699169, + "learning_rate": 1.993912420112756e-07, + "logits/chosen": -1.239671230316162, + "logits/rejected": -1.250701904296875, + "logps/chosen": -400.4630432128906, + "logps/rejected": -489.7222900390625, + "loss": 0.2694, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07648792862892151, + "rewards/margins": 4.028686046600342, + "rewards/rejected": -3.9521982669830322, + "step": 396 + }, + { + "epoch": 1.2756410256410255, + "grad_norm": 26.64706430608069, + "learning_rate": 1.9645931300952795e-07, + "logits/chosen": -1.0940332412719727, + "logits/rejected": -1.0973918437957764, + "logps/chosen": -281.67816162109375, + "logps/rejected": -425.95245361328125, + "loss": 0.2423, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.8789865374565125, + "rewards/margins": 5.881922721862793, + "rewards/rejected": -5.002935886383057, + "step": 398 + }, + { + "epoch": 1.282051282051282, + "grad_norm": 27.684182767775344, + "learning_rate": 1.935350819721849e-07, + "logits/chosen": -1.2352392673492432, + "logits/rejected": -1.1679105758666992, + "logps/chosen": -319.7848205566406, + "logps/rejected": -460.11431884765625, + "loss": 0.3574, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.17852053046226501, + "rewards/margins": 7.532207012176514, + "rewards/rejected": -7.710727691650391, + "step": 400 + }, + { + "epoch": 1.282051282051282, + "eval_logits/chosen": -0.9294926524162292, + "eval_logits/rejected": -0.9306713938713074, + "eval_logps/chosen": -455.3849792480469, + "eval_logps/rejected": -510.6935729980469, + "eval_loss": 0.7832074165344238, + "eval_rewards/accuracies": 0.5384615659713745, + "eval_rewards/chosen": -0.1278804987668991, + "eval_rewards/margins": 6.288716793060303, + "eval_rewards/rejected": -6.416597366333008, + "eval_runtime": 18.4998, + "eval_samples_per_second": 5.405, + "eval_steps_per_second": 0.703, + "step": 400 + } + ], + "logging_steps": 2, + "max_steps": 624, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}