Upload bcpt-sft_metrics.jsonl with huggingface_hub
Browse files- bcpt-sft_metrics.jsonl +165 -0
bcpt-sft_metrics.jsonl
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"step": 0, "timestamp": 1778391723.2380505, "optim/grad_attn_mean": 0.005607149970292456, "optim/grad_mlp_mean": 0.013847656606230885, "optim/grad_embed_mean": 0.27258366346359253, "optim/grad_attn_res_mean": 0.032173570090794544, "optim/grad_encoding_mean": 0.17410484715623575, "optim/grad_reasoning_mean": 0.1493250436227148, "optim/grad_decoding_mean": 0.30230919922469185, "optim/grad_attn_mlp_ratio": 0.4049165920682599, "optim/grad_layer_0": 0.14073443959932774, "optim/grad_layer_12": 0.17187806242145598, "optim/grad_layer_16": 0.13815995992626995, "optim/grad_layer_20": 0.1938218242721632, "optim/grad_layer_24": 0.42784625745844096, "optim/grad_layer_4": 0.11612342746229842, "optim/grad_layer_8": 0.1368456636555493}
|
| 2 |
+
{"step": 1, "timestamp": 1778391723.489078, "perf/step_time_sec": 13.160085669718683, "perf/tokens_per_sec": 9959.813582490293, "perf/gpu_memory_allocated_gib": 97.51, "perf/gpu_memory_reserved_gib": 101.42}
|
| 3 |
+
{"step": 2, "timestamp": 1778391724.8466496, "perf/step_time_sec": 1.3158239563927054, "perf/tokens_per_sec": 99612.10947954636, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 4 |
+
{"step": 3, "timestamp": 1778391726.1861868, "perf/step_time_sec": 1.2978916019201279, "perf/tokens_per_sec": 100988.40288826074, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 5 |
+
{"step": 4, "timestamp": 1778391727.5104504, "perf/step_time_sec": 1.2826353255659342, "perf/tokens_per_sec": 102189.60712170266, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 6 |
+
{"step": 5, "timestamp": 1778391729.0897784, "perf/step_time_sec": 1.537645504809916, "perf/tokens_per_sec": 85242.01422889286, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 7 |
+
{"step": 6, "timestamp": 1778391731.0799463, "perf/step_time_sec": 1.9483532411977649, "perf/tokens_per_sec": 67273.221933525, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 8 |
+
{"step": 7, "timestamp": 1778391732.4332702, "perf/step_time_sec": 1.3115622475743294, "perf/tokens_per_sec": 99935.78287451572, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 9 |
+
{"step": 8, "timestamp": 1778391733.5202513, "perf/step_time_sec": 1.0450598942115903, "perf/tokens_per_sec": 125420.56271222884, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 10 |
+
{"step": 9, "timestamp": 1778391735.2089918, "perf/step_time_sec": 1.6458681738004088, "perf/tokens_per_sec": 79636.99771734868, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 11 |
+
{"step": 10, "timestamp": 1778391737.1444798, "turn/im_end_count": 184, "turn/im_end_acc_top1": 0.021739130434782608, "turn/im_end_acc_top5": 0.1358695652173913, "turn/im_start_count": 144, "turn/im_start_acc_top1": 0.027777777777777776, "turn/im_start_acc_top5": 0.3819444444444444, "perf/step_time_sec": 1.8934066593647003, "perf/tokens_per_sec": 69225.48801216265, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 12 |
+
{"step": 10, "timestamp": 1778391737.1461942, "loss/perplexity": 20.391396417012515}
|
| 13 |
+
{"step": 10, "timestamp": 1778391738.2331932, "optim/grad_attn_mean": 0.0031259987056041516, "optim/grad_mlp_mean": 0.008228839996653343, "optim/grad_embed_mean": 0.19654342532157898, "optim/grad_attn_res_mean": 0.01993452344628933, "optim/grad_encoding_mean": 0.10646357963883525, "optim/grad_reasoning_mean": 0.09788762137436101, "optim/grad_decoding_mean": 0.1800479550496675, "optim/grad_attn_mlp_ratio": 0.3798828155666331, "optim/grad_layer_0": 0.08334688376635313, "optim/grad_layer_12": 0.12542492122156546, "optim/grad_layer_16": 0.08420550829032436, "optim/grad_layer_20": 0.12868904217611998, "optim/grad_layer_24": 0.2489809119142592, "optim/grad_layer_4": 0.0640980873722583, "optim/grad_layer_8": 0.07345996604999527}
|
| 14 |
+
{"step": 10, "timestamp": 1778391738.2458434, "weight/attn_drift_mean": 0.0005828541054578167, "weight/mlp_drift_mean": 0.00024043900128289342, "weight/attn_mlp_drift_ratio": 2.4240238152292273}
|
| 15 |
+
{"step": 11, "timestamp": 1778391738.259392, "perf/step_time_sec": 1.071016880683601, "perf/tokens_per_sec": 122380.89087479211, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 16 |
+
{"step": 12, "timestamp": 1778391740.0554197, "perf/step_time_sec": 1.7541710380464792, "perf/tokens_per_sec": 74720.19384493284, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 17 |
+
{"step": 13, "timestamp": 1778391741.5558105, "perf/step_time_sec": 1.4583220481872559, "perf/tokens_per_sec": 89878.63837272911, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 18 |
+
{"step": 14, "timestamp": 1778391742.8274622, "perf/step_time_sec": 1.2294410234317183, "perf/tokens_per_sec": 106611.05128421768, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 19 |
+
{"step": 15, "timestamp": 1778391743.9616148, "perf/step_time_sec": 1.0922118686139584, "perf/tokens_per_sec": 120006.02059592462, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 20 |
+
{"step": 16, "timestamp": 1778391745.4909966, "perf/step_time_sec": 1.487229073420167, "perf/tokens_per_sec": 88131.68216149442, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 21 |
+
{"step": 17, "timestamp": 1778391746.84329, "perf/step_time_sec": 1.3101703124120831, "perf/tokens_per_sec": 100041.95542997039, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 22 |
+
{"step": 18, "timestamp": 1778391748.514898, "perf/step_time_sec": 1.6294457353651524, "perf/tokens_per_sec": 80439.62260002925, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 23 |
+
{"step": 19, "timestamp": 1778391749.6520286, "perf/step_time_sec": 1.0951051069423556, "perf/tokens_per_sec": 119688.9679073512, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 24 |
+
{"step": 20, "timestamp": 1778391751.5502133, "turn/im_end_count": 184, "turn/im_end_acc_top1": 0.03260869565217391, "turn/im_end_acc_top5": 0.24456521739130435, "turn/im_start_count": 144, "turn/im_start_acc_top1": 0.006944444444444444, "turn/im_start_acc_top5": 0.2222222222222222, "perf/step_time_sec": 1.8560429075732827, "perf/tokens_per_sec": 70619.05706230277, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 25 |
+
{"step": 20, "timestamp": 1778391751.5519154, "loss/perplexity": 17.512983232926715}
|
| 26 |
+
{"step": 20, "timestamp": 1778391753.3456645, "optim/grad_attn_mean": 0.002410108447775758, "optim/grad_mlp_mean": 0.006154326000666645, "optim/grad_embed_mean": 0.14452828466892242, "optim/grad_attn_res_mean": 0.011639517951902746, "optim/grad_encoding_mean": 0.0813135298004555, "optim/grad_reasoning_mean": 0.06808476048819204, "optim/grad_decoding_mean": 0.09815882210677955, "optim/grad_attn_mlp_ratio": 0.3916114504496817, "optim/grad_layer_0": 0.11744277481921017, "optim/grad_layer_12": 0.08906600682530552, "optim/grad_layer_16": 0.0731896452489309, "optim/grad_layer_20": 0.08238712092861533, "optim/grad_layer_24": 0.1735345805354882, "optim/grad_layer_4": 0.06520773516967893, "optim/grad_layer_8": 0.05036545320763253}
|
| 27 |
+
{"step": 20, "timestamp": 1778391753.3611739, "weight/attn_drift_mean": 0.0008778240251696281, "weight/mlp_drift_mean": 0.0005661850320312965, "weight/attn_mlp_drift_ratio": 1.5503916062638752}
|
| 28 |
+
{"step": 21, "timestamp": 1778391753.3800747, "perf/step_time_sec": 1.7859508004039526, "perf/tokens_per_sec": 73390.59954526949, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 29 |
+
{"step": 22, "timestamp": 1778391754.6932511, "perf/step_time_sec": 1.2708057761192322, "perf/tokens_per_sec": 103140.85949488342, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 30 |
+
{"step": 23, "timestamp": 1778391756.004011, "perf/step_time_sec": 1.2690817872062325, "perf/tokens_per_sec": 103280.97158225162, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 31 |
+
{"step": 24, "timestamp": 1778391757.681017, "perf/step_time_sec": 1.6348698493093252, "perf/tokens_per_sec": 80172.74283660763, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 32 |
+
{"step": 25, "timestamp": 1778391759.1583848, "perf/step_time_sec": 1.434982068836689, "perf/tokens_per_sec": 91340.51417538438, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 33 |
+
{"step": 25, "timestamp": 1778391781.094945, "eval/loss": 2.7366621494293213, "eval/perplexity": 15.435378030548936, "overfit/train_eval_gap": -0.12628035545349103, "overfit/train_eval_ratio": 0.9558914072507461}
|
| 34 |
+
{"step": 26, "timestamp": 1778391782.4057033, "perf/step_time_sec": 1.2688584867864847, "perf/tokens_per_sec": 103299.14751325294, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 35 |
+
{"step": 27, "timestamp": 1778391783.553805, "perf/step_time_sec": 1.1066506383940578, "perf/tokens_per_sec": 118440.26963216525, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 36 |
+
{"step": 28, "timestamp": 1778391784.6903539, "perf/step_time_sec": 1.0946864262223244, "perf/tokens_per_sec": 119734.74490984512, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 37 |
+
{"step": 29, "timestamp": 1778391785.8365319, "perf/step_time_sec": 1.1041927365586162, "perf/tokens_per_sec": 118703.91432613996, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 38 |
+
{"step": 30, "timestamp": 1778391787.3495517, "turn/im_end_count": 184, "turn/im_end_acc_top1": 0.16304347826086957, "turn/im_end_acc_top5": 0.44021739130434784, "turn/im_start_count": 144, "turn/im_start_acc_top1": 0.0, "turn/im_start_acc_top5": 0.09722222222222222, "perf/step_time_sec": 1.470966625958681, "perf/tokens_per_sec": 89106.03251421543, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 39 |
+
{"step": 30, "timestamp": 1778391787.35134, "loss/perplexity": 15.382052510413176}
|
| 40 |
+
{"step": 30, "timestamp": 1778391788.821421, "optim/grad_attn_mean": 0.0021397544142946943, "optim/grad_mlp_mean": 0.00602059765307266, "optim/grad_embed_mean": 0.14539097249507904, "optim/grad_attn_res_mean": 0.012064390936805131, "optim/grad_encoding_mean": 0.06549571784368406, "optim/grad_reasoning_mean": 0.062444598511016615, "optim/grad_decoding_mean": 0.10903491442441009, "optim/grad_attn_mlp_ratio": 0.3554050583586285, "optim/grad_attn_mlp_ratio_slope": -0.0013680596624584562, "optim/grad_layer_0": 0.07963570160791278, "optim/grad_layer_12": 0.07423208601539955, "optim/grad_layer_16": 0.07945822703186423, "optim/grad_layer_20": 0.07475204434012994, "optim/grad_layer_24": 0.13067203614627942, "optim/grad_layer_4": 0.04958098553470336, "optim/grad_layer_8": 0.051910202542785555}
|
| 41 |
+
{"step": 30, "timestamp": 1778391788.8371425, "weight/attn_drift_mean": 0.001164636467272996, "weight/mlp_drift_mean": 0.0008712623475027723, "weight/attn_mlp_drift_ratio": 1.3367077132782414}
|
| 42 |
+
{"step": 31, "timestamp": 1778391788.8558073, "perf/step_time_sec": 1.46199606731534, "perf/tokens_per_sec": 89652.77193986384, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 43 |
+
{"step": 32, "timestamp": 1778391790.6253629, "perf/step_time_sec": 1.7272616671398282, "perf/tokens_per_sec": 75884.27537851984, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 44 |
+
{"step": 33, "timestamp": 1778391792.3213694, "perf/step_time_sec": 1.6535834316164255, "perf/tokens_per_sec": 79265.42894293113, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 45 |
+
{"step": 34, "timestamp": 1778391793.5710478, "perf/step_time_sec": 1.2075994974002242, "perf/tokens_per_sec": 108539.29658150556, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 46 |
+
{"step": 35, "timestamp": 1778391794.6817834, "perf/step_time_sec": 1.0686829816550016, "perf/tokens_per_sec": 122648.1587617472, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 47 |
+
{"step": 36, "timestamp": 1778391796.2609582, "perf/step_time_sec": 1.5371738839894533, "perf/tokens_per_sec": 85268.16735906717, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 48 |
+
{"step": 37, "timestamp": 1778391797.9807124, "perf/step_time_sec": 1.6772715765982866, "perf/tokens_per_sec": 78145.96147025288, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 49 |
+
{"step": 38, "timestamp": 1778391799.2768052, "perf/step_time_sec": 1.2543825171887875, "perf/tokens_per_sec": 104491.25223280943, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 50 |
+
{"step": 39, "timestamp": 1778391800.568428, "perf/step_time_sec": 1.2496276339516044, "perf/tokens_per_sec": 104888.84563597619, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 51 |
+
{"step": 40, "timestamp": 1778391802.3115537, "turn/im_end_count": 184, "turn/im_end_acc_top1": 0.266304347826087, "turn/im_end_acc_top5": 0.46195652173913043, "turn/im_start_count": 144, "turn/im_start_acc_top1": 0.0, "turn/im_start_acc_top5": 0.04861111111111111, "perf/step_time_sec": 1.5516524882987142, "perf/tokens_per_sec": 84472.52267401182, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 52 |
+
{"step": 40, "timestamp": 1778391802.313479, "loss/perplexity": 14.84409408306163, "loss/slope": -0.010822740554809581, "loss/relative_slope": -0.004011985529562925, "loss/window_mean": 2.827214765548706, "loss/window_median": 2.8629425048828123, "loss/window_std": 0.12471988867581617, "loss/window_min": 2.697602081298828, "loss/window_max": 3.0151130676269533, "optim/grad_norm_mean": 0.3710631802678108, "optim/grad_norm_std": 0.1449638776117363, "optim/grad_norm_cv": 0.39067168428597576}
|
| 53 |
+
{"step": 40, "timestamp": 1778391804.1230543, "optim/grad_attn_mean": 0.0019922484859066295, "optim/grad_mlp_mean": 0.005624019421312758, "optim/grad_embed_mean": 0.1322483867406845, "optim/grad_attn_res_mean": 0.010669837862670855, "optim/grad_encoding_mean": 0.05378440907710077, "optim/grad_reasoning_mean": 0.05611951515634751, "optim/grad_decoding_mean": 0.09023630625742954, "optim/grad_attn_mlp_ratio": 0.3542386315329054, "optim/grad_attn_mlp_ratio_slope": -0.0012583367827871362, "optim/grad_layer_0": 0.055335686658509076, "optim/grad_layer_12": 0.06730972335208207, "optim/grad_layer_16": 0.06106350617483258, "optim/grad_layer_20": 0.07972957059973851, "optim/grad_layer_24": 0.08978597691748291, "optim/grad_layer_4": 0.04212304242537357, "optim/grad_layer_8": 0.051547162467613816}
|
| 54 |
+
{"step": 40, "timestamp": 1778391804.1396494, "weight/attn_drift_mean": 0.0014225977770735427, "weight/mlp_drift_mean": 0.0011766676080262506, "weight/attn_mlp_drift_ratio": 1.208995367439512}
|
| 55 |
+
{"step": 41, "timestamp": 1778391804.15793, "perf/step_time_sec": 1.8021178599447012, "perf/tokens_per_sec": 72732.20187941648, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 56 |
+
{"step": 42, "timestamp": 1778391805.3864727, "perf/step_time_sec": 1.1870685135945678, "perf/tokens_per_sec": 110416.54167298251, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 57 |
+
{"step": 43, "timestamp": 1778391806.4979038, "perf/step_time_sec": 1.0694999350234866, "perf/tokens_per_sec": 122554.47214881936, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 58 |
+
{"step": 44, "timestamp": 1778391808.0715065, "perf/step_time_sec": 1.531661245971918, "perf/tokens_per_sec": 85575.05802585483, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 59 |
+
{"step": 45, "timestamp": 1778391809.806425, "perf/step_time_sec": 1.6928241085261106, "perf/tokens_per_sec": 77428.00881665155, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 60 |
+
{"step": 46, "timestamp": 1778391810.946561, "perf/step_time_sec": 1.0982772968709469, "perf/tokens_per_sec": 119343.26638038628, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 61 |
+
{"step": 47, "timestamp": 1778391812.2503016, "perf/step_time_sec": 1.2615410825237632, "perf/tokens_per_sec": 103898.32072514456, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 62 |
+
{"step": 48, "timestamp": 1778391813.6447096, "perf/step_time_sec": 1.351813673041761, "perf/tokens_per_sec": 96960.10819676837, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 63 |
+
{"step": 49, "timestamp": 1778391815.4394438, "perf/step_time_sec": 1.7526414580643177, "perf/tokens_per_sec": 74785.4042804401, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 64 |
+
{"step": 50, "timestamp": 1778391817.1720586, "turn/im_end_count": 184, "turn/im_end_acc_top1": 0.30434782608695654, "turn/im_end_acc_top5": 0.5, "turn/im_start_count": 144, "turn/im_start_acc_top1": 0.0, "turn/im_start_acc_top5": 0.034722222222222224, "perf/step_time_sec": 1.6884042406454682, "perf/tokens_per_sec": 77630.69817325965, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43, "system/disk_free_gib": 55.9, "attn_res/attn_query_norm_mean": 0.9881077451365334, "attn_res/attn_query_norm_std": 0.30346167672791585, "attn_res/mlp_query_norm_mean": 0.8005333095788956, "attn_res/mlp_query_norm_std": 0.15772826250452399, "attn_res/final_query_norm": 0.9273664951324463}
|
| 65 |
+
{"step": 50, "timestamp": 1778391818.7871199, "loss/perplexity": 14.892843768705635, "loss/slope": -0.007938049316406258, "loss/relative_slope": -0.002939059453141536, "loss/window_mean": 2.8019479751586913, "loss/window_median": 2.7332014083862304, "loss/window_std": 0.1224650288510893, "loss/window_min": 2.697602081298828, "loss/window_max": 3.0151130676269533, "optim/grad_norm_mean": 0.34221955835819245, "optim/grad_norm_std": 0.141913490370875, "optim/grad_norm_cv": 0.41468550497729817}
|
| 66 |
+
{"step": 50, "timestamp": 1778391837.9673176, "eval/loss": 2.6248295307159424, "eval/perplexity": 13.8022211307564, "overfit/train_eval_gap": -0.07605128288269025, "overfit/train_eval_ratio": 0.9718420404861255}
|
| 67 |
+
{"step": 50, "timestamp": 1778391839.3854058, "optim/grad_attn_mean": 0.0016985774498755987, "optim/grad_mlp_mean": 0.004956663723403055, "optim/grad_embed_mean": 0.12150838226079941, "optim/grad_attn_res_mean": 0.008105495935300848, "optim/grad_encoding_mean": 0.05245885798972773, "optim/grad_reasoning_mean": 0.0499605290307146, "optim/grad_decoding_mean": 0.0778164661693154, "optim/grad_attn_mlp_ratio": 0.3426849424959573, "optim/grad_attn_mlp_ratio_slope": -0.0012122776915821408, "optim/grad_layer_0": 0.044563287985511124, "optim/grad_layer_12": 0.05465900324634276, "optim/grad_layer_16": 0.05364583249320276, "optim/grad_layer_20": 0.06511865369975567, "optim/grad_layer_24": 0.08040724653983489, "optim/grad_layer_4": 0.0376324261596892, "optim/grad_layer_8": 0.03588124952511862}
|
| 68 |
+
{"step": 50, "timestamp": 1778391839.40079, "weight/attn_drift_mean": 0.0017010194170702139, "weight/mlp_drift_mean": 0.0014809189004877789, "weight/attn_mlp_drift_ratio": 1.1486165314958356}
|
| 69 |
+
{"step": 51, "timestamp": 1778391839.4198408, "perf/step_time_sec": 1.4110433459281921, "perf/tokens_per_sec": 92890.13011416748, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 70 |
+
{"step": 52, "timestamp": 1778391840.8379436, "perf/step_time_sec": 1.3764761835336685, "perf/tokens_per_sec": 95222.86078609365, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 71 |
+
{"step": 53, "timestamp": 1778391842.1218302, "perf/step_time_sec": 1.2418522648513317, "perf/tokens_per_sec": 105545.56585335155, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 72 |
+
{"step": 54, "timestamp": 1778391843.4429274, "perf/step_time_sec": 1.2794396923854947, "perf/tokens_per_sec": 102444.84423929226, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 73 |
+
{"step": 55, "timestamp": 1778391844.7348564, "perf/step_time_sec": 1.2501363418996334, "perf/tokens_per_sec": 104846.16405985825, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 74 |
+
{"step": 56, "timestamp": 1778391846.5446615, "perf/step_time_sec": 1.7682264680042863, "perf/tokens_per_sec": 74126.25156998966, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 75 |
+
{"step": 57, "timestamp": 1778391847.828107, "perf/step_time_sec": 1.2418601494282484, "perf/tokens_per_sec": 105544.89574397364, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 76 |
+
{"step": 58, "timestamp": 1778391849.2847755, "perf/step_time_sec": 1.4151897700503469, "perf/tokens_per_sec": 92617.96740894826, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 77 |
+
{"step": 59, "timestamp": 1778391850.426048, "perf/step_time_sec": 1.099775922484696, "perf/tokens_per_sec": 119180.64154730024, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 78 |
+
{"step": 60, "timestamp": 1778391851.9694679, "turn/im_end_count": 184, "turn/im_end_acc_top1": 0.3423913043478261, "turn/im_end_acc_top5": 0.5163043478260869, "turn/im_start_count": 144, "turn/im_start_acc_top1": 0.0, "turn/im_start_acc_top5": 0.020833333333333332, "perf/step_time_sec": 1.5021540159359574, "perf/tokens_per_sec": 87256.03274330833, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 79 |
+
{"step": 60, "timestamp": 1778391851.9708154, "loss/perplexity": 13.710115614413736, "loss/slope": -0.0071619431631905735, "loss/relative_slope": -0.0027355144301980973, "loss/window_mean": 2.771312300364176, "loss/window_median": 2.7332014083862304, "loss/window_std": 0.13111366160762264, "loss/window_min": 2.6181339263916015, "loss/window_max": 3.0151130676269533, "optim/grad_norm_mean": 0.33196381976207096, "optim/grad_norm_std": 0.1315627853731071, "optim/grad_norm_cv": 0.3963166391668897}
|
| 80 |
+
{"step": 60, "timestamp": 1778391853.231086, "optim/grad_attn_mean": 0.0019153693139408144, "optim/grad_mlp_mean": 0.005162952373211738, "optim/grad_embed_mean": 0.14194262027740479, "optim/grad_attn_res_mean": 0.008663317723594924, "optim/grad_encoding_mean": 0.05411940982675231, "optim/grad_reasoning_mean": 0.052230770654407226, "optim/grad_decoding_mean": 0.09038694326300174, "optim/grad_attn_mlp_ratio": 0.3709826211166663, "optim/grad_attn_mlp_ratio_slope": -0.0007627517068318168, "optim/grad_layer_0": 0.057921779924072325, "optim/grad_layer_12": 0.05759686202509329, "optim/grad_layer_16": 0.05591319309314713, "optim/grad_layer_20": 0.08410324924625456, "optim/grad_layer_24": 0.11520591314183548, "optim/grad_layer_4": 0.04985895560821518, "optim/grad_layer_8": 0.040158802876248956}
|
| 81 |
+
{"step": 60, "timestamp": 1778391853.2465637, "weight/attn_drift_mean": 0.0018974457914004363, "weight/mlp_drift_mean": 0.0017862723493925565, "weight/attn_mlp_drift_ratio": 1.062231730636359}
|
| 82 |
+
{"step": 61, "timestamp": 1778391853.2635307, "perf/step_time_sec": 1.2507664449512959, "perf/tokens_per_sec": 104793.34533563048, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 83 |
+
{"step": 62, "timestamp": 1778391854.6386786, "perf/step_time_sec": 1.2281742561608553, "perf/tokens_per_sec": 106721.01238281725, "perf/gpu_memory_allocated_gib": 73.75, "perf/gpu_memory_reserved_gib": 101.43}
|
| 84 |
+
{"step": 63, "timestamp": 1778391863.6415727, "perf/step_time_sec": 1.815531151369214, "perf/tokens_per_sec": 72194.8504718026, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 85 |
+
{"step": 64, "timestamp": 1778391864.7797515, "perf/step_time_sec": 1.0952351232990623, "perf/tokens_per_sec": 119674.75952121177, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 86 |
+
{"step": 65, "timestamp": 1778391866.1544938, "perf/step_time_sec": 1.3321023238822818, "perf/tokens_per_sec": 98394.84373693117, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 87 |
+
{"step": 66, "timestamp": 1778391868.4239862, "perf/step_time_sec": 2.2268356708809733, "perf/tokens_per_sec": 58860.203163597485, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 88 |
+
{"step": 67, "timestamp": 1778391869.6359243, "perf/step_time_sec": 1.1695700967684388, "perf/tokens_per_sec": 112068.52873731665, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 89 |
+
{"step": 68, "timestamp": 1778391871.0505147, "perf/step_time_sec": 1.372063091956079, "perf/tokens_per_sec": 95529.1347522055, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 90 |
+
{"step": 69, "timestamp": 1778391872.4124608, "perf/step_time_sec": 1.3194221425801516, "perf/tokens_per_sec": 99340.45804604019, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 91 |
+
{"step": 70, "timestamp": 1778391874.305407, "turn/im_end_count": 184, "turn/im_end_acc_top1": 0.34782608695652173, "turn/im_end_acc_top5": 0.5217391304347826, "turn/im_start_count": 144, "turn/im_start_acc_top1": 0.0, "turn/im_start_acc_top5": 0.020833333333333332, "perf/step_time_sec": 1.850598362274468, "perf/tokens_per_sec": 70826.82156862317, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 92 |
+
{"step": 70, "timestamp": 1778391874.3071384, "loss/perplexity": 11.825884801448572, "loss/slope": -0.007701445307050434, "loss/relative_slope": -0.003117627047125974, "loss/window_mean": 2.728309222630092, "loss/window_median": 2.7008808135986326, "loss/window_std": 0.16071886311831443, "loss/window_min": 2.470290756225586, "loss/window_max": 3.0151130676269533, "optim/grad_norm_mean": 0.31868053334099905, "optim/grad_norm_std": 0.1260744214982559, "optim/grad_norm_cv": 0.395613814802274}
|
| 93 |
+
{"step": 70, "timestamp": 1778391875.5906672, "optim/grad_attn_mean": 0.0015735373257187775, "optim/grad_mlp_mean": 0.004527339738810302, "optim/grad_embed_mean": 0.10916879773139954, "optim/grad_attn_res_mean": 0.007718450162971286, "optim/grad_encoding_mean": 0.04715052763640415, "optim/grad_reasoning_mean": 0.05125156634959341, "optim/grad_decoding_mean": 0.06728546510857995, "optim/grad_attn_mlp_ratio": 0.3475625733593694, "optim/grad_attn_mlp_ratio_slope": -0.0007070536355940046, "optim/grad_layer_0": 0.05174644378712401, "optim/grad_layer_12": 0.05498325332882814, "optim/grad_layer_16": 0.07326713879592717, "optim/grad_layer_20": 0.0632443314534612, "optim/grad_layer_24": 0.10002498637186363, "optim/grad_layer_4": 0.033090550714405254, "optim/grad_layer_8": 0.0342416305502411}
|
| 94 |
+
{"step": 70, "timestamp": 1778391875.6059036, "weight/attn_drift_mean": 0.0021116307518588263, "weight/mlp_drift_mean": 0.002077757399575769, "weight/attn_mlp_drift_ratio": 1.016297951488685}
|
| 95 |
+
{"step": 71, "timestamp": 1778391875.6225815, "perf/step_time_sec": 1.2732505751773715, "perf/tokens_per_sec": 102942.81624945732, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 96 |
+
{"step": 72, "timestamp": 1778391876.928687, "perf/step_time_sec": 1.2637302316725254, "perf/tokens_per_sec": 103718.33854645421, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 97 |
+
{"step": 73, "timestamp": 1778391878.8047652, "perf/step_time_sec": 1.8336276998743415, "perf/tokens_per_sec": 71482.34072215552, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 98 |
+
{"step": 74, "timestamp": 1778391880.0760942, "perf/step_time_sec": 1.2291615651920438, "perf/tokens_per_sec": 106635.29003164149, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 99 |
+
{"step": 75, "timestamp": 1778391881.3732743, "perf/step_time_sec": 1.2552891876548529, "perf/tokens_per_sec": 104415.78027519728, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 100 |
+
{"step": 75, "timestamp": 1778391900.5509832, "eval/loss": 2.561070203781128, "eval/perplexity": 12.949668686719015, "eval/loss_slope": -0.003511838912963867, "overfit/train_eval_gap": 0.0907794475555419, "overfit/train_eval_ratio": 1.036748482727904}
|
| 101 |
+
{"step": 76, "timestamp": 1778391901.9221559, "perf/step_time_sec": 1.3292055884376168, "perf/tokens_per_sec": 98609.27545005696, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 102 |
+
{"step": 77, "timestamp": 1778391903.200217, "perf/step_time_sec": 1.2362710321322083, "perf/tokens_per_sec": 106022.05875028786, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 103 |
+
{"step": 78, "timestamp": 1778391904.5447526, "perf/step_time_sec": 1.3024569936096668, "perf/tokens_per_sec": 100634.41683148652, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 104 |
+
{"step": 79, "timestamp": 1778391905.6342971, "perf/step_time_sec": 1.0473622642457485, "perf/tokens_per_sec": 125144.85624932333, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 105 |
+
{"step": 80, "timestamp": 1778391907.1508968, "turn/im_end_count": 184, "turn/im_end_acc_top1": 0.3423913043478261, "turn/im_end_acc_top5": 0.5271739130434783, "turn/im_start_count": 144, "turn/im_start_acc_top1": 0.0, "turn/im_start_acc_top5": 0.006944444444444444, "perf/step_time_sec": 1.4743675561621785, "perf/tokens_per_sec": 88900.49123244696, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 106 |
+
{"step": 80, "timestamp": 1778391907.1525555, "loss/perplexity": 11.8081262090617, "loss/slope": -0.0072969740913027825, "loss/relative_slope": -0.002955690886011043, "loss/window_mean": 2.6958690643310548, "loss/window_median": 2.7008808135986326, "loss/window_std": 0.1731134914876778, "loss/window_min": 2.468787956237793, "loss/window_max": 3.0151130676269533, "optim/grad_norm_mean": 0.3178029917180538, "optim/grad_norm_std": 0.1179546749463747, "optim/grad_norm_cv": 0.3711565907819423}
|
| 107 |
+
{"step": 80, "timestamp": 1778391908.9466197, "optim/grad_attn_mean": 0.0017534459996624014, "optim/grad_mlp_mean": 0.004922915135726466, "optim/grad_embed_mean": 0.11325805634260178, "optim/grad_attn_res_mean": 0.007539343620776076, "optim/grad_encoding_mean": 0.05030641635595304, "optim/grad_reasoning_mean": 0.05152084257861134, "optim/grad_decoding_mean": 0.07501831349509303, "optim/grad_attn_mlp_ratio": 0.3561797003446913, "optim/grad_attn_mlp_ratio_slope": -0.0005764767800746126, "optim/grad_layer_0": 0.04352153040235862, "optim/grad_layer_12": 0.059318930172594264, "optim/grad_layer_16": 0.0534560076193884, "optim/grad_layer_20": 0.06295430011232384, "optim/grad_layer_24": 0.08845239682705142, "optim/grad_layer_4": 0.037617832160321996, "optim/grad_layer_8": 0.04162172906217165}
|
| 108 |
+
{"step": 80, "timestamp": 1778391908.9620767, "weight/attn_drift_mean": 0.0023486021803825275, "weight/mlp_drift_mean": 0.0023619054034637694, "weight/attn_mlp_drift_ratio": 0.994363378526717}
|
| 109 |
+
{"step": 81, "timestamp": 1778391908.9796607, "perf/step_time_sec": 1.7852794919162989, "perf/tokens_per_sec": 73418.19619476433, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 110 |
+
{"step": 82, "timestamp": 1778391910.3501258, "perf/step_time_sec": 1.328305191360414, "perf/tokens_per_sec": 98676.11814854057, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 111 |
+
{"step": 83, "timestamp": 1778391911.4367, "perf/step_time_sec": 1.044896213337779, "perf/tokens_per_sec": 125440.20958914982, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 112 |
+
{"step": 84, "timestamp": 1778391912.9174469, "perf/step_time_sec": 1.4386355699971318, "perf/tokens_per_sec": 91108.54947112236, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 113 |
+
{"step": 85, "timestamp": 1778391914.490359, "perf/step_time_sec": 1.5301904901862144, "perf/tokens_per_sec": 85657.30923085881, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 114 |
+
{"step": 86, "timestamp": 1778391916.0931213, "perf/step_time_sec": 1.395975787192583, "perf/tokens_per_sec": 93892.74599353624, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 115 |
+
{"step": 87, "timestamp": 1778391917.364928, "perf/step_time_sec": 1.2301791347563267, "perf/tokens_per_sec": 106547.08432033574, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 116 |
+
{"step": 88, "timestamp": 1778391918.6687126, "perf/step_time_sec": 1.261822858825326, "perf/tokens_per_sec": 103875.1193032114, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 117 |
+
{"step": 89, "timestamp": 1778391920.4349809, "perf/step_time_sec": 1.72307582013309, "perf/tokens_per_sec": 76068.62012019647, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 118 |
+
{"step": 90, "timestamp": 1778391922.1148367, "turn/im_end_count": 184, "turn/im_end_acc_top1": 0.3641304347826087, "turn/im_end_acc_top5": 0.5489130434782609, "turn/im_start_count": 144, "turn/im_start_acc_top1": 0.0, "turn/im_start_acc_top5": 0.006944444444444444, "perf/step_time_sec": 1.6375183686614037, "perf/tokens_per_sec": 80043.07158224146, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 119 |
+
{"step": 90, "timestamp": 1778391922.1166697, "loss/perplexity": 11.795549750597345, "loss/slope": -0.006628860155741375, "loss/relative_slope": -0.0026862261096636886, "loss/window_mean": 2.6705194261338976, "loss/window_median": 2.697602081298828, "loss/window_std": 0.1782675102089122, "loss/window_min": 2.4677223205566405, "loss/window_max": 3.0151130676269533, "optim/grad_norm_mean": 0.3004880944887797, "optim/grad_norm_std": 0.12151472278881485, "optim/grad_norm_cv": 0.4043911390085115}
|
| 120 |
+
{"step": 90, "timestamp": 1778391923.7118294, "optim/grad_attn_mean": 0.0014378004466843962, "optim/grad_mlp_mean": 0.0041943221024536926, "optim/grad_embed_mean": 0.0927867665886879, "optim/grad_attn_res_mean": 0.006848641192686556, "optim/grad_encoding_mean": 0.04340432093886193, "optim/grad_reasoning_mean": 0.04586171049353046, "optim/grad_decoding_mean": 0.05936836771725211, "optim/grad_attn_mlp_ratio": 0.34279604274618125, "optim/grad_attn_mlp_ratio_slope": -0.000551558870707089, "optim/grad_layer_0": 0.031529795844107866, "optim/grad_layer_12": 0.046008294651983306, "optim/grad_layer_16": 0.04916725668590516, "optim/grad_layer_20": 0.05204834078904241, "optim/grad_layer_24": 0.05718585685826838, "optim/grad_layer_4": 0.02985221560811624, "optim/grad_layer_8": 0.035604429576778784}
|
| 121 |
+
{"step": 90, "timestamp": 1778391923.7273626, "weight/attn_drift_mean": 0.002621073929370202, "weight/mlp_drift_mean": 0.002643442225417525, "weight/attn_mlp_drift_ratio": 0.9915344427895654}
|
| 122 |
+
{"step": 91, "timestamp": 1778391923.7533746, "perf/step_time_sec": 1.5950184911489487, "perf/tokens_per_sec": 82175.84982703502, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 123 |
+
{"step": 92, "timestamp": 1778391925.164126, "perf/step_time_sec": 1.3689637128263712, "perf/tokens_per_sec": 95745.41587328704, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 124 |
+
{"step": 93, "timestamp": 1778391926.340103, "perf/step_time_sec": 1.1337102064862847, "perf/tokens_per_sec": 115613.3192151743, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 125 |
+
{"step": 94, "timestamp": 1778391927.6084971, "perf/step_time_sec": 1.2262970404699445, "perf/tokens_per_sec": 106884.38092435604, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 126 |
+
{"step": 95, "timestamp": 1778391929.0398428, "perf/step_time_sec": 1.2366791348904371, "perf/tokens_per_sec": 105987.07158717628, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 127 |
+
{"step": 96, "timestamp": 1778391930.7112863, "perf/step_time_sec": 1.6294057257473469, "perf/tokens_per_sec": 80441.59777325088, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 128 |
+
{"step": 97, "timestamp": 1778391932.367796, "perf/step_time_sec": 1.6142162447795272, "perf/tokens_per_sec": 81198.53856253446, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 129 |
+
{"step": 98, "timestamp": 1778391933.5704217, "perf/step_time_sec": 1.1606513680890203, "perf/tokens_per_sec": 112929.69069239659, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 130 |
+
{"step": 99, "timestamp": 1778391934.848859, "perf/step_time_sec": 1.2366593284532428, "perf/tokens_per_sec": 105988.76908480437, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 131 |
+
{"step": 100, "timestamp": 1778391936.6118407, "turn/im_end_count": 184, "turn/im_end_acc_top1": 0.358695652173913, "turn/im_end_acc_top5": 0.5434782608695652, "turn/im_start_count": 144, "turn/im_start_acc_top1": 0.0, "turn/im_start_acc_top5": 0.006944444444444444, "perf/step_time_sec": 1.71869639120996, "perf/tokens_per_sec": 76262.45139650609, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43, "system/disk_free_gib": 54.1, "attn_res/attn_query_norm_mean": 0.987907105258533, "attn_res/attn_query_norm_std": 0.3026705289361537, "attn_res/mlp_query_norm_mean": 0.8017747827938625, "attn_res/mlp_query_norm_std": 0.1569321360446304, "attn_res/final_query_norm": 0.9299718737602234}
|
| 132 |
+
{"step": 100, "timestamp": 1778391938.3518643, "loss/perplexity": 11.183106561401834, "loss/slope": -0.006217980818314987, "loss/relative_slope": -0.0025753685083352977, "loss/window_mean": 2.6449079132080078, "loss/window_median": 2.697602081298828, "loss/window_std": 0.1857550026934761, "loss/window_min": 2.414404296875, "loss/window_max": 3.0151130676269533, "optim/grad_norm_mean": 0.3040605276823044, "optim/grad_norm_std": 0.115776101071413, "optim/grad_norm_cv": 0.3807666254936612}
|
| 133 |
+
{"step": 100, "timestamp": 1778391956.2391362, "eval/loss": 2.516568422317505, "eval/perplexity": 12.38602006287931, "eval/loss_slope": -0.0028961620330810547, "overfit/train_eval_gap": 0.10216412544250497, "overfit/train_eval_ratio": 1.0423144189859144}
|
| 134 |
+
{"step": 100, "timestamp": 1778391957.368172, "optim/grad_attn_mean": 0.0015393581233350846, "optim/grad_mlp_mean": 0.004445875929377507, "optim/grad_embed_mean": 0.09313605725765228, "optim/grad_attn_res_mean": 0.006531383067659478, "optim/grad_encoding_mean": 0.04861190055251225, "optim/grad_reasoning_mean": 0.04629849175560392, "optim/grad_decoding_mean": 0.061612299556145445, "optim/grad_attn_mlp_ratio": 0.3462432792445978, "optim/grad_attn_mlp_ratio_slope": -0.0004972271692089514, "optim/grad_layer_0": 0.035079238936305046, "optim/grad_layer_12": 0.05240923006203957, "optim/grad_layer_16": 0.052535263035679236, "optim/grad_layer_20": 0.051668144995346665, "optim/grad_layer_24": 0.06681203699554317, "optim/grad_layer_4": 0.031007035242510028, "optim/grad_layer_8": 0.03559409631998278}
|
| 135 |
+
{"step": 100, "timestamp": 1778391957.3814118, "weight/attn_drift_mean": 0.0028707052261231067, "weight/mlp_drift_mean": 0.0029280827481311398, "weight/attn_mlp_drift_ratio": 0.9804010572941514}
|
| 136 |
+
{"step": 101, "timestamp": 1778391957.3969688, "perf/step_time_sec": 1.1163629023358226, "perf/tokens_per_sec": 117409.84918591563, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 137 |
+
{"step": 102, "timestamp": 1778391958.5398784, "perf/step_time_sec": 1.1015765210613608, "perf/tokens_per_sec": 118985.83302566498, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 138 |
+
{"step": 103, "timestamp": 1778391960.045196, "perf/step_time_sec": 1.4632154684513807, "perf/tokens_per_sec": 89578.05793204354, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 139 |
+
{"step": 104, "timestamp": 1778391961.3487167, "perf/step_time_sec": 1.261335989460349, "perf/tokens_per_sec": 103915.21457821713, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 140 |
+
{"step": 105, "timestamp": 1778391962.7746546, "perf/step_time_sec": 1.3838018849492073, "perf/tokens_per_sec": 94718.76099143413, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 141 |
+
{"step": 106, "timestamp": 1778391964.3343732, "perf/step_time_sec": 1.5174068436026573, "perf/tokens_per_sec": 86378.94349336547, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 142 |
+
{"step": 107, "timestamp": 1778391965.4464471, "perf/step_time_sec": 1.0699432771652937, "perf/tokens_per_sec": 122503.69042671307, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 143 |
+
{"step": 108, "timestamp": 1778391966.703443, "perf/step_time_sec": 1.2150089498609304, "perf/tokens_per_sec": 107877.39466034589, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 144 |
+
{"step": 109, "timestamp": 1778391967.9882858, "perf/step_time_sec": 1.242740299552679, "perf/tokens_per_sec": 105470.14532897904, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 145 |
+
{"step": 110, "timestamp": 1778391969.9961169, "turn/im_end_count": 184, "turn/im_end_acc_top1": 0.391304347826087, "turn/im_end_acc_top5": 0.5597826086956522, "turn/im_start_count": 144, "turn/im_start_acc_top1": 0.0, "turn/im_start_acc_top5": 0.020833333333333332, "perf/step_time_sec": 1.9646047111600637, "perf/tokens_per_sec": 66716.7289457452, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 146 |
+
{"step": 110, "timestamp": 1778391969.9992023, "loss/perplexity": 11.169248468003957, "loss/slope": -0.005716865539550781, "loss/relative_slope": -0.0023690328279370515, "loss/window_mean": 2.623840314691717, "loss/window_median": 2.6181339263916015, "loss/window_std": 0.1892261797314457, "loss/window_min": 2.4131643295288088, "loss/window_max": 3.0151130676269533, "optim/grad_norm_mean": 0.30399443073706195, "optim/grad_norm_std": 0.11038837898391426, "optim/grad_norm_cv": 0.3631263201640493}
|
| 147 |
+
{"step": 110, "timestamp": 1778391971.1332746, "optim/grad_attn_mean": 0.0015445960519600147, "optim/grad_mlp_mean": 0.004434834640830688, "optim/grad_embed_mean": 0.10270267724990845, "optim/grad_attn_res_mean": 0.0065566472662714915, "optim/grad_encoding_mean": 0.043812002068282, "optim/grad_reasoning_mean": 0.04904438103807883, "optim/grad_decoding_mean": 0.06199742297758348, "optim/grad_attn_mlp_ratio": 0.34828639491432045, "optim/grad_attn_mlp_ratio_slope": -0.00043889969772228677, "optim/grad_layer_0": 0.03336532897083089, "optim/grad_layer_12": 0.060221275547519326, "optim/grad_layer_16": 0.04664182796841487, "optim/grad_layer_20": 0.05427854476147331, "optim/grad_layer_24": 0.06399455855716951, "optim/grad_layer_4": 0.03089194549829699, "optim/grad_layer_8": 0.04518289110274054}
|
| 148 |
+
{"step": 110, "timestamp": 1778391971.1469605, "weight/attn_drift_mean": 0.0031631283175064737, "weight/mlp_drift_mean": 0.0032088736313148334, "weight/attn_mlp_drift_ratio": 0.985741049204202}
|
| 149 |
+
{"step": 111, "timestamp": 1778391971.1620288, "perf/step_time_sec": 1.119654681533575, "perf/tokens_per_sec": 117064.66481296944, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 150 |
+
{"step": 112, "timestamp": 1778391972.5173411, "perf/step_time_sec": 1.3136117346584797, "perf/tokens_per_sec": 99779.86382260574, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 151 |
+
{"step": 113, "timestamp": 1778391973.9150531, "perf/step_time_sec": 1.3559846868738532, "perf/tokens_per_sec": 96661.85855105722, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 152 |
+
{"step": 114, "timestamp": 1778391975.044327, "perf/step_time_sec": 1.0875315675511956, "perf/tokens_per_sec": 120522.47852918511, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 153 |
+
{"step": 115, "timestamp": 1778391976.401981, "perf/step_time_sec": 1.3155466336756945, "perf/tokens_per_sec": 99633.10812766792, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 154 |
+
{"step": 116, "timestamp": 1778391977.699029, "perf/step_time_sec": 1.2550924876704812, "perf/tokens_per_sec": 104432.14447349346, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 155 |
+
{"step": 117, "timestamp": 1778391979.4389834, "perf/step_time_sec": 1.6985237197950482, "perf/tokens_per_sec": 77168.18933551059, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 156 |
+
{"step": 118, "timestamp": 1778391980.9512935, "perf/step_time_sec": 1.4708621501922607, "perf/tokens_per_sec": 89112.36174162697, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 157 |
+
{"step": 119, "timestamp": 1778391982.0781765, "perf/step_time_sec": 1.0858142850920558, "perf/tokens_per_sec": 120713.09228436579, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 158 |
+
{"step": 120, "timestamp": 1778391983.6073582, "turn/im_end_count": 184, "turn/im_end_acc_top1": 0.3967391304347826, "turn/im_end_acc_top5": 0.5760869565217391, "turn/im_start_count": 144, "turn/im_start_acc_top1": 0.0, "turn/im_start_acc_top5": 0.013888888888888888, "perf/step_time_sec": 1.4879436632618308, "perf/tokens_per_sec": 88089.35663106183, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 159 |
+
{"step": 120, "timestamp": 1778391983.6086771, "loss/perplexity": 10.996194382834036, "loss/slope": -0.005267939134077591, "loss/relative_slope": -0.0021972183124360795, "loss/window_mean": 2.6049827257792155, "loss/window_median": 2.6181339263916015, "loss/window_std": 0.1916621074355409, "loss/window_min": 2.3975492477416993, "loss/window_max": 3.0151130676269533, "optim/grad_norm_mean": 0.29398542642593384, "optim/grad_norm_std": 0.11077955529085794, "optim/grad_norm_cv": 0.37681988742652023}
|
| 160 |
+
{"step": 120, "timestamp": 1778391984.7037272, "optim/grad_attn_mean": 0.0015130338856857745, "optim/grad_mlp_mean": 0.004552566760269526, "optim/grad_embed_mean": 0.10384875535964966, "optim/grad_attn_res_mean": 0.006707596784460628, "optim/grad_encoding_mean": 0.046613131041845515, "optim/grad_reasoning_mean": 0.04336729035680441, "optim/grad_decoding_mean": 0.06853657200408633, "optim/grad_attn_mlp_ratio": 0.3323467050330851, "optim/grad_attn_mlp_ratio_slope": -0.000441726037716102, "optim/grad_layer_0": 0.0545381605043076, "optim/grad_layer_12": 0.04448346997378394, "optim/grad_layer_16": 0.046296359243569896, "optim/grad_layer_20": 0.07617831422248855, "optim/grad_layer_24": 0.08077212021453306, "optim/grad_layer_4": 0.03287012681539636, "optim/grad_layer_8": 0.033426328271161765}
|
| 161 |
+
{"step": 120, "timestamp": 1778391984.716573, "weight/attn_drift_mean": 0.0033581490973188743, "weight/mlp_drift_mean": 0.0034883668575204302, "weight/attn_mlp_drift_ratio": 0.9626680930643133}
|
| 162 |
+
{"step": 121, "timestamp": 1778391984.7300868, "perf/step_time_sec": 1.080027999356389, "perf/tokens_per_sec": 121359.81666966829, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 163 |
+
{"step": 122, "timestamp": 1778391985.828414, "perf/step_time_sec": 1.0572870122268796, "perf/tokens_per_sec": 123970.1220995172, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 164 |
+
{"step": 123, "timestamp": 1778391987.242567, "perf/step_time_sec": 1.3730600979179144, "perf/tokens_per_sec": 95459.76916724579, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43}
|
| 165 |
+
{"step": 124, "timestamp": 1778391988.5417264, "perf/step_time_sec": 0.9844554020091891, "perf/tokens_per_sec": 133141.63316336452, "perf/gpu_memory_allocated_gib": 73.75, "perf/gpu_memory_reserved_gib": 101.43}
|