| {"step": 0, "timestamp": 1778387805.4193456, "optim/grad_attn_mean": 0.004245669168790764, "optim/grad_mlp_mean": 0.010739804058435507, "optim/grad_embed_mean": 0.2385062426328659, "optim/grad_attn_res_mean": 0.020370005653954616, "optim/grad_encoding_mean": 0.09780336054649928, "optim/grad_reasoning_mean": 0.12019853417748688, "optim/grad_decoding_mean": 0.2134573094546795, "optim/grad_attn_mlp_ratio": 0.39532054704951197, "optim/grad_layer_0": 0.12529053865000606, "optim/grad_layer_12": 0.1443721361574717, "optim/grad_layer_16": 0.11464086768683046, "optim/grad_layer_20": 0.16567943000700325, "optim/grad_layer_24": 0.1924208600539714, "optim/grad_layer_4": 0.06155146099627018, "optim/grad_layer_8": 0.08534149051411077} |
| {"step": 1, "timestamp": 1778387805.6916964, "perf/step_time_sec": 14.594803935848176, "perf/tokens_per_sec": 8980.73044188399, "perf/gpu_memory_allocated_gib": 97.51, "perf/gpu_memory_reserved_gib": 101.42} |
| {"step": 2, "timestamp": 1778387806.81891, "perf/step_time_sec": 1.0848291777074337, "perf/tokens_per_sec": 120822.70895127846, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 3, "timestamp": 1778387807.930255, "perf/step_time_sec": 1.0638272482901812, "perf/tokens_per_sec": 123207.97404904163, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 4, "timestamp": 1778387809.0232582, "perf/step_time_sec": 1.0514518832787871, "perf/tokens_per_sec": 124658.10569597593, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 5, "timestamp": 1778387810.1358342, "perf/step_time_sec": 1.0639161029830575, "perf/tokens_per_sec": 123197.68413364007, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 6, "timestamp": 1778387812.0633054, "perf/step_time_sec": 1.8786283424124122, "perf/tokens_per_sec": 69770.05352303259, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 7, "timestamp": 1778387813.3775985, "perf/step_time_sec": 1.2661810452118516, "perf/tokens_per_sec": 103517.58186213381, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 8, "timestamp": 1778387814.577105, "perf/step_time_sec": 1.151889594271779, "perf/tokens_per_sec": 113788.68309237857, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 9, "timestamp": 1778387816.032337, "perf/step_time_sec": 1.4067471977323294, "perf/tokens_per_sec": 93173.81275845974, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 10, "timestamp": 1778387817.76404, "turn/im_end_count": 184, "turn/im_end_acc_top1": 0.016304347826086956, "turn/im_end_acc_top5": 0.09239130434782608, "turn/im_start_count": 144, "turn/im_start_acc_top1": 0.006944444444444444, "turn/im_start_acc_top5": 0.1527777777777778, "perf/step_time_sec": 1.6839288659393787, "perf/tokens_per_sec": 77837.01713960558, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 10, "timestamp": 1778387817.765946, "loss/perplexity": 16.982588380197924} |
| {"step": 10, "timestamp": 1778387819.408087, "optim/grad_attn_mean": 0.002805942272302951, "optim/grad_mlp_mean": 0.007694486433008153, "optim/grad_embed_mean": 0.19310292601585388, "optim/grad_attn_res_mean": 0.015575358466824247, "optim/grad_encoding_mean": 0.07529038294807025, "optim/grad_reasoning_mean": 0.08318280090073434, "optim/grad_decoding_mean": 0.14565421853912994, "optim/grad_attn_mlp_ratio": 0.3646687339103713, "optim/grad_layer_0": 0.06998008384834975, "optim/grad_layer_12": 0.10068591937306337, "optim/grad_layer_16": 0.07506245019612834, "optim/grad_layer_20": 0.09822841419372708, "optim/grad_layer_24": 0.19075251737376675, "optim/grad_layer_4": 0.05321074792300351, "optim/grad_layer_8": 0.05532025365391746} |
| {"step": 10, "timestamp": 1778387819.4244957, "weight/attn_drift_mean": 0.0005542777407108499, "weight/mlp_drift_mean": 0.00021698596165316066, "weight/attn_mlp_drift_ratio": 2.554322838490375} |
| {"step": 11, "timestamp": 1778387819.4442053, "perf/step_time_sec": 1.6290641548112035, "perf/tokens_per_sec": 80458.46421265729, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 12, "timestamp": 1778387821.1570752, "perf/step_time_sec": 1.6641259137541056, "perf/tokens_per_sec": 78763.27080582164, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 13, "timestamp": 1778387823.0789208, "perf/step_time_sec": 1.8723092321306467, "perf/tokens_per_sec": 70005.52993633586, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 14, "timestamp": 1778387825.117408, "perf/step_time_sec": 1.989820203743875, "perf/tokens_per_sec": 65871.27809506918, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 15, "timestamp": 1778387827.1903691, "perf/step_time_sec": 2.023644581437111, "perf/tokens_per_sec": 64770.2670727475, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 16, "timestamp": 1778387829.0270703, "perf/step_time_sec": 1.7872896799817681, "perf/tokens_per_sec": 73335.62178982483, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 17, "timestamp": 1778387830.665192, "perf/step_time_sec": 1.58958575502038, "perf/tokens_per_sec": 82456.70268875775, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 18, "timestamp": 1778387832.642268, "perf/step_time_sec": 1.9280737228691578, "perf/tokens_per_sec": 67980.80303949807, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 19, "timestamp": 1778387834.976124, "perf/step_time_sec": 2.283752713352442, "perf/tokens_per_sec": 57393.254196770045, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 20, "timestamp": 1778387838.0393503, "turn/im_end_count": 184, "turn/im_end_acc_top1": 0.09239130434782608, "turn/im_end_acc_top5": 0.33695652173913043, "turn/im_start_count": 144, "turn/im_start_acc_top1": 0.0, "turn/im_start_acc_top5": 0.2152777777777778, "perf/step_time_sec": 3.0134363379329443, "perf/tokens_per_sec": 43495.85831632613, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 20, "timestamp": 1778387838.0425663, "loss/perplexity": 15.112805213065954} |
| {"step": 20, "timestamp": 1778387839.3542879, "optim/grad_attn_mean": 0.002064355759232837, "optim/grad_mlp_mean": 0.005572739297349472, "optim/grad_embed_mean": 0.1447547972202301, "optim/grad_attn_res_mean": 0.010611715801483993, "optim/grad_encoding_mean": 0.05934383376896019, "optim/grad_reasoning_mean": 0.0657717383841777, "optim/grad_decoding_mean": 0.09146494309825356, "optim/grad_attn_mlp_ratio": 0.3704375792061366, "optim/grad_layer_0": 0.08500559546519071, "optim/grad_layer_12": 0.08116429793881252, "optim/grad_layer_16": 0.059295783430570737, "optim/grad_layer_20": 0.08053597575053573, "optim/grad_layer_24": 0.13789449707837775, "optim/grad_layer_4": 0.038532577629666775, "optim/grad_layer_8": 0.05521802557632327} |
| {"step": 20, "timestamp": 1778387839.3686378, "weight/attn_drift_mean": 0.0008536548717050327, "weight/mlp_drift_mean": 0.0005237206633101192, "weight/attn_mlp_drift_ratio": 1.6299501471036724} |
| {"step": 21, "timestamp": 1778387839.3856602, "perf/step_time_sec": 1.2930320771411061, "perf/tokens_per_sec": 101367.94153614517, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 22, "timestamp": 1778387840.5634487, "perf/step_time_sec": 1.129216962493956, "perf/tokens_per_sec": 116073.3537959952, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 23, "timestamp": 1778387841.8868315, "perf/step_time_sec": 1.2746815225109458, "perf/tokens_per_sec": 102827.25346312884, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 24, "timestamp": 1778387843.0786653, "perf/step_time_sec": 1.1434907047078013, "perf/tokens_per_sec": 114624.45602781976, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 25, "timestamp": 1778387844.3831484, "perf/step_time_sec": 1.2560907071456313, "perf/tokens_per_sec": 104349.15189990614, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 25, "timestamp": 1778387865.4089072, "eval/loss": 2.5574700832366943, "eval/perplexity": 12.903132137293005, "overfit/train_eval_gap": -0.15807232856750497, "overfit/train_eval_ratio": 0.9417897738226155} |
| {"step": 26, "timestamp": 1778387866.758634, "perf/step_time_sec": 1.3075668597593904, "perf/tokens_per_sec": 100241.14562227356, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 27, "timestamp": 1778387868.288166, "perf/step_time_sec": 1.481961701065302, "perf/tokens_per_sec": 88444.93073321629, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 28, "timestamp": 1778387869.570562, "perf/step_time_sec": 1.2332221064716578, "perf/tokens_per_sec": 106284.17972088334, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 29, "timestamp": 1778387870.7635705, "perf/step_time_sec": 1.144172583706677, "perf/tokens_per_sec": 114556.14464679566, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 30, "timestamp": 1778387872.7651463, "turn/im_end_count": 184, "turn/im_end_acc_top1": 0.22826086956521738, "turn/im_end_acc_top5": 0.45652173913043476, "turn/im_start_count": 144, "turn/im_start_acc_top1": 0.0, "turn/im_start_acc_top5": 0.11805555555555555, "perf/step_time_sec": 1.9522706866264343, "perf/tokens_per_sec": 67138.23082929921, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 30, "timestamp": 1778387872.7673419, "loss/perplexity": 13.3648725594128} |
| {"step": 30, "timestamp": 1778387874.3851817, "optim/grad_attn_mean": 0.0019635289737584107, "optim/grad_mlp_mean": 0.0055457863168807565, "optim/grad_embed_mean": 0.13382333517074585, "optim/grad_attn_res_mean": 0.008531327434754706, "optim/grad_encoding_mean": 0.05151895184260664, "optim/grad_reasoning_mean": 0.05447837177295393, "optim/grad_decoding_mean": 0.09505185043381062, "optim/grad_attn_mlp_ratio": 0.35405717440102474, "optim/grad_attn_mlp_ratio_slope": -0.0011802127264969634, "optim/grad_layer_0": 0.0718762407777831, "optim/grad_layer_12": 0.06221258104778826, "optim/grad_layer_16": 0.05687790393130854, "optim/grad_layer_20": 0.084640180808492, "optim/grad_layer_24": 0.0910064042545855, "optim/grad_layer_4": 0.03229723055846989, "optim/grad_layer_8": 0.03841014968929812} |
| {"step": 30, "timestamp": 1778387874.4022305, "weight/attn_drift_mean": 0.001175667007085779, "weight/mlp_drift_mean": 0.0008320324998583761, "weight/attn_mlp_drift_ratio": 1.4129891289037424} |
| {"step": 31, "timestamp": 1778387874.4213045, "perf/step_time_sec": 1.6051195142790675, "perf/tokens_per_sec": 81658.71689552689, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 32, "timestamp": 1778387876.3804631, "perf/step_time_sec": 1.9104906069114804, "perf/tokens_per_sec": 68606.4613591021, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 33, "timestamp": 1778387878.2409189, "perf/step_time_sec": 1.8129950566217303, "perf/tokens_per_sec": 72295.8397052857, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 34, "timestamp": 1778387880.4407372, "perf/step_time_sec": 2.1580227399244905, "perf/tokens_per_sec": 60737.080094246936, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 35, "timestamp": 1778387882.3548048, "perf/step_time_sec": 1.8657795721665025, "perf/tokens_per_sec": 70250.52795909972, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 36, "timestamp": 1778387884.6427977, "perf/step_time_sec": 2.238195694051683, "perf/tokens_per_sec": 58561.45660021691, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 37, "timestamp": 1778387885.9907386, "perf/step_time_sec": 1.2996103437617421, "perf/tokens_per_sec": 100854.84516890661, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 38, "timestamp": 1778387887.603697, "perf/step_time_sec": 1.5641895961016417, "perf/tokens_per_sec": 83795.46848199525, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 39, "timestamp": 1778387889.3911662, "perf/step_time_sec": 1.7390564903616905, "perf/tokens_per_sec": 75369.60456801465, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 40, "timestamp": 1778387891.4029756, "turn/im_end_count": 184, "turn/im_end_acc_top1": 0.2826086956521739, "turn/im_end_acc_top5": 0.483695652173913, "turn/im_start_count": 144, "turn/im_start_acc_top1": 0.0, "turn/im_start_acc_top5": 0.04861111111111111, "perf/step_time_sec": 1.9696122566238046, "perf/tokens_per_sec": 66547.10822356277, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 40, "timestamp": 1778387891.404503, "loss/perplexity": 13.211248306365414, "loss/slope": -0.008762725830078125, "loss/relative_slope": -0.0033949991713861463, "loss/window_mean": 2.6803573608398437, "loss/window_median": 2.7155424118041993, "loss/window_std": 0.10228036610514527, "loss/window_min": 2.5810686111450196, "loss/window_max": 2.832188606262207, "optim/grad_norm_mean": 0.41709866747260094, "optim/grad_norm_std": 0.26846542360919734, "optim/grad_norm_cv": 0.6436496794294667} |
| {"step": 40, "timestamp": 1778387892.6891024, "optim/grad_attn_mean": 0.001966256757886909, "optim/grad_mlp_mean": 0.005567996535059397, "optim/grad_embed_mean": 0.12290946394205093, "optim/grad_attn_res_mean": 0.010410823662312938, "optim/grad_encoding_mean": 0.050096599385142326, "optim/grad_reasoning_mean": 0.058506871604347706, "optim/grad_decoding_mean": 0.08867759688582737, "optim/grad_attn_mlp_ratio": 0.3531347791182026, "optim/grad_attn_mlp_ratio_slope": -0.0009498309537196525, "optim/grad_layer_0": 0.046380183135624975, "optim/grad_layer_12": 0.06467829112079926, "optim/grad_layer_16": 0.06501592672429979, "optim/grad_layer_20": 0.07580780133139342, "optim/grad_layer_24": 0.08359756803838536, "optim/grad_layer_4": 0.03601966335554607, "optim/grad_layer_8": 0.043856185307959095} |
| {"step": 40, "timestamp": 1778387892.7035348, "weight/attn_drift_mean": 0.0013710597239969806, "weight/mlp_drift_mean": 0.0011424704915225371, "weight/attn_mlp_drift_ratio": 1.2000727663803041} |
| {"step": 41, "timestamp": 1778387892.7187114, "perf/step_time_sec": 1.2658877754583955, "perf/tokens_per_sec": 103541.56390564481, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 42, "timestamp": 1778387893.8623278, "perf/step_time_sec": 1.096148100681603, "perf/tokens_per_sec": 119575.08289116887, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 43, "timestamp": 1778387895.3296125, "perf/step_time_sec": 1.419101390056312, "perf/tokens_per_sec": 92362.67466047572, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 44, "timestamp": 1778387896.4668548, "perf/step_time_sec": 1.0886477250605822, "perf/tokens_per_sec": 120398.91048567246, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 45, "timestamp": 1778387897.6861782, "perf/step_time_sec": 1.1716395011171699, "perf/tokens_per_sec": 111870.58807339762, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 46, "timestamp": 1778387899.2347283, "perf/step_time_sec": 1.5066011799499393, "perf/tokens_per_sec": 86998.471622301, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 47, "timestamp": 1778387900.5274713, "perf/step_time_sec": 1.2451349440962076, "perf/tokens_per_sec": 105267.30505916351, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 48, "timestamp": 1778387901.737539, "perf/step_time_sec": 1.1626026574522257, "perf/tokens_per_sec": 112740.15172753559, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 49, "timestamp": 1778387903.0448081, "perf/step_time_sec": 1.2591728530824184, "perf/tokens_per_sec": 104093.7308004533, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 50, "timestamp": 1778387904.558889, "turn/im_end_count": 184, "turn/im_end_acc_top1": 0.33695652173913043, "turn/im_end_acc_top5": 0.5380434782608695, "turn/im_start_count": 144, "turn/im_start_acc_top1": 0.0, "turn/im_start_acc_top5": 0.04861111111111111, "perf/step_time_sec": 1.4629031494259834, "perf/tokens_per_sec": 89597.1821862782, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43, "system/disk_free_gib": 77.47, "attn_res/attn_query_norm_mean": 0.972218177148274, "attn_res/attn_query_norm_std": 0.29970614313795296, "attn_res/mlp_query_norm_mean": 0.8010603615215847, "attn_res/mlp_query_norm_std": 0.15063653640315902, "attn_res/final_query_norm": 0.9122786521911621} |
| {"step": 50, "timestamp": 1778387906.2556422, "loss/perplexity": 13.462199888334347, "loss/slope": -0.00599079513549805, "loss/relative_slope": -0.002304253220695348, "loss/window_mean": 2.664263038635254, "loss/window_median": 2.5998857498168944, "loss/window_std": 0.09698003640637932, "loss/window_min": 2.5810686111450196, "loss/window_max": 2.832188606262207, "optim/grad_norm_mean": 0.37471914291381836, "optim/grad_norm_std": 0.2546429723087728, "optim/grad_norm_cv": 0.6795568817986386} |
| {"step": 50, "timestamp": 1778387925.2363458, "eval/loss": 2.47749400138855, "eval/perplexity": 11.911377090240862, "overfit/train_eval_gap": -0.12239174842834455, "overfit/train_eval_ratio": 0.9529241783158332} |
| {"step": 50, "timestamp": 1778387927.059854, "optim/grad_attn_mean": 0.0017790931411508033, "optim/grad_mlp_mean": 0.005020586134508319, "optim/grad_embed_mean": 0.12452990561723709, "optim/grad_attn_res_mean": 0.009387188089915486, "optim/grad_encoding_mean": 0.056458187612911895, "optim/grad_reasoning_mean": 0.050922929710294634, "optim/grad_decoding_mean": 0.08326425578561611, "optim/grad_attn_mlp_ratio": 0.35435894333791795, "optim/grad_attn_mlp_ratio_slope": -0.0007308293935416799, "optim/grad_layer_0": 0.05205837404355407, "optim/grad_layer_12": 0.06020081735914573, "optim/grad_layer_16": 0.06490260473219678, "optim/grad_layer_20": 0.05665968335233629, "optim/grad_layer_24": 0.1300416239828337, "optim/grad_layer_4": 0.033157684447360225, "optim/grad_layer_8": 0.03560964981443249} |
| {"step": 50, "timestamp": 1778387927.079882, "weight/attn_drift_mean": 0.001636672311085462, "weight/mlp_drift_mean": 0.0014433242555630592, "weight/attn_mlp_drift_ratio": 1.1339523778204652} |
| {"step": 51, "timestamp": 1778387927.106571, "perf/step_time_sec": 1.8271464556455612, "perf/tokens_per_sec": 71735.90250251182, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 52, "timestamp": 1778387928.6846523, "perf/step_time_sec": 1.5292534539476037, "perf/tokens_per_sec": 85709.79497326077, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 53, "timestamp": 1778387930.2665522, "perf/step_time_sec": 1.5324156265705824, "perf/tokens_per_sec": 85532.93096686056, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 54, "timestamp": 1778387931.8860269, "perf/step_time_sec": 1.5709617994725704, "perf/tokens_per_sec": 83434.23757599051, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 55, "timestamp": 1778387933.473116, "perf/step_time_sec": 1.5396822011098266, "perf/tokens_per_sec": 85129.25583313315, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 56, "timestamp": 1778387935.442581, "perf/step_time_sec": 1.9214994218200445, "perf/tokens_per_sec": 68213.39549290553, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 57, "timestamp": 1778387937.0424252, "perf/step_time_sec": 1.5507875913754106, "perf/tokens_per_sec": 84519.63423549889, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 58, "timestamp": 1778387938.5737998, "perf/step_time_sec": 1.4828864531591535, "perf/tokens_per_sec": 88389.77503689722, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 59, "timestamp": 1778387940.075698, "perf/step_time_sec": 1.4600971341133118, "perf/tokens_per_sec": 89769.37009029707, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 60, "timestamp": 1778387941.5858612, "turn/im_end_count": 184, "turn/im_end_acc_top1": 0.3641304347826087, "turn/im_end_acc_top5": 0.5271739130434783, "turn/im_start_count": 144, "turn/im_start_acc_top1": 0.0, "turn/im_start_acc_top5": 0.041666666666666664, "perf/step_time_sec": 1.4620114220306277, "perf/tokens_per_sec": 89651.83036528573, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 60, "timestamp": 1778387941.587219, "loss/perplexity": 12.396853659126727, "loss/slope": -0.005520744868687223, "loss/relative_slope": -0.0021929972188190956, "loss/window_mean": 2.6397929827372235, "loss/window_median": 2.5998857498168944, "loss/window_std": 0.10407460975705166, "loss/window_min": 2.5174427032470703, "loss/window_max": 2.832188606262207, "optim/grad_norm_mean": 0.37724167108535767, "optim/grad_norm_std": 0.23252459052127458, "optim/grad_norm_cv": 0.6163809789419095} |
| {"step": 60, "timestamp": 1778387942.7025242, "optim/grad_attn_mean": 0.001924609875737699, "optim/grad_mlp_mean": 0.005302116007702092, "optim/grad_embed_mean": 0.15040066838264465, "optim/grad_attn_res_mean": 0.007617318284087161, "optim/grad_encoding_mean": 0.04756541916220966, "optim/grad_reasoning_mean": 0.052573222553797275, "optim/grad_decoding_mean": 0.08181709398922976, "optim/grad_attn_mlp_ratio": 0.3629883320279317, "optim/grad_attn_mlp_ratio_slope": -0.0004818536653485056, "optim/grad_layer_0": 0.04535449342802167, "optim/grad_layer_12": 0.05966261652065441, "optim/grad_layer_16": 0.054252392175840214, "optim/grad_layer_20": 0.06805584981339052, "optim/grad_layer_24": 0.09669328277232125, "optim/grad_layer_4": 0.03458810056326911, "optim/grad_layer_8": 0.044237920752493665} |
| {"step": 60, "timestamp": 1778387942.7166183, "weight/attn_drift_mean": 0.0018430310578621326, "weight/mlp_drift_mean": 0.0017418102962300924, "weight/attn_mlp_drift_ratio": 1.0581063166223839} |
| {"step": 61, "timestamp": 1778387942.7309597, "perf/step_time_sec": 1.0954304654151201, "perf/tokens_per_sec": 119653.418576714, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 62, "timestamp": 1778387944.109567, "perf/step_time_sec": 1.225368862040341, "perf/tokens_per_sec": 106965.34248614269, "perf/gpu_memory_allocated_gib": 73.75, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 63, "timestamp": 1778387956.0232978, "perf/step_time_sec": 1.9023579843342304, "perf/tokens_per_sec": 68899.75550310072, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 64, "timestamp": 1778387957.6823936, "perf/step_time_sec": 1.6097943810746074, "perf/tokens_per_sec": 81421.57876865228, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 65, "timestamp": 1778387959.2818918, "perf/step_time_sec": 1.404926653020084, "perf/tokens_per_sec": 93294.55008789436, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 66, "timestamp": 1778387960.9523115, "perf/step_time_sec": 1.622287828475237, "perf/tokens_per_sec": 80794.54070933427, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 67, "timestamp": 1778387962.4964318, "perf/step_time_sec": 1.4952811226248741, "perf/tokens_per_sec": 87657.0953894684, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 68, "timestamp": 1778387963.6753654, "perf/step_time_sec": 1.136628782376647, "perf/tokens_per_sec": 115316.45338589218, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 69, "timestamp": 1778387964.979624, "perf/step_time_sec": 1.2558399140834808, "perf/tokens_per_sec": 104369.99057770599, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 70, "timestamp": 1778387966.671921, "turn/im_end_count": 184, "turn/im_end_acc_top1": 0.3695652173913043, "turn/im_end_acc_top5": 0.5434782608695652, "turn/im_start_count": 144, "turn/im_start_acc_top1": 0.0, "turn/im_start_acc_top5": 0.006944444444444444, "perf/step_time_sec": 1.6423691203817725, "perf/tokens_per_sec": 79806.66366250969, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 70, "timestamp": 1778387966.6754322, "loss/perplexity": 10.767845312828422, "loss/slope": -0.006270771707807271, "loss/relative_slope": -0.0026385868870338193, "loss/window_mean": 2.602188900538853, "loss/window_median": 2.592629814147949, "loss/window_std": 0.1332987837752522, "loss/window_min": 2.3765644073486327, "loss/window_max": 2.832188606262207, "optim/grad_norm_mean": 0.35279604366847445, "optim/grad_norm_std": 0.2234485676117921, "optim/grad_norm_cv": 0.6333647205572653} |
| {"step": 70, "timestamp": 1778387968.2749171, "optim/grad_attn_mean": 0.0016113156356257197, "optim/grad_mlp_mean": 0.004591165214411116, "optim/grad_embed_mean": 0.10926338285207748, "optim/grad_attn_res_mean": 0.006957788237821404, "optim/grad_encoding_mean": 0.04079967819739573, "optim/grad_reasoning_mean": 0.049956963709620245, "optim/grad_decoding_mean": 0.0698424332862487, "optim/grad_attn_mlp_ratio": 0.35095929917203, "optim/grad_attn_mlp_ratio_slope": -0.00043820124695482143, "optim/grad_layer_0": 0.03479381604120135, "optim/grad_layer_12": 0.05463982524815947, "optim/grad_layer_16": 0.05920767830684781, "optim/grad_layer_20": 0.05861565284430981, "optim/grad_layer_24": 0.07555905191111378, "optim/grad_layer_4": 0.03430591034702957, "optim/grad_layer_8": 0.041817024524789304} |
| {"step": 70, "timestamp": 1778387968.2907467, "weight/attn_drift_mean": 0.002052091090369654, "weight/mlp_drift_mean": 0.0020255629605360277, "weight/attn_mlp_drift_ratio": 1.0130916685551572} |
| {"step": 71, "timestamp": 1778387968.3097699, "perf/step_time_sec": 1.5846897568553686, "perf/tokens_per_sec": 82711.45783140358, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 72, "timestamp": 1778387969.4587173, "perf/step_time_sec": 1.1008769683539867, "perf/tokens_per_sec": 119061.4426205834, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 73, "timestamp": 1778387970.7738726, "perf/step_time_sec": 1.2673306139186025, "perf/tokens_per_sec": 103423.68326030072, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 74, "timestamp": 1778387972.1058364, "perf/step_time_sec": 1.2879648990929127, "perf/tokens_per_sec": 101766.7485288701, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 75, "timestamp": 1778387973.7811298, "perf/step_time_sec": 1.6246182220056653, "perf/tokens_per_sec": 80678.6469735552, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 75, "timestamp": 1778387997.9535737, "eval/loss": 2.433123826980591, "eval/perplexity": 11.394420741727767, "eval/loss_slope": -0.0024869251251220703, "overfit/train_eval_gap": 0.0565594196319581, "overfit/train_eval_ratio": 1.0237988119400767} |
| {"step": 76, "timestamp": 1778387999.1223755, "perf/step_time_sec": 1.1206639958545566, "perf/tokens_per_sec": 116959.23174550792, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 77, "timestamp": 1778388000.412062, "perf/step_time_sec": 1.2410970097407699, "perf/tokens_per_sec": 105609.79437649055, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 78, "timestamp": 1778388001.817967, "perf/step_time_sec": 1.3574987752363086, "perf/tokens_per_sec": 96554.04659734109, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 79, "timestamp": 1778388002.9797606, "perf/step_time_sec": 1.1125546069815755, "perf/tokens_per_sec": 117811.74530893892, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 80, "timestamp": 1778388004.588938, "turn/im_end_count": 184, "turn/im_end_acc_top1": 0.3532608695652174, "turn/im_end_acc_top5": 0.5380434782608695, "turn/im_start_count": 144, "turn/im_start_acc_top1": 0.0, "turn/im_start_acc_top5": 0.006944444444444444, "perf/step_time_sec": 1.5674451747909188, "perf/tokens_per_sec": 83621.42555798398, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 80, "timestamp": 1778388004.5904472, "loss/perplexity": 10.78992988659153, "loss/slope": -0.006043644632611958, "loss/relative_slope": -0.0025408268989516682, "loss/window_mean": 2.5742419481277468, "loss/window_median": 2.592629814147949, "loss/window_std": 0.14496454766578926, "loss/window_min": 2.3765644073486327, "loss/window_max": 2.832188606262207, "optim/grad_norm_mean": 0.33775643445551395, "optim/grad_norm_std": 0.212770847365755, "optim/grad_norm_cv": 0.629953498025155} |
| {"step": 80, "timestamp": 1778388005.9438548, "optim/grad_attn_mean": 0.0017612044248380698, "optim/grad_mlp_mean": 0.005022599125887999, "optim/grad_embed_mean": 0.10495331138372421, "optim/grad_attn_res_mean": 0.00968674501117468, "optim/grad_encoding_mean": 0.0581580371597536, "optim/grad_reasoning_mean": 0.050439721681565665, "optim/grad_decoding_mean": 0.07736918785376475, "optim/grad_attn_mlp_ratio": 0.3506552830799248, "optim/grad_attn_mlp_ratio_slope": -0.0003906434758548156, "optim/grad_layer_0": 0.061383603722788393, "optim/grad_layer_12": 0.0624060588888824, "optim/grad_layer_16": 0.05508934936369769, "optim/grad_layer_20": 0.06813788216095418, "optim/grad_layer_24": 0.0846796155674383, "optim/grad_layer_4": 0.03813040745444596, "optim/grad_layer_8": 0.04260218242416158} |
| {"step": 80, "timestamp": 1778388005.9587686, "weight/attn_drift_mean": 0.0022652356011666802, "weight/mlp_drift_mean": 0.0023015744229821352, "weight/attn_mlp_drift_ratio": 0.9842070438726909} |
| {"step": 81, "timestamp": 1778388005.9759529, "perf/step_time_sec": 1.336971390992403, "perf/tokens_per_sec": 98036.5031616034, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 82, "timestamp": 1778388007.18746, "perf/step_time_sec": 1.1633538575842977, "perf/tokens_per_sec": 112667.3532266191, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 83, "timestamp": 1778388008.5243037, "perf/step_time_sec": 1.2885813545435667, "perf/tokens_per_sec": 101718.06346400807, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 84, "timestamp": 1778388010.0226886, "perf/step_time_sec": 1.456012548878789, "perf/tokens_per_sec": 90021.2021530534, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 85, "timestamp": 1778388011.335323, "perf/step_time_sec": 1.2703841710463166, "perf/tokens_per_sec": 103175.08906935308, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 86, "timestamp": 1778388012.9917123, "perf/step_time_sec": 1.6077986750751734, "perf/tokens_per_sec": 81522.64461461361, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 87, "timestamp": 1778388014.5412443, "perf/step_time_sec": 1.5004536425694823, "perf/tokens_per_sec": 87354.91472801725, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 88, "timestamp": 1778388015.9932036, "perf/step_time_sec": 1.403391228057444, "perf/tokens_per_sec": 93396.62196793703, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 89, "timestamp": 1778388017.3157752, "perf/step_time_sec": 1.119550496339798, "perf/tokens_per_sec": 117075.55883233511, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 90, "timestamp": 1778388019.0868165, "turn/im_end_count": 184, "turn/im_end_acc_top1": 0.3804347826086957, "turn/im_end_acc_top5": 0.5434782608695652, "turn/im_start_count": 144, "turn/im_start_acc_top1": 0.0, "turn/im_start_acc_top5": 0.013888888888888888, "perf/step_time_sec": 1.7280838955193758, "perf/tokens_per_sec": 75848.16937409529, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 90, "timestamp": 1778388019.0888946, "loss/perplexity": 10.871377137650416, "loss/slope": -0.005484608332316081, "loss/relative_slope": -0.0022985338307102863, "loss/window_mean": 2.553340996636285, "loss/window_median": 2.5810686111450196, "loss/window_std": 0.1489112130370394, "loss/window_min": 2.3765644073486327, "loss/window_max": 2.832188606262207, "optim/grad_norm_mean": 0.3226619495285882, "optim/grad_norm_std": 0.20509515356021482, "optim/grad_norm_cv": 0.6356347684003663} |
| {"step": 90, "timestamp": 1778388020.5072036, "optim/grad_attn_mean": 0.0014703716559760862, "optim/grad_mlp_mean": 0.004293577865090421, "optim/grad_embed_mean": 0.09539306908845901, "optim/grad_attn_res_mean": 0.007850330131827433, "optim/grad_encoding_mean": 0.049249682771915104, "optim/grad_reasoning_mean": 0.042706369055344515, "optim/grad_decoding_mean": 0.0687270048278151, "optim/grad_attn_mlp_ratio": 0.3424575674649949, "optim/grad_attn_mlp_ratio_slope": -0.0003898392334205311, "optim/grad_layer_0": 0.051295708341058344, "optim/grad_layer_12": 0.05261199361120816, "optim/grad_layer_16": 0.04614755482180044, "optim/grad_layer_20": 0.04878749107592739, "optim/grad_layer_24": 0.10923500789795071, "optim/grad_layer_4": 0.0335238277039025, "optim/grad_layer_8": 0.04758687628782354} |
| {"step": 90, "timestamp": 1778388020.5212252, "weight/attn_drift_mean": 0.0025312722313165625, "weight/mlp_drift_mean": 0.002578348067726738, "weight/attn_mlp_drift_ratio": 0.9817380537639251} |
| {"step": 91, "timestamp": 1778388020.5366678, "perf/step_time_sec": 1.4053952191025019, "perf/tokens_per_sec": 93263.44519921148, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 92, "timestamp": 1778388021.9077678, "perf/step_time_sec": 1.3239262448623776, "perf/tokens_per_sec": 99002.49391432297, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 93, "timestamp": 1778388023.6944604, "perf/step_time_sec": 1.7394245518371463, "perf/tokens_per_sec": 75353.65639260657, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 94, "timestamp": 1778388025.420808, "perf/step_time_sec": 1.6763059785589576, "perf/tokens_per_sec": 78190.97567896078, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 95, "timestamp": 1778388027.3776026, "perf/step_time_sec": 1.9072397677227855, "perf/tokens_per_sec": 68723.3992381031, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 96, "timestamp": 1778388028.9099941, "perf/step_time_sec": 1.4889203859493136, "perf/tokens_per_sec": 88031.57055065132, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 97, "timestamp": 1778388030.6158834, "perf/step_time_sec": 1.6632148390635848, "perf/tokens_per_sec": 78806.41569660089, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 98, "timestamp": 1778388032.221663, "perf/step_time_sec": 1.5561007987707853, "perf/tokens_per_sec": 84231.04730974886, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 99, "timestamp": 1778388033.9356542, "perf/step_time_sec": 1.6642546625807881, "perf/tokens_per_sec": 78757.17758046981, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 100, "timestamp": 1778388035.7506464, "turn/im_end_count": 184, "turn/im_end_acc_top1": 0.3858695652173913, "turn/im_end_acc_top5": 0.5434782608695652, "turn/im_start_count": 144, "turn/im_start_acc_top1": 0.0, "turn/im_start_acc_top5": 0.027777777777777776, "perf/step_time_sec": 1.7621817849576473, "perf/tokens_per_sec": 74380.52141887859, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43, "system/disk_free_gib": 75.67, "attn_res/attn_query_norm_mean": 0.9723120480775833, "attn_res/attn_query_norm_std": 0.29945521266472575, "attn_res/mlp_query_norm_mean": 0.8019473850727081, "attn_res/mlp_query_norm_std": 0.14998114920222716, "attn_res/final_query_norm": 0.9136019349098206} |
| {"step": 100, "timestamp": 1778388038.372969, "loss/perplexity": 10.29485689243895, "loss/slope": -0.005198060006806345, "loss/relative_slope": -0.002229353625284184, "loss/window_mean": 2.531171340942383, "loss/window_median": 2.5810686111450196, "loss/window_std": 0.15614268197268066, "loss/window_min": 2.3316444396972655, "loss/window_max": 2.832188606262207, "optim/grad_norm_mean": 0.32321807742118835, "optim/grad_norm_std": 0.19457749951265085, "optim/grad_norm_cv": 0.6020006710797156} |
| {"step": 100, "timestamp": 1778388058.5039742, "eval/loss": 2.401432991027832, "eval/perplexity": 11.038983816745327, "eval/loss_slope": -0.0020499258041381837, "overfit/train_eval_gap": 0.0697885513305665, "overfit/train_eval_ratio": 1.0299310391597774} |
| {"step": 100, "timestamp": 1778388059.854374, "optim/grad_attn_mean": 0.0015654046471113082, "optim/grad_mlp_mean": 0.00447613289621326, "optim/grad_embed_mean": 0.09111861884593964, "optim/grad_attn_res_mean": 0.007374073665950594, "optim/grad_encoding_mean": 0.04458240288820687, "optim/grad_reasoning_mean": 0.05001007332531218, "optim/grad_decoding_mean": 0.06711307096702512, "optim/grad_attn_mlp_ratio": 0.34972177685292705, "optim/grad_attn_mlp_ratio_slope": -0.00033866145699211475, "optim/grad_layer_0": 0.0365140998037532, "optim/grad_layer_12": 0.05449590180069208, "optim/grad_layer_16": 0.051187760051107034, "optim/grad_layer_20": 0.05607781244907528, "optim/grad_layer_24": 0.10554599060560577, "optim/grad_layer_4": 0.028873976800241508, "optim/grad_layer_8": 0.03850291622802615} |
| {"step": 100, "timestamp": 1778388059.8688915, "weight/attn_drift_mean": 0.002723062883546985, "weight/mlp_drift_mean": 0.002853097337908269, "weight/attn_mlp_drift_ratio": 0.9544200624233701} |
| {"step": 101, "timestamp": 1778388059.8867538, "perf/step_time_sec": 1.3352875653654337, "perf/tokens_per_sec": 98160.12924836081, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 102, "timestamp": 1778388061.183187, "perf/step_time_sec": 1.2471230393275619, "perf/tokens_per_sec": 105099.4936880269, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 103, "timestamp": 1778388062.5283847, "perf/step_time_sec": 1.3030532952398062, "perf/tokens_per_sec": 100588.36463467772, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 104, "timestamp": 1778388063.8351727, "perf/step_time_sec": 1.2564629325643182, "perf/tokens_per_sec": 104318.238606924, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 105, "timestamp": 1778388065.1544695, "perf/step_time_sec": 1.2706075385212898, "perf/tokens_per_sec": 103156.95132152233, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 106, "timestamp": 1778388066.2898667, "perf/step_time_sec": 1.087289486080408, "perf/tokens_per_sec": 120549.31246737619, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 107, "timestamp": 1778388068.0306928, "perf/step_time_sec": 1.6984104756265879, "perf/tokens_per_sec": 77173.3346449386, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 108, "timestamp": 1778388069.7267277, "perf/step_time_sec": 1.6487473342567682, "perf/tokens_per_sec": 79497.92989905599, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 109, "timestamp": 1778388071.0539064, "perf/step_time_sec": 1.2800576286390424, "perf/tokens_per_sec": 102395.38991643352, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 110, "timestamp": 1778388073.1140654, "turn/im_end_count": 184, "turn/im_end_acc_top1": 0.44565217391304346, "turn/im_end_acc_top5": 0.5543478260869565, "turn/im_start_count": 144, "turn/im_start_acc_top1": 0.0, "turn/im_start_acc_top5": 0.020833333333333332, "perf/step_time_sec": 2.01228270214051, "perf/tokens_per_sec": 65135.977097341136, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 110, "timestamp": 1778388073.115806, "loss/perplexity": 10.236712958487988, "loss/slope": -0.00483123033696955, "loss/relative_slope": -0.002077072518492252, "loss/window_mean": 2.512517634305087, "loss/window_median": 2.5174427032470703, "loss/window_std": 0.1601366053718879, "loss/window_min": 2.325980567932129, "loss/window_max": 2.832188606262207, "optim/grad_norm_mean": 0.3153070685538379, "optim/grad_norm_std": 0.1872014665610966, "optim/grad_norm_cv": 0.593711607607466} |
| {"step": 110, "timestamp": 1778388074.4280617, "optim/grad_attn_mean": 0.0016032862919501541, "optim/grad_mlp_mean": 0.004598388872441969, "optim/grad_embed_mean": 0.09722605347633362, "optim/grad_attn_res_mean": 0.007191502939812173, "optim/grad_encoding_mean": 0.04881696830286981, "optim/grad_reasoning_mean": 0.04909831257989734, "optim/grad_decoding_mean": 0.06794906024297234, "optim/grad_attn_mlp_ratio": 0.348661857404017, "optim/grad_attn_mlp_ratio_slope": -0.0003001869893564472, "optim/grad_layer_0": 0.048961493477690965, "optim/grad_layer_12": 0.05699628253933042, "optim/grad_layer_16": 0.05266229767585173, "optim/grad_layer_20": 0.05921280803158879, "optim/grad_layer_24": 0.09072956049931236, "optim/grad_layer_4": 0.03301451978040859, "optim/grad_layer_8": 0.04490721330512315} |
| {"step": 110, "timestamp": 1778388074.4434657, "weight/attn_drift_mean": 0.0030271188559815097, "weight/mlp_drift_mean": 0.0031320026233185435, "weight/attn_mlp_drift_ratio": 0.9665091492428619} |
| {"step": 111, "timestamp": 1778388074.4593098, "perf/step_time_sec": 1.2950584208592772, "perf/tokens_per_sec": 101209.33379440375, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 112, "timestamp": 1778388075.933768, "perf/step_time_sec": 1.4321300247684121, "perf/tokens_per_sec": 91522.41607475234, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 113, "timestamp": 1778388077.276876, "perf/step_time_sec": 1.2946642693132162, "perf/tokens_per_sec": 101240.14627323429, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 114, "timestamp": 1778388080.0860791, "perf/step_time_sec": 2.760446559637785, "perf/tokens_per_sec": 47482.17260079788, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 115, "timestamp": 1778388082.0883126, "perf/step_time_sec": 1.9522073604166508, "perf/tokens_per_sec": 67140.4086766817, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 116, "timestamp": 1778388083.6966913, "perf/step_time_sec": 1.2800091346725821, "perf/tokens_per_sec": 102399.26923140853, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 117, "timestamp": 1778388085.8777409, "perf/step_time_sec": 2.131965051405132, "perf/tokens_per_sec": 61479.43181039168, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 118, "timestamp": 1778388087.4455304, "perf/step_time_sec": 1.5249635558575392, "perf/tokens_per_sec": 85950.90649644655, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 119, "timestamp": 1778388089.0246484, "perf/step_time_sec": 1.5307293292135, "perf/tokens_per_sec": 85627.15660994472, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 120, "timestamp": 1778388091.248632, "turn/im_end_count": 184, "turn/im_end_acc_top1": 0.42391304347826086, "turn/im_end_acc_top5": 0.5597826086956522, "turn/im_start_count": 144, "turn/im_start_acc_top1": 0.0, "turn/im_start_acc_top5": 0.020833333333333332, "perf/step_time_sec": 2.1751573337242007, "perf/tokens_per_sec": 60258.6295565042, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 120, "timestamp": 1778388091.2509954, "loss/perplexity": 10.166646346519252, "loss/slope": -0.004460197328687548, "loss/relative_slope": -0.0019232346464475429, "loss/window_mean": 2.4964005311330157, "loss/window_median": 2.5174427032470703, "loss/window_std": 0.16237033547710242, "loss/window_min": 2.3191123962402345, "loss/window_max": 2.832188606262207, "optim/grad_norm_mean": 0.3059195689857006, "optim/grad_norm_std": 0.18191591451904224, "optim/grad_norm_cv": 0.5946527550434193} |
| {"step": 120, "timestamp": 1778388092.7950778, "optim/grad_attn_mean": 0.0015867151550435442, "optim/grad_mlp_mean": 0.0047132576177578555, "optim/grad_embed_mean": 0.10161958634853363, "optim/grad_attn_res_mean": 0.007007830766828224, "optim/grad_encoding_mean": 0.047784059025515385, "optim/grad_reasoning_mean": 0.04870217228987409, "optim/grad_decoding_mean": 0.0686742081947159, "optim/grad_attn_mlp_ratio": 0.3366486445762991, "optim/grad_attn_mlp_ratio_slope": -0.0003066409141947363, "optim/grad_layer_0": 0.05283569125458598, "optim/grad_layer_12": 0.04811211673950311, "optim/grad_layer_16": 0.0454462499183137, "optim/grad_layer_20": 0.05357791410642676, "optim/grad_layer_24": 0.09659254614962265, "optim/grad_layer_4": 0.03284201241331175, "optim/grad_layer_8": 0.042798117065103725} |
| {"step": 120, "timestamp": 1778388092.8109953, "weight/attn_drift_mean": 0.0032353522475322897, "weight/mlp_drift_mean": 0.003407749098393873, "weight/attn_mlp_drift_ratio": 0.9494075590780929} |
| {"step": 121, "timestamp": 1778388092.8292186, "perf/step_time_sec": 1.5351424515247345, "perf/tokens_per_sec": 85381.00152843578, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 122, "timestamp": 1778388094.2897866, "perf/step_time_sec": 1.4117135349661112, "perf/tokens_per_sec": 92846.03197003876, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 123, "timestamp": 1778388095.5582414, "perf/step_time_sec": 1.2204844141378999, "perf/tokens_per_sec": 107393.42385833242, "perf/gpu_memory_allocated_gib": 98.03, "perf/gpu_memory_reserved_gib": 101.43} |
| {"step": 124, "timestamp": 1778388096.7707164, "perf/step_time_sec": 0.9786218106746674, "perf/tokens_per_sec": 133935.29407405935, "perf/gpu_memory_allocated_gib": 73.75, "perf/gpu_memory_reserved_gib": 101.43} |
|
|