diff --git "a/checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/trainer_state.json" "b/checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-3093/trainer_state.json" @@ -0,0 +1,4432 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.000969696969697, + "eval_steps": 516, + "global_step": 3093, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0, + "eval_loss": 0.8898435831069946, + "eval_ppl": 2.43475, + "eval_runtime": 12.6383, + "eval_samples_per_second": 15.825, + "eval_steps_per_second": 7.912, + "memory/device_reserved (GiB)": 13.84, + "memory/max_active (GiB)": 13.69, + "memory/max_allocated (GiB)": 13.69, + "step": 0 + }, + { + "epoch": 0.009696969696969697, + "grad_norm": 2.995619058609009, + "learning_rate": 3.4951456310679615e-06, + "loss": 0.8680612564086914, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 2.38229, + "step": 10, + "tokens/total": 163840, + "tokens/train_per_sec_per_gpu": 14.27, + "tokens/trainable": 51990 + }, + { + "epoch": 0.019393939393939394, + "grad_norm": 2.1244935989379883, + "learning_rate": 7.378640776699029e-06, + "loss": 0.7699687004089355, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 2.1597, + "step": 20, + "tokens/total": 327680, + "tokens/train_per_sec_per_gpu": 16.06, + "tokens/trainable": 104391 + }, + { + "epoch": 0.02909090909090909, + "grad_norm": 0.9706138372421265, + "learning_rate": 1.1262135922330098e-05, + "loss": 0.5319457054138184, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.70224, + "step": 30, + "tokens/total": 491520, + "tokens/train_per_sec_per_gpu": 16.48, + "tokens/trainable": 156787 + }, + { + "epoch": 0.03878787878787879, + "grad_norm": 0.7689842581748962, + "learning_rate": 1.5145631067961166e-05, + "loss": 0.30234951972961427, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.35303, + "step": 40, + "tokens/total": 655360, + "tokens/train_per_sec_per_gpu": 14.84, + "tokens/trainable": 208924 + }, + { + "epoch": 0.048484848484848485, + "grad_norm": 0.45850396156311035, + "learning_rate": 1.9029126213592234e-05, + "loss": 0.1519382953643799, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.16409, + "step": 50, + "tokens/total": 819200, + "tokens/train_per_sec_per_gpu": 14.61, + "tokens/trainable": 261170 + }, + { + "epoch": 0.05818181818181818, + "grad_norm": 0.41381561756134033, + "learning_rate": 2.29126213592233e-05, + "loss": 0.062263429164886475, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.06424, + "step": 60, + "tokens/total": 983040, + "tokens/train_per_sec_per_gpu": 14.19, + "tokens/trainable": 313808 + }, + { + "epoch": 0.06787878787878789, + "grad_norm": 0.4865979254245758, + "learning_rate": 2.6796116504854367e-05, + "loss": 0.018695920705795288, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.01887, + "step": 70, + "tokens/total": 1146880, + "tokens/train_per_sec_per_gpu": 14.62, + "tokens/trainable": 366068 + }, + { + "epoch": 0.07757575757575758, + "grad_norm": 0.39099738001823425, + "learning_rate": 3.067961165048544e-05, + "loss": 0.006136053055524826, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00615, + "step": 80, + "tokens/total": 1310720, + "tokens/train_per_sec_per_gpu": 13.81, + "tokens/trainable": 418120 + }, + { + "epoch": 0.08727272727272728, + "grad_norm": 0.08230593055486679, + "learning_rate": 3.456310679611651e-05, + "loss": 0.004204501211643219, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00421, + "step": 90, + "tokens/total": 1474560, + "tokens/train_per_sec_per_gpu": 15.07, + "tokens/trainable": 470244 + }, + { + "epoch": 0.09696969696969697, + "grad_norm": 0.13297680020332336, + "learning_rate": 3.844660194174757e-05, + "loss": 0.0036250378936529158, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00363, + "step": 100, + "tokens/total": 1638400, + "tokens/train_per_sec_per_gpu": 14.91, + "tokens/trainable": 522666 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 0.2430051565170288, + "learning_rate": 4.2330097087378647e-05, + "loss": 0.003873714804649353, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00388, + "step": 110, + "tokens/total": 1802240, + "tokens/train_per_sec_per_gpu": 14.17, + "tokens/trainable": 574329 + }, + { + "epoch": 0.11636363636363636, + "grad_norm": 0.09347938001155853, + "learning_rate": 4.621359223300971e-05, + "loss": 0.00237951148301363, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00238, + "step": 120, + "tokens/total": 1966080, + "tokens/train_per_sec_per_gpu": 14.33, + "tokens/trainable": 626194 + }, + { + "epoch": 0.12606060606060607, + "grad_norm": 0.13388365507125854, + "learning_rate": 5.0097087378640786e-05, + "loss": 0.0015400107949972153, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00154, + "step": 130, + "tokens/total": 2129920, + "tokens/train_per_sec_per_gpu": 14.01, + "tokens/trainable": 678140 + }, + { + "epoch": 0.13575757575757577, + "grad_norm": 0.13342970609664917, + "learning_rate": 5.398058252427185e-05, + "loss": 0.001996887102723122, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.002, + "step": 140, + "tokens/total": 2293760, + "tokens/train_per_sec_per_gpu": 14.41, + "tokens/trainable": 730201 + }, + { + "epoch": 0.14545454545454545, + "grad_norm": 0.0299234539270401, + "learning_rate": 5.786407766990292e-05, + "loss": 0.0015132850036025046, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00151, + "step": 150, + "tokens/total": 2457600, + "tokens/train_per_sec_per_gpu": 15.8, + "tokens/trainable": 782196 + }, + { + "epoch": 0.15515151515151515, + "grad_norm": 0.04437975212931633, + "learning_rate": 6.174757281553398e-05, + "loss": 0.0012883609160780907, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00129, + "step": 160, + "tokens/total": 2621440, + "tokens/train_per_sec_per_gpu": 14.64, + "tokens/trainable": 833614 + }, + { + "epoch": 0.16484848484848486, + "grad_norm": 0.014039761386811733, + "learning_rate": 6.563106796116505e-05, + "loss": 0.0011639594100415706, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00116, + "step": 170, + "tokens/total": 2785280, + "tokens/train_per_sec_per_gpu": 13.95, + "tokens/trainable": 885591 + }, + { + "epoch": 0.17454545454545456, + "grad_norm": 0.0033261056523770094, + "learning_rate": 6.951456310679612e-05, + "loss": 0.0007388167083263397, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00074, + "step": 180, + "tokens/total": 2949120, + "tokens/train_per_sec_per_gpu": 14.37, + "tokens/trainable": 937712 + }, + { + "epoch": 0.18424242424242424, + "grad_norm": 0.010476192459464073, + "learning_rate": 7.339805825242719e-05, + "loss": 0.0008642122149467469, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00086, + "step": 190, + "tokens/total": 3112960, + "tokens/train_per_sec_per_gpu": 15.52, + "tokens/trainable": 989913 + }, + { + "epoch": 0.19393939393939394, + "grad_norm": 0.01253255270421505, + "learning_rate": 7.728155339805826e-05, + "loss": 0.0007610846310853958, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00076, + "step": 200, + "tokens/total": 3276800, + "tokens/train_per_sec_per_gpu": 14.17, + "tokens/trainable": 1041978 + }, + { + "epoch": 0.20363636363636364, + "grad_norm": 0.01779557578265667, + "learning_rate": 8.116504854368933e-05, + "loss": 0.0007697530556470156, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00077, + "step": 210, + "tokens/total": 3440640, + "tokens/train_per_sec_per_gpu": 14.12, + "tokens/trainable": 1093395 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 0.16895800828933716, + "learning_rate": 8.504854368932039e-05, + "loss": 0.0006535804830491542, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00065, + "step": 220, + "tokens/total": 3604480, + "tokens/train_per_sec_per_gpu": 14.72, + "tokens/trainable": 1145329 + }, + { + "epoch": 0.22303030303030302, + "grad_norm": 0.08973463624715805, + "learning_rate": 8.893203883495146e-05, + "loss": 0.0009510296396911145, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00095, + "step": 230, + "tokens/total": 3768320, + "tokens/train_per_sec_per_gpu": 14.67, + "tokens/trainable": 1197537 + }, + { + "epoch": 0.23272727272727273, + "grad_norm": 0.044939588755369186, + "learning_rate": 9.281553398058253e-05, + "loss": 0.001187363639473915, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00119, + "step": 240, + "tokens/total": 3932160, + "tokens/train_per_sec_per_gpu": 15.39, + "tokens/trainable": 1249924 + }, + { + "epoch": 0.24242424242424243, + "grad_norm": 0.08850465714931488, + "learning_rate": 9.66990291262136e-05, + "loss": 0.0013382930308580398, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00134, + "step": 250, + "tokens/total": 4096000, + "tokens/train_per_sec_per_gpu": 15.06, + "tokens/trainable": 1301558 + }, + { + "epoch": 0.25212121212121213, + "grad_norm": 0.101528100669384, + "learning_rate": 0.00010058252427184467, + "loss": 0.0008709387853741646, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00087, + "step": 260, + "tokens/total": 4259840, + "tokens/train_per_sec_per_gpu": 15.16, + "tokens/trainable": 1353706 + }, + { + "epoch": 0.26181818181818184, + "grad_norm": 0.08298433572053909, + "learning_rate": 0.00010446601941747574, + "loss": 0.0013300922699272632, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00133, + "step": 270, + "tokens/total": 4423680, + "tokens/train_per_sec_per_gpu": 15.11, + "tokens/trainable": 1405519 + }, + { + "epoch": 0.27151515151515154, + "grad_norm": 0.03734389320015907, + "learning_rate": 0.00010834951456310681, + "loss": 0.0006868645548820495, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00069, + "step": 280, + "tokens/total": 4587520, + "tokens/train_per_sec_per_gpu": 15.07, + "tokens/trainable": 1457494 + }, + { + "epoch": 0.2812121212121212, + "grad_norm": 0.07898428291082382, + "learning_rate": 0.00011223300970873786, + "loss": 0.0013550779782235622, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00136, + "step": 290, + "tokens/total": 4751360, + "tokens/train_per_sec_per_gpu": 14.75, + "tokens/trainable": 1509320 + }, + { + "epoch": 0.2909090909090909, + "grad_norm": 0.06320006400346756, + "learning_rate": 0.00011611650485436893, + "loss": 0.0010121697559952736, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00101, + "step": 300, + "tokens/total": 4915200, + "tokens/train_per_sec_per_gpu": 14.19, + "tokens/trainable": 1561332 + }, + { + "epoch": 0.3006060606060606, + "grad_norm": 0.013749867677688599, + "learning_rate": 0.00012, + "loss": 0.0006499682553112507, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00065, + "step": 310, + "tokens/total": 5079040, + "tokens/train_per_sec_per_gpu": 14.84, + "tokens/trainable": 1613189 + }, + { + "epoch": 0.3103030303030303, + "grad_norm": 0.033964402973651886, + "learning_rate": 0.00012388349514563107, + "loss": 0.0008866124786436558, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00089, + "step": 320, + "tokens/total": 5242880, + "tokens/train_per_sec_per_gpu": 15.78, + "tokens/trainable": 1665681 + }, + { + "epoch": 0.32, + "grad_norm": 0.04327597841620445, + "learning_rate": 0.00012776699029126213, + "loss": 0.0005569641944020987, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00056, + "step": 330, + "tokens/total": 5406720, + "tokens/train_per_sec_per_gpu": 14.92, + "tokens/trainable": 1718317 + }, + { + "epoch": 0.3296969696969697, + "grad_norm": 0.02717934548854828, + "learning_rate": 0.0001316504854368932, + "loss": 0.0003776244120672345, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00038, + "step": 340, + "tokens/total": 5570560, + "tokens/train_per_sec_per_gpu": 14.42, + "tokens/trainable": 1770210 + }, + { + "epoch": 0.3393939393939394, + "grad_norm": 0.0028237912338227034, + "learning_rate": 0.0001355339805825243, + "loss": 0.0005292522720992566, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00053, + "step": 350, + "tokens/total": 5734400, + "tokens/train_per_sec_per_gpu": 16.4, + "tokens/trainable": 1821987 + }, + { + "epoch": 0.3490909090909091, + "grad_norm": 0.0310799703001976, + "learning_rate": 0.00013941747572815535, + "loss": 0.0006786303594708443, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00068, + "step": 360, + "tokens/total": 5898240, + "tokens/train_per_sec_per_gpu": 14.72, + "tokens/trainable": 1874266 + }, + { + "epoch": 0.35878787878787877, + "grad_norm": 0.17325043678283691, + "learning_rate": 0.0001433009708737864, + "loss": 0.0013975565321743487, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0014, + "step": 370, + "tokens/total": 6062080, + "tokens/train_per_sec_per_gpu": 13.73, + "tokens/trainable": 1926124 + }, + { + "epoch": 0.36848484848484847, + "grad_norm": 0.07738752663135529, + "learning_rate": 0.0001471844660194175, + "loss": 0.0006820175796747208, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00068, + "step": 380, + "tokens/total": 6225920, + "tokens/train_per_sec_per_gpu": 14.04, + "tokens/trainable": 1978693 + }, + { + "epoch": 0.3781818181818182, + "grad_norm": 0.10022349655628204, + "learning_rate": 0.00015106796116504855, + "loss": 0.00063879219815135, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00064, + "step": 390, + "tokens/total": 6389760, + "tokens/train_per_sec_per_gpu": 13.34, + "tokens/trainable": 2030378 + }, + { + "epoch": 0.3878787878787879, + "grad_norm": 0.0495997779071331, + "learning_rate": 0.00015495145631067963, + "loss": 0.0021283581852912905, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00213, + "step": 400, + "tokens/total": 6553600, + "tokens/train_per_sec_per_gpu": 15.34, + "tokens/trainable": 2083047 + }, + { + "epoch": 0.3975757575757576, + "grad_norm": 0.07361701130867004, + "learning_rate": 0.0001588349514563107, + "loss": 0.001862115040421486, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00186, + "step": 410, + "tokens/total": 6717440, + "tokens/train_per_sec_per_gpu": 14.23, + "tokens/trainable": 2135527 + }, + { + "epoch": 0.4072727272727273, + "grad_norm": 0.05466209724545479, + "learning_rate": 0.00016271844660194174, + "loss": 0.0011581303551793098, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00116, + "step": 420, + "tokens/total": 6881280, + "tokens/train_per_sec_per_gpu": 14.77, + "tokens/trainable": 2187636 + }, + { + "epoch": 0.416969696969697, + "grad_norm": 0.04331392049789429, + "learning_rate": 0.00016660194174757283, + "loss": 0.0051729224622249605, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00519, + "step": 430, + "tokens/total": 7045120, + "tokens/train_per_sec_per_gpu": 13.76, + "tokens/trainable": 2239006 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 0.05931795388460159, + "learning_rate": 0.00017048543689320388, + "loss": 0.00242764875292778, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00243, + "step": 440, + "tokens/total": 7208960, + "tokens/train_per_sec_per_gpu": 14.59, + "tokens/trainable": 2290540 + }, + { + "epoch": 0.43636363636363634, + "grad_norm": 0.04634418711066246, + "learning_rate": 0.00017436893203883494, + "loss": 0.001389546226710081, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00139, + "step": 450, + "tokens/total": 7372800, + "tokens/train_per_sec_per_gpu": 14.78, + "tokens/trainable": 2341852 + }, + { + "epoch": 0.44606060606060605, + "grad_norm": 0.04817213863134384, + "learning_rate": 0.00017825242718446602, + "loss": 0.001370794139802456, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00137, + "step": 460, + "tokens/total": 7536640, + "tokens/train_per_sec_per_gpu": 13.77, + "tokens/trainable": 2393320 + }, + { + "epoch": 0.45575757575757575, + "grad_norm": 0.011335949413478374, + "learning_rate": 0.00018213592233009708, + "loss": 0.0009715131483972073, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00097, + "step": 470, + "tokens/total": 7700480, + "tokens/train_per_sec_per_gpu": 14.52, + "tokens/trainable": 2445170 + }, + { + "epoch": 0.46545454545454545, + "grad_norm": 0.05298445746302605, + "learning_rate": 0.00018601941747572816, + "loss": 0.0008222623728215694, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00082, + "step": 480, + "tokens/total": 7864320, + "tokens/train_per_sec_per_gpu": 13.87, + "tokens/trainable": 2497473 + }, + { + "epoch": 0.47515151515151516, + "grad_norm": 0.061686884611845016, + "learning_rate": 0.00018990291262135925, + "loss": 0.000748783303424716, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00075, + "step": 490, + "tokens/total": 8028160, + "tokens/train_per_sec_per_gpu": 15.41, + "tokens/trainable": 2549206 + }, + { + "epoch": 0.48484848484848486, + "grad_norm": 0.03281249850988388, + "learning_rate": 0.0001937864077669903, + "loss": 0.0006062469445168972, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00061, + "step": 500, + "tokens/total": 8192000, + "tokens/train_per_sec_per_gpu": 14.49, + "tokens/trainable": 2600583 + }, + { + "epoch": 0.49454545454545457, + "grad_norm": 0.008482079952955246, + "learning_rate": 0.0001976699029126214, + "loss": 0.0008583014830946922, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00086, + "step": 510, + "tokens/total": 8355840, + "tokens/train_per_sec_per_gpu": 13.86, + "tokens/trainable": 2652927 + }, + { + "epoch": 0.5003636363636363, + "eval_loss": 0.0009036393603309989, + "eval_ppl": 1.0009, + "eval_runtime": 12.7872, + "eval_samples_per_second": 15.641, + "eval_steps_per_second": 7.82, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "step": 516 + }, + { + "epoch": 0.5042424242424243, + "grad_norm": 0.04333305358886719, + "learning_rate": 0.0001999996332640321, + "loss": 0.0005093200132250785, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00051, + "step": 520, + "tokens/total": 8519680, + "tokens/train_per_sec_per_gpu": 14.09, + "tokens/trainable": 2705083 + }, + { + "epoch": 0.5139393939393939, + "grad_norm": 0.02485118806362152, + "learning_rate": 0.00019999550751528488, + "loss": 0.0006649125367403031, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00067, + "step": 530, + "tokens/total": 8683520, + "tokens/train_per_sec_per_gpu": 14.44, + "tokens/trainable": 2756975 + }, + { + "epoch": 0.5236363636363637, + "grad_norm": 0.03736363351345062, + "learning_rate": 0.00019998679778759294, + "loss": 0.0006726076360791921, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00067, + "step": 540, + "tokens/total": 8847360, + "tokens/train_per_sec_per_gpu": 14.16, + "tokens/trainable": 2808076 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.05156765505671501, + "learning_rate": 0.0001999735044802263, + "loss": 0.000789718609303236, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00079, + "step": 550, + "tokens/total": 9011200, + "tokens/train_per_sec_per_gpu": 16.36, + "tokens/trainable": 2859893 + }, + { + "epoch": 0.5430303030303031, + "grad_norm": 0.647550106048584, + "learning_rate": 0.00019995562820257474, + "loss": 0.003008325584232807, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00301, + "step": 560, + "tokens/total": 9175040, + "tokens/train_per_sec_per_gpu": 14.21, + "tokens/trainable": 2911399 + }, + { + "epoch": 0.5527272727272727, + "grad_norm": 0.185165673494339, + "learning_rate": 0.00019993316977411993, + "loss": 0.013715097308158874, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.01381, + "step": 570, + "tokens/total": 9338880, + "tokens/train_per_sec_per_gpu": 13.85, + "tokens/trainable": 2962403 + }, + { + "epoch": 0.5624242424242424, + "grad_norm": 0.2401553839445114, + "learning_rate": 0.0001999061302243977, + "loss": 0.009026474505662917, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00907, + "step": 580, + "tokens/total": 9502720, + "tokens/train_per_sec_per_gpu": 14.38, + "tokens/trainable": 3015083 + }, + { + "epoch": 0.5721212121212121, + "grad_norm": 0.08092579245567322, + "learning_rate": 0.000199874510792951, + "loss": 0.005716494470834732, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00573, + "step": 590, + "tokens/total": 9666560, + "tokens/train_per_sec_per_gpu": 16.38, + "tokens/trainable": 3066501 + }, + { + "epoch": 0.5818181818181818, + "grad_norm": 3.418715476989746, + "learning_rate": 0.00019983831292927305, + "loss": 0.048504295945167544, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0497, + "step": 600, + "tokens/total": 9830400, + "tokens/train_per_sec_per_gpu": 14.23, + "tokens/trainable": 3118633 + }, + { + "epoch": 0.5915151515151515, + "grad_norm": 0.2194036841392517, + "learning_rate": 0.00019979753829274085, + "loss": 0.03429323434829712, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.03489, + "step": 610, + "tokens/total": 9994240, + "tokens/train_per_sec_per_gpu": 13.14, + "tokens/trainable": 3170577 + }, + { + "epoch": 0.6012121212121212, + "grad_norm": 0.022929901257157326, + "learning_rate": 0.0001997521887525391, + "loss": 0.0015171168372035027, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00152, + "step": 620, + "tokens/total": 10158080, + "tokens/train_per_sec_per_gpu": 14.24, + "tokens/trainable": 3221696 + }, + { + "epoch": 0.610909090909091, + "grad_norm": 0.10083670169115067, + "learning_rate": 0.00019970226638757458, + "loss": 0.0025377947837114333, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00254, + "step": 630, + "tokens/total": 10321920, + "tokens/train_per_sec_per_gpu": 14.7, + "tokens/trainable": 3273775 + }, + { + "epoch": 0.6206060606060606, + "grad_norm": 0.01761380024254322, + "learning_rate": 0.00019964777348638083, + "loss": 0.002281896211206913, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00228, + "step": 640, + "tokens/total": 10485760, + "tokens/train_per_sec_per_gpu": 14.89, + "tokens/trainable": 3325516 + }, + { + "epoch": 0.6303030303030303, + "grad_norm": 0.004510029684752226, + "learning_rate": 0.00019958871254701315, + "loss": 0.0009477110579609871, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00095, + "step": 650, + "tokens/total": 10649600, + "tokens/train_per_sec_per_gpu": 16.46, + "tokens/trainable": 3377214 + }, + { + "epoch": 0.64, + "grad_norm": 0.05332477018237114, + "learning_rate": 0.0001995250862769342, + "loss": 0.0005660496186465025, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00057, + "step": 660, + "tokens/total": 10813440, + "tokens/train_per_sec_per_gpu": 14.52, + "tokens/trainable": 3428627 + }, + { + "epoch": 0.6496969696969697, + "grad_norm": 0.03861689195036888, + "learning_rate": 0.0001994568975928899, + "loss": 0.0008976863697171211, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0009, + "step": 670, + "tokens/total": 10977280, + "tokens/train_per_sec_per_gpu": 15.66, + "tokens/trainable": 3480170 + }, + { + "epoch": 0.6593939393939394, + "grad_norm": 0.021123304963111877, + "learning_rate": 0.00019938414962077553, + "loss": 0.0009612766094505787, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00096, + "step": 680, + "tokens/total": 11141120, + "tokens/train_per_sec_per_gpu": 15.15, + "tokens/trainable": 3532037 + }, + { + "epoch": 0.6690909090909091, + "grad_norm": 0.02421347238123417, + "learning_rate": 0.00019930684569549264, + "loss": 0.001021684519946575, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00102, + "step": 690, + "tokens/total": 11304960, + "tokens/train_per_sec_per_gpu": 14.16, + "tokens/trainable": 3583461 + }, + { + "epoch": 0.6787878787878788, + "grad_norm": 0.05008835345506668, + "learning_rate": 0.00019922498936079613, + "loss": 0.0007617876864969731, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00076, + "step": 700, + "tokens/total": 11468800, + "tokens/train_per_sec_per_gpu": 14.08, + "tokens/trainable": 3634649 + }, + { + "epoch": 0.6884848484848485, + "grad_norm": 0.035733792930841446, + "learning_rate": 0.00019913858436913171, + "loss": 0.0012347914278507232, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00124, + "step": 710, + "tokens/total": 11632640, + "tokens/train_per_sec_per_gpu": 14.45, + "tokens/trainable": 3685786 + }, + { + "epoch": 0.6981818181818182, + "grad_norm": 0.010948767885565758, + "learning_rate": 0.00019904763468146393, + "loss": 0.0008165687322616577, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00082, + "step": 720, + "tokens/total": 11796480, + "tokens/train_per_sec_per_gpu": 15.77, + "tokens/trainable": 3737566 + }, + { + "epoch": 0.7078787878787879, + "grad_norm": 0.03577027469873428, + "learning_rate": 0.00019895214446709463, + "loss": 0.001333119161427021, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00133, + "step": 730, + "tokens/total": 11960320, + "tokens/train_per_sec_per_gpu": 13.98, + "tokens/trainable": 3789817 + }, + { + "epoch": 0.7175757575757575, + "grad_norm": 0.03971279785037041, + "learning_rate": 0.00019885211810347184, + "loss": 0.0011184611357748508, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00112, + "step": 740, + "tokens/total": 12124160, + "tokens/train_per_sec_per_gpu": 14.67, + "tokens/trainable": 3841912 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 0.06546575576066971, + "learning_rate": 0.00019874756017598894, + "loss": 0.0012452728115022182, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00125, + "step": 750, + "tokens/total": 12288000, + "tokens/train_per_sec_per_gpu": 14.58, + "tokens/trainable": 3893725 + }, + { + "epoch": 0.7369696969696969, + "grad_norm": 0.047058816999197006, + "learning_rate": 0.00019863847547777467, + "loss": 0.0008146104402840138, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00081, + "step": 760, + "tokens/total": 12451840, + "tokens/train_per_sec_per_gpu": 13.49, + "tokens/trainable": 3945033 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 0.028811641037464142, + "learning_rate": 0.00019852486900947327, + "loss": 0.0008652995340526104, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00087, + "step": 770, + "tokens/total": 12615680, + "tokens/train_per_sec_per_gpu": 15.12, + "tokens/trainable": 3996749 + }, + { + "epoch": 0.7563636363636363, + "grad_norm": 0.012203546240925789, + "learning_rate": 0.0001984067459790153, + "loss": 0.000670672720298171, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00067, + "step": 780, + "tokens/total": 12779520, + "tokens/train_per_sec_per_gpu": 13.71, + "tokens/trainable": 4048173 + }, + { + "epoch": 0.7660606060606061, + "grad_norm": 0.016218814998865128, + "learning_rate": 0.0001982841118013789, + "loss": 0.00046353964135050776, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00046, + "step": 790, + "tokens/total": 12943360, + "tokens/train_per_sec_per_gpu": 15.1, + "tokens/trainable": 4099789 + }, + { + "epoch": 0.7757575757575758, + "grad_norm": 0.034673016518354416, + "learning_rate": 0.00019815697209834147, + "loss": 0.000707306619733572, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00071, + "step": 800, + "tokens/total": 13107200, + "tokens/train_per_sec_per_gpu": 14.45, + "tokens/trainable": 4150960 + }, + { + "epoch": 0.7854545454545454, + "grad_norm": 0.0022127812262624502, + "learning_rate": 0.00019802533269822208, + "loss": 0.00021896373946219682, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00022, + "step": 810, + "tokens/total": 13271040, + "tokens/train_per_sec_per_gpu": 14.75, + "tokens/trainable": 4202984 + }, + { + "epoch": 0.7951515151515152, + "grad_norm": 0.000919274752959609, + "learning_rate": 0.00019788919963561422, + "loss": 0.00043264860287308695, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00043, + "step": 820, + "tokens/total": 13434880, + "tokens/train_per_sec_per_gpu": 14.06, + "tokens/trainable": 4254907 + }, + { + "epoch": 0.8048484848484848, + "grad_norm": 0.007699873298406601, + "learning_rate": 0.00019774857915110913, + "loss": 0.0003196246922016144, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00032, + "step": 830, + "tokens/total": 13598720, + "tokens/train_per_sec_per_gpu": 14.75, + "tokens/trainable": 4306095 + }, + { + "epoch": 0.8145454545454546, + "grad_norm": 0.015523642301559448, + "learning_rate": 0.00019760347769100987, + "loss": 0.0004476988688111305, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00045, + "step": 840, + "tokens/total": 13762560, + "tokens/train_per_sec_per_gpu": 14.14, + "tokens/trainable": 4357442 + }, + { + "epoch": 0.8242424242424242, + "grad_norm": 0.013460986316204071, + "learning_rate": 0.00019745390190703565, + "loss": 0.0004673306830227375, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00047, + "step": 850, + "tokens/total": 13926400, + "tokens/train_per_sec_per_gpu": 14.1, + "tokens/trainable": 4409277 + }, + { + "epoch": 0.833939393939394, + "grad_norm": 0.0014691110700368881, + "learning_rate": 0.0001972998586560169, + "loss": 0.0003277578856796026, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00033, + "step": 860, + "tokens/total": 14090240, + "tokens/train_per_sec_per_gpu": 14.28, + "tokens/trainable": 4460714 + }, + { + "epoch": 0.8436363636363636, + "grad_norm": 0.001358041656203568, + "learning_rate": 0.00019714135499958112, + "loss": 0.00032470382284373046, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00032, + "step": 870, + "tokens/total": 14254080, + "tokens/train_per_sec_per_gpu": 13.85, + "tokens/trainable": 4511989 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 0.04510723799467087, + "learning_rate": 0.0001969783982038289, + "loss": 0.00023182881996035575, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00023, + "step": 880, + "tokens/total": 14417920, + "tokens/train_per_sec_per_gpu": 15.41, + "tokens/trainable": 4563354 + }, + { + "epoch": 0.863030303030303, + "grad_norm": 0.14508692920207977, + "learning_rate": 0.00019681099573900113, + "loss": 0.00026136748492717744, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00026, + "step": 890, + "tokens/total": 14581760, + "tokens/train_per_sec_per_gpu": 13.85, + "tokens/trainable": 4615691 + }, + { + "epoch": 0.8727272727272727, + "grad_norm": 0.010969490744173527, + "learning_rate": 0.00019663915527913625, + "loss": 0.00016044279327616097, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00016, + "step": 900, + "tokens/total": 14745600, + "tokens/train_per_sec_per_gpu": 15.76, + "tokens/trainable": 4667433 + }, + { + "epoch": 0.8824242424242424, + "grad_norm": 0.03874114155769348, + "learning_rate": 0.00019646288470171868, + "loss": 0.0004159804433584213, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00042, + "step": 910, + "tokens/total": 14909440, + "tokens/train_per_sec_per_gpu": 16.01, + "tokens/trainable": 4719807 + }, + { + "epoch": 0.8921212121212121, + "grad_norm": 0.044620465487241745, + "learning_rate": 0.00019628219208731756, + "loss": 0.0006739750038832426, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00067, + "step": 920, + "tokens/total": 15073280, + "tokens/train_per_sec_per_gpu": 15.05, + "tokens/trainable": 4771772 + }, + { + "epoch": 0.9018181818181819, + "grad_norm": 0.024856949225068092, + "learning_rate": 0.00019609708571921645, + "loss": 0.00039347023703157903, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00039, + "step": 930, + "tokens/total": 15237120, + "tokens/train_per_sec_per_gpu": 15.16, + "tokens/trainable": 4823415 + }, + { + "epoch": 0.9115151515151515, + "grad_norm": 0.022198157384991646, + "learning_rate": 0.0001959075740830335, + "loss": 0.0005907822400331497, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00059, + "step": 940, + "tokens/total": 15400960, + "tokens/train_per_sec_per_gpu": 15.36, + "tokens/trainable": 4875269 + }, + { + "epoch": 0.9212121212121213, + "grad_norm": 0.01670038513839245, + "learning_rate": 0.00019571366586633245, + "loss": 0.00027316866908222437, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00027, + "step": 950, + "tokens/total": 15564800, + "tokens/train_per_sec_per_gpu": 15.11, + "tokens/trainable": 4927244 + }, + { + "epoch": 0.9309090909090909, + "grad_norm": 0.021392742171883583, + "learning_rate": 0.00019551536995822454, + "loss": 0.0004320886451750994, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00043, + "step": 960, + "tokens/total": 15728640, + "tokens/train_per_sec_per_gpu": 14.16, + "tokens/trainable": 4979068 + }, + { + "epoch": 0.9406060606060606, + "grad_norm": 0.028143158182501793, + "learning_rate": 0.00019531269544896076, + "loss": 0.0005637989845126868, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00056, + "step": 970, + "tokens/total": 15892480, + "tokens/train_per_sec_per_gpu": 14.26, + "tokens/trainable": 5030980 + }, + { + "epoch": 0.9503030303030303, + "grad_norm": 0.077091746032238, + "learning_rate": 0.00019510565162951537, + "loss": 0.0010597245767712594, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00106, + "step": 980, + "tokens/total": 16056320, + "tokens/train_per_sec_per_gpu": 14.04, + "tokens/trainable": 5082759 + }, + { + "epoch": 0.96, + "grad_norm": 0.04455556347966194, + "learning_rate": 0.00019489424799115984, + "loss": 0.0009517236612737179, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00095, + "step": 990, + "tokens/total": 16220160, + "tokens/train_per_sec_per_gpu": 13.04, + "tokens/trainable": 5134379 + }, + { + "epoch": 0.9696969696969697, + "grad_norm": 0.03573840856552124, + "learning_rate": 0.00019467849422502784, + "loss": 0.0008812972344458103, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00088, + "step": 1000, + "tokens/total": 16384000, + "tokens/train_per_sec_per_gpu": 15.23, + "tokens/trainable": 5186184 + }, + { + "epoch": 0.9793939393939394, + "grad_norm": 0.0006549305398948491, + "learning_rate": 0.0001944584002216709, + "loss": 0.0006358013488352299, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00064, + "step": 1010, + "tokens/total": 16547840, + "tokens/train_per_sec_per_gpu": 16.1, + "tokens/trainable": 5238320 + }, + { + "epoch": 0.9890909090909091, + "grad_norm": 0.021742813289165497, + "learning_rate": 0.00019423397607060507, + "loss": 0.000400003744289279, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0004, + "step": 1020, + "tokens/total": 16711680, + "tokens/train_per_sec_per_gpu": 14.53, + "tokens/trainable": 5290445 + }, + { + "epoch": 0.9987878787878788, + "grad_norm": 0.04323820024728775, + "learning_rate": 0.00019400523205984833, + "loss": 0.0002954686991870403, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0003, + "step": 1030, + "tokens/total": 16875520, + "tokens/train_per_sec_per_gpu": 14.98, + "tokens/trainable": 5342720 + }, + { + "epoch": 1.001939393939394, + "eval_loss": 0.00047458006883971393, + "eval_ppl": 1.00047, + "eval_runtime": 11.7938, + "eval_samples_per_second": 16.958, + "eval_steps_per_second": 8.479, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.73, + "memory/max_allocated (GiB)": 16.73, + "step": 1032 + }, + { + "epoch": 1.0096969696969698, + "grad_norm": 0.000988126266747713, + "learning_rate": 0.00019377217867544907, + "loss": 0.0004762394353747368, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00048, + "step": 1040, + "tokens/total": 17051648, + "tokens/train_per_sec_per_gpu": 14.47, + "tokens/trainable": 5398184 + }, + { + "epoch": 1.0193939393939393, + "grad_norm": 0.0011711094994097948, + "learning_rate": 0.00019353482660100537, + "loss": 0.00022675264626741408, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00023, + "step": 1050, + "tokens/total": 17215488, + "tokens/train_per_sec_per_gpu": 14.05, + "tokens/trainable": 5450329 + }, + { + "epoch": 1.029090909090909, + "grad_norm": 0.007319436874240637, + "learning_rate": 0.0001932931867171751, + "loss": 0.0003059083363041282, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00031, + "step": 1060, + "tokens/total": 17379328, + "tokens/train_per_sec_per_gpu": 13.66, + "tokens/trainable": 5502706 + }, + { + "epoch": 1.0387878787878788, + "grad_norm": 0.00967186689376831, + "learning_rate": 0.0001930472701011773, + "loss": 0.0003639918984845281, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00036, + "step": 1070, + "tokens/total": 17543168, + "tokens/train_per_sec_per_gpu": 15.36, + "tokens/trainable": 5554957 + }, + { + "epoch": 1.0484848484848486, + "grad_norm": 0.0018478024285286665, + "learning_rate": 0.00019279708802628437, + "loss": 0.0002576910424977541, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00026, + "step": 1080, + "tokens/total": 17707008, + "tokens/train_per_sec_per_gpu": 14.73, + "tokens/trainable": 5607534 + }, + { + "epoch": 1.0581818181818181, + "grad_norm": 0.018235478550195694, + "learning_rate": 0.00019254265196130517, + "loss": 0.0003647733014076948, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00036, + "step": 1090, + "tokens/total": 17870848, + "tokens/train_per_sec_per_gpu": 14.24, + "tokens/trainable": 5659689 + }, + { + "epoch": 1.0678787878787879, + "grad_norm": 0.024314021691679955, + "learning_rate": 0.0001922839735700593, + "loss": 0.00030459570698440077, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0003, + "step": 1100, + "tokens/total": 18034688, + "tokens/train_per_sec_per_gpu": 13.67, + "tokens/trainable": 5711346 + }, + { + "epoch": 1.0775757575757576, + "grad_norm": 0.0177497286349535, + "learning_rate": 0.0001920210647108425, + "loss": 0.00023341022897511722, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00023, + "step": 1110, + "tokens/total": 18198528, + "tokens/train_per_sec_per_gpu": 14.13, + "tokens/trainable": 5763094 + }, + { + "epoch": 1.0872727272727274, + "grad_norm": 0.005781313870102167, + "learning_rate": 0.00019175393743588295, + "loss": 0.0002974884817376733, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0003, + "step": 1120, + "tokens/total": 18362368, + "tokens/train_per_sec_per_gpu": 14.55, + "tokens/trainable": 5815101 + }, + { + "epoch": 1.096969696969697, + "grad_norm": 0.0026403339579701424, + "learning_rate": 0.00019148260399078887, + "loss": 0.00010604445124045015, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00011, + "step": 1130, + "tokens/total": 18526208, + "tokens/train_per_sec_per_gpu": 13.87, + "tokens/trainable": 5866763 + }, + { + "epoch": 1.1066666666666667, + "grad_norm": 0.03586777299642563, + "learning_rate": 0.000191207076813987, + "loss": 0.00027820770628750324, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00028, + "step": 1140, + "tokens/total": 18690048, + "tokens/train_per_sec_per_gpu": 13.83, + "tokens/trainable": 5918322 + }, + { + "epoch": 1.1163636363636364, + "grad_norm": 0.007715190295130014, + "learning_rate": 0.00019092736853615257, + "loss": 0.00029321699403226373, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00029, + "step": 1150, + "tokens/total": 18853888, + "tokens/train_per_sec_per_gpu": 13.95, + "tokens/trainable": 5970153 + }, + { + "epoch": 1.126060606060606, + "grad_norm": 0.05122547224164009, + "learning_rate": 0.00019064349197963013, + "loss": 0.0005070990417152643, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00051, + "step": 1160, + "tokens/total": 19017728, + "tokens/train_per_sec_per_gpu": 15.51, + "tokens/trainable": 6021741 + }, + { + "epoch": 1.1357575757575757, + "grad_norm": 0.032420564442873, + "learning_rate": 0.000190355460157846, + "loss": 0.00031497194431722163, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00032, + "step": 1170, + "tokens/total": 19181568, + "tokens/train_per_sec_per_gpu": 16.05, + "tokens/trainable": 6074092 + }, + { + "epoch": 1.1454545454545455, + "grad_norm": 0.03688061609864235, + "learning_rate": 0.00019006328627471132, + "loss": 0.0003225028282031417, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00032, + "step": 1180, + "tokens/total": 19345408, + "tokens/train_per_sec_per_gpu": 14.1, + "tokens/trainable": 6126315 + }, + { + "epoch": 1.1551515151515153, + "grad_norm": 0.03359396383166313, + "learning_rate": 0.00018976698372401716, + "loss": 0.0004557626787573099, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00046, + "step": 1190, + "tokens/total": 19509248, + "tokens/train_per_sec_per_gpu": 14.6, + "tokens/trainable": 6178392 + }, + { + "epoch": 1.1648484848484848, + "grad_norm": 0.020522581413388252, + "learning_rate": 0.0001894665660888202, + "loss": 0.0006435967981815339, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00064, + "step": 1200, + "tokens/total": 19673088, + "tokens/train_per_sec_per_gpu": 15.47, + "tokens/trainable": 6230984 + }, + { + "epoch": 1.1745454545454546, + "grad_norm": 0.0025893959682434797, + "learning_rate": 0.00018916204714082034, + "loss": 0.0005178887862712145, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00052, + "step": 1210, + "tokens/total": 19836928, + "tokens/train_per_sec_per_gpu": 14.13, + "tokens/trainable": 6282713 + }, + { + "epoch": 1.1842424242424243, + "grad_norm": 0.017288153991103172, + "learning_rate": 0.00018885344083972914, + "loss": 0.0005050559528172016, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00051, + "step": 1220, + "tokens/total": 20000768, + "tokens/train_per_sec_per_gpu": 14.31, + "tokens/trainable": 6334555 + }, + { + "epoch": 1.1939393939393939, + "grad_norm": 0.00206086877733469, + "learning_rate": 0.00018854076133263003, + "loss": 0.00020185327157378196, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0002, + "step": 1230, + "tokens/total": 20164608, + "tokens/train_per_sec_per_gpu": 14.72, + "tokens/trainable": 6386137 + }, + { + "epoch": 1.2036363636363636, + "grad_norm": 0.02184407040476799, + "learning_rate": 0.0001882240229533297, + "loss": 0.00048260441981256007, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00048, + "step": 1240, + "tokens/total": 20328448, + "tokens/train_per_sec_per_gpu": 14.35, + "tokens/trainable": 6437493 + }, + { + "epoch": 1.2133333333333334, + "grad_norm": 0.04215926304459572, + "learning_rate": 0.00018790324022170118, + "loss": 0.0003190681803971529, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00032, + "step": 1250, + "tokens/total": 20492288, + "tokens/train_per_sec_per_gpu": 14.51, + "tokens/trainable": 6488834 + }, + { + "epoch": 1.2230303030303031, + "grad_norm": 0.006890668533742428, + "learning_rate": 0.00018757842784301784, + "loss": 0.0005027144681662322, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0005, + "step": 1260, + "tokens/total": 20656128, + "tokens/train_per_sec_per_gpu": 14.26, + "tokens/trainable": 6540606 + }, + { + "epoch": 1.2327272727272727, + "grad_norm": 0.005489532835781574, + "learning_rate": 0.00018724960070727972, + "loss": 0.0006080259568989277, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00061, + "step": 1270, + "tokens/total": 20819968, + "tokens/train_per_sec_per_gpu": 13.92, + "tokens/trainable": 6592727 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.005877023097127676, + "learning_rate": 0.00018691677388853068, + "loss": 0.0006749071180820465, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00068, + "step": 1280, + "tokens/total": 20983808, + "tokens/train_per_sec_per_gpu": 14.93, + "tokens/trainable": 6645179 + }, + { + "epoch": 1.2521212121212122, + "grad_norm": 0.0061390516348183155, + "learning_rate": 0.00018657996264416745, + "loss": 0.0002642946550622582, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00026, + "step": 1290, + "tokens/total": 21147648, + "tokens/train_per_sec_per_gpu": 14.92, + "tokens/trainable": 6697406 + }, + { + "epoch": 1.2618181818181817, + "grad_norm": 0.03444842994213104, + "learning_rate": 0.0001862391824142402, + "loss": 0.0004464905709028244, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00045, + "step": 1300, + "tokens/total": 21311488, + "tokens/train_per_sec_per_gpu": 15.07, + "tokens/trainable": 6749589 + }, + { + "epoch": 1.2715151515151515, + "grad_norm": 0.0036635284777730703, + "learning_rate": 0.00018589444882074474, + "loss": 0.0002096141455695033, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00021, + "step": 1310, + "tokens/total": 21475328, + "tokens/train_per_sec_per_gpu": 13.69, + "tokens/trainable": 6801799 + }, + { + "epoch": 1.2812121212121212, + "grad_norm": 0.003200239036232233, + "learning_rate": 0.00018554577766690636, + "loss": 0.00026335257571190595, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00026, + "step": 1320, + "tokens/total": 21639168, + "tokens/train_per_sec_per_gpu": 14.58, + "tokens/trainable": 6854205 + }, + { + "epoch": 1.290909090909091, + "grad_norm": 0.00109296350274235, + "learning_rate": 0.0001851931849364554, + "loss": 0.0003910743165761232, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00039, + "step": 1330, + "tokens/total": 21803008, + "tokens/train_per_sec_per_gpu": 14.96, + "tokens/trainable": 6906145 + }, + { + "epoch": 1.3006060606060605, + "grad_norm": 0.0006913666147738695, + "learning_rate": 0.00018483668679289452, + "loss": 0.0003079640679061413, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00031, + "step": 1340, + "tokens/total": 21966848, + "tokens/train_per_sec_per_gpu": 15.13, + "tokens/trainable": 6957405 + }, + { + "epoch": 1.3103030303030303, + "grad_norm": 0.03036116063594818, + "learning_rate": 0.00018447629957875776, + "loss": 0.0003281526267528534, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00033, + "step": 1350, + "tokens/total": 22130688, + "tokens/train_per_sec_per_gpu": 15.08, + "tokens/trainable": 7009256 + }, + { + "epoch": 1.32, + "grad_norm": 0.012580045498907566, + "learning_rate": 0.00018411203981486134, + "loss": 0.0006514057982712984, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00065, + "step": 1360, + "tokens/total": 22294528, + "tokens/train_per_sec_per_gpu": 14.66, + "tokens/trainable": 7060734 + }, + { + "epoch": 1.3296969696969696, + "grad_norm": 0.00828342791646719, + "learning_rate": 0.00018374392419954628, + "loss": 0.0003020781092345715, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0003, + "step": 1370, + "tokens/total": 22458368, + "tokens/train_per_sec_per_gpu": 15.09, + "tokens/trainable": 7112415 + }, + { + "epoch": 1.3393939393939394, + "grad_norm": 0.09482505917549133, + "learning_rate": 0.00018337196960791302, + "loss": 0.0006797847803682089, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00068, + "step": 1380, + "tokens/total": 22622208, + "tokens/train_per_sec_per_gpu": 15.03, + "tokens/trainable": 7164110 + }, + { + "epoch": 1.3490909090909091, + "grad_norm": 0.04534842446446419, + "learning_rate": 0.00018299619309104773, + "loss": 0.000729580270126462, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00073, + "step": 1390, + "tokens/total": 22786048, + "tokens/train_per_sec_per_gpu": 15.49, + "tokens/trainable": 7215797 + }, + { + "epoch": 1.3587878787878789, + "grad_norm": 0.010737202130258083, + "learning_rate": 0.00018261661187524072, + "loss": 0.0007514740340411663, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00075, + "step": 1400, + "tokens/total": 22949888, + "tokens/train_per_sec_per_gpu": 14.14, + "tokens/trainable": 7267691 + }, + { + "epoch": 1.3684848484848484, + "grad_norm": 0.05600081756711006, + "learning_rate": 0.00018223324336119672, + "loss": 0.001420076284557581, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00142, + "step": 1410, + "tokens/total": 23113728, + "tokens/train_per_sec_per_gpu": 15.3, + "tokens/trainable": 7319876 + }, + { + "epoch": 1.3781818181818182, + "grad_norm": 0.019460471346974373, + "learning_rate": 0.00018184610512323718, + "loss": 0.0022406818345189093, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00224, + "step": 1420, + "tokens/total": 23277568, + "tokens/train_per_sec_per_gpu": 14.38, + "tokens/trainable": 7371762 + }, + { + "epoch": 1.387878787878788, + "grad_norm": 0.03277068957686424, + "learning_rate": 0.00018145521490849477, + "loss": 0.000915923435240984, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00092, + "step": 1430, + "tokens/total": 23441408, + "tokens/train_per_sec_per_gpu": 14.66, + "tokens/trainable": 7423685 + }, + { + "epoch": 1.3975757575757575, + "grad_norm": 0.0156385600566864, + "learning_rate": 0.0001810605906360996, + "loss": 0.000897888746112585, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0009, + "step": 1440, + "tokens/total": 23605248, + "tokens/train_per_sec_per_gpu": 13.99, + "tokens/trainable": 7476266 + }, + { + "epoch": 1.4072727272727272, + "grad_norm": 0.01643913984298706, + "learning_rate": 0.00018066225039635794, + "loss": 0.000922933965921402, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00092, + "step": 1450, + "tokens/total": 23769088, + "tokens/train_per_sec_per_gpu": 14.57, + "tokens/trainable": 7528208 + }, + { + "epoch": 1.416969696969697, + "grad_norm": 0.024322666227817535, + "learning_rate": 0.00018026021244992287, + "loss": 0.0011652217246592045, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00117, + "step": 1460, + "tokens/total": 23932928, + "tokens/train_per_sec_per_gpu": 13.91, + "tokens/trainable": 7580038 + }, + { + "epoch": 1.4266666666666667, + "grad_norm": 0.05165834724903107, + "learning_rate": 0.0001798544952269572, + "loss": 0.0009731135331094265, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00097, + "step": 1470, + "tokens/total": 24096768, + "tokens/train_per_sec_per_gpu": 14.56, + "tokens/trainable": 7631772 + }, + { + "epoch": 1.4363636363636363, + "grad_norm": 0.02529827691614628, + "learning_rate": 0.0001794451173262885, + "loss": 0.0005802253726869822, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00058, + "step": 1480, + "tokens/total": 24260608, + "tokens/train_per_sec_per_gpu": 13.72, + "tokens/trainable": 7683048 + }, + { + "epoch": 1.446060606060606, + "grad_norm": 0.0670745000243187, + "learning_rate": 0.00017903209751455665, + "loss": 0.000642474414780736, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00064, + "step": 1490, + "tokens/total": 24424448, + "tokens/train_per_sec_per_gpu": 14.33, + "tokens/trainable": 7735332 + }, + { + "epoch": 1.4557575757575758, + "grad_norm": 0.02367187850177288, + "learning_rate": 0.00017861545472535348, + "loss": 0.00032834114972501993, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00033, + "step": 1500, + "tokens/total": 24588288, + "tokens/train_per_sec_per_gpu": 16.37, + "tokens/trainable": 7787186 + }, + { + "epoch": 1.4654545454545453, + "grad_norm": 0.011678172275424004, + "learning_rate": 0.00017819520805835475, + "loss": 0.0009690596722066403, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00097, + "step": 1510, + "tokens/total": 24752128, + "tokens/train_per_sec_per_gpu": 13.55, + "tokens/trainable": 7838878 + }, + { + "epoch": 1.475151515151515, + "grad_norm": 0.05298800393939018, + "learning_rate": 0.00017777137677844461, + "loss": 0.0009098535403609276, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00091, + "step": 1520, + "tokens/total": 24915968, + "tokens/train_per_sec_per_gpu": 14.33, + "tokens/trainable": 7890631 + }, + { + "epoch": 1.4848484848484849, + "grad_norm": 0.037918779999017715, + "learning_rate": 0.00017734398031483265, + "loss": 0.0006457697600126266, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00065, + "step": 1530, + "tokens/total": 25079808, + "tokens/train_per_sec_per_gpu": 13.25, + "tokens/trainable": 7942366 + }, + { + "epoch": 1.4945454545454546, + "grad_norm": 0.02729674056172371, + "learning_rate": 0.0001769130382601629, + "loss": 0.0009943137876689434, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00099, + "step": 1540, + "tokens/total": 25243648, + "tokens/train_per_sec_per_gpu": 14.37, + "tokens/trainable": 7994307 + }, + { + "epoch": 1.5023030303030303, + "eval_loss": 0.0006865999894216657, + "eval_ppl": 1.00069, + "eval_runtime": 12.127, + "eval_samples_per_second": 16.492, + "eval_steps_per_second": 8.246, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "step": 1548 + }, + { + "epoch": 1.5042424242424244, + "grad_norm": 0.053267233073711395, + "learning_rate": 0.00017647857036961592, + "loss": 0.0006284893956035375, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00063, + "step": 1550, + "tokens/total": 25407488, + "tokens/train_per_sec_per_gpu": 14.87, + "tokens/trainable": 8046124 + }, + { + "epoch": 1.513939393939394, + "grad_norm": 0.05232734978199005, + "learning_rate": 0.0001760405965600031, + "loss": 0.0005064161494374275, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00051, + "step": 1560, + "tokens/total": 25571328, + "tokens/train_per_sec_per_gpu": 14.39, + "tokens/trainable": 8098367 + }, + { + "epoch": 1.5236363636363637, + "grad_norm": 0.015440079383552074, + "learning_rate": 0.00017559913690885364, + "loss": 0.0004742793273180723, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00047, + "step": 1570, + "tokens/total": 25735168, + "tokens/train_per_sec_per_gpu": 14.19, + "tokens/trainable": 8150005 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.005799058359116316, + "learning_rate": 0.00017515421165349414, + "loss": 0.0005522690713405609, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00055, + "step": 1580, + "tokens/total": 25899008, + "tokens/train_per_sec_per_gpu": 14.94, + "tokens/trainable": 8201985 + }, + { + "epoch": 1.543030303030303, + "grad_norm": 0.025745827704668045, + "learning_rate": 0.00017470584119012094, + "loss": 0.0004415466450154781, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00044, + "step": 1590, + "tokens/total": 26062848, + "tokens/train_per_sec_per_gpu": 14.76, + "tokens/trainable": 8253407 + }, + { + "epoch": 1.5527272727272727, + "grad_norm": 0.006111942231655121, + "learning_rate": 0.00017425404607286508, + "loss": 0.0004033858887851238, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0004, + "step": 1600, + "tokens/total": 26226688, + "tokens/train_per_sec_per_gpu": 13.45, + "tokens/trainable": 8305596 + }, + { + "epoch": 1.5624242424242425, + "grad_norm": 0.01315031573176384, + "learning_rate": 0.00017379884701285, + "loss": 0.0006456051021814346, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00065, + "step": 1610, + "tokens/total": 26390528, + "tokens/train_per_sec_per_gpu": 15.34, + "tokens/trainable": 8357648 + }, + { + "epoch": 1.5721212121212123, + "grad_norm": 0.002383842132985592, + "learning_rate": 0.00017334026487724225, + "loss": 0.00028960562776774167, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00029, + "step": 1620, + "tokens/total": 26554368, + "tokens/train_per_sec_per_gpu": 14.29, + "tokens/trainable": 8410056 + }, + { + "epoch": 1.5818181818181818, + "grad_norm": 0.006294222082942724, + "learning_rate": 0.0001728783206882948, + "loss": 0.00025043871719390156, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00025, + "step": 1630, + "tokens/total": 26718208, + "tokens/train_per_sec_per_gpu": 15.1, + "tokens/trainable": 8461798 + }, + { + "epoch": 1.5915151515151515, + "grad_norm": 8.702854393050075e-05, + "learning_rate": 0.00017241303562238336, + "loss": 0.00012461008736863732, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00012, + "step": 1640, + "tokens/total": 26882048, + "tokens/train_per_sec_per_gpu": 15.61, + "tokens/trainable": 8514035 + }, + { + "epoch": 1.601212121212121, + "grad_norm": 0.07624056935310364, + "learning_rate": 0.00017194443100903558, + "loss": 0.00024855402298271654, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00025, + "step": 1650, + "tokens/total": 27045888, + "tokens/train_per_sec_per_gpu": 14.48, + "tokens/trainable": 8565875 + }, + { + "epoch": 1.6109090909090908, + "grad_norm": 0.02497026138007641, + "learning_rate": 0.00017147252832995337, + "loss": 0.00044286823831498625, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00044, + "step": 1660, + "tokens/total": 27209728, + "tokens/train_per_sec_per_gpu": 14.47, + "tokens/trainable": 8617912 + }, + { + "epoch": 1.6206060606060606, + "grad_norm": 0.0016530955908820033, + "learning_rate": 0.00017099734921802802, + "loss": 0.0003104714211076498, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00031, + "step": 1670, + "tokens/total": 27373568, + "tokens/train_per_sec_per_gpu": 13.53, + "tokens/trainable": 8669875 + }, + { + "epoch": 1.6303030303030304, + "grad_norm": 0.02621961385011673, + "learning_rate": 0.00017051891545634854, + "loss": 0.0004010321106761694, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0004, + "step": 1680, + "tokens/total": 27537408, + "tokens/train_per_sec_per_gpu": 16.09, + "tokens/trainable": 8721709 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.043721288442611694, + "learning_rate": 0.00017003724897720316, + "loss": 0.00042473864741623404, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00042, + "step": 1690, + "tokens/total": 27701248, + "tokens/train_per_sec_per_gpu": 14.84, + "tokens/trainable": 8773762 + }, + { + "epoch": 1.6496969696969697, + "grad_norm": 0.01791808009147644, + "learning_rate": 0.00016955237186107387, + "loss": 0.0003858121577650309, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00039, + "step": 1700, + "tokens/total": 27865088, + "tokens/train_per_sec_per_gpu": 14.87, + "tokens/trainable": 8825435 + }, + { + "epoch": 1.6593939393939394, + "grad_norm": 0.017175329849123955, + "learning_rate": 0.0001690643063356241, + "loss": 0.0003785108681768179, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00038, + "step": 1710, + "tokens/total": 28028928, + "tokens/train_per_sec_per_gpu": 13.63, + "tokens/trainable": 8877227 + }, + { + "epoch": 1.669090909090909, + "grad_norm": 0.03429865464568138, + "learning_rate": 0.0001685730747746799, + "loss": 0.0003128159558400512, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00031, + "step": 1720, + "tokens/total": 28192768, + "tokens/train_per_sec_per_gpu": 13.42, + "tokens/trainable": 8928835 + }, + { + "epoch": 1.6787878787878787, + "grad_norm": 0.008623798377811909, + "learning_rate": 0.0001680786996972043, + "loss": 0.0008884714916348457, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00089, + "step": 1730, + "tokens/total": 28356608, + "tokens/train_per_sec_per_gpu": 14.8, + "tokens/trainable": 8979863 + }, + { + "epoch": 1.6884848484848485, + "grad_norm": 0.007137796841561794, + "learning_rate": 0.00016758120376626488, + "loss": 0.000342932902276516, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00034, + "step": 1740, + "tokens/total": 28520448, + "tokens/train_per_sec_per_gpu": 13.64, + "tokens/trainable": 9031317 + }, + { + "epoch": 1.6981818181818182, + "grad_norm": 0.006754934322088957, + "learning_rate": 0.00016708060978799493, + "loss": 0.00031610706355422735, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00032, + "step": 1750, + "tokens/total": 28684288, + "tokens/train_per_sec_per_gpu": 16.63, + "tokens/trainable": 9082925 + }, + { + "epoch": 1.707878787878788, + "grad_norm": 0.012158721685409546, + "learning_rate": 0.00016657694071054794, + "loss": 0.00039324900135397913, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00039, + "step": 1760, + "tokens/total": 28848128, + "tokens/train_per_sec_per_gpu": 14.31, + "tokens/trainable": 9134535 + }, + { + "epoch": 1.7175757575757575, + "grad_norm": 0.04653792828321457, + "learning_rate": 0.00016607021962304565, + "loss": 0.0003617320442572236, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00036, + "step": 1770, + "tokens/total": 29011968, + "tokens/train_per_sec_per_gpu": 14.01, + "tokens/trainable": 9186666 + }, + { + "epoch": 1.7272727272727273, + "grad_norm": 0.009638557210564613, + "learning_rate": 0.00016556046975451963, + "loss": 0.00031410730443894865, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00031, + "step": 1780, + "tokens/total": 29175808, + "tokens/train_per_sec_per_gpu": 14.23, + "tokens/trainable": 9238529 + }, + { + "epoch": 1.7369696969696968, + "grad_norm": 0.017064686864614487, + "learning_rate": 0.0001650477144728462, + "loss": 0.00043909624218940735, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00044, + "step": 1790, + "tokens/total": 29339648, + "tokens/train_per_sec_per_gpu": 14.08, + "tokens/trainable": 9290289 + }, + { + "epoch": 1.7466666666666666, + "grad_norm": 0.0022802259773015976, + "learning_rate": 0.00016453197728367563, + "loss": 0.00032380607444792986, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00032, + "step": 1800, + "tokens/total": 29503488, + "tokens/train_per_sec_per_gpu": 13.73, + "tokens/trainable": 9341953 + }, + { + "epoch": 1.7563636363636363, + "grad_norm": 0.0036841712426394224, + "learning_rate": 0.00016401328182935417, + "loss": 0.0006712255533784627, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00067, + "step": 1810, + "tokens/total": 29667328, + "tokens/train_per_sec_per_gpu": 16.36, + "tokens/trainable": 9393126 + }, + { + "epoch": 1.766060606060606, + "grad_norm": 0.0006454121321439743, + "learning_rate": 0.0001634916518878404, + "loss": 0.00010477005271241069, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0001, + "step": 1820, + "tokens/total": 29831168, + "tokens/train_per_sec_per_gpu": 14.7, + "tokens/trainable": 9444494 + }, + { + "epoch": 1.7757575757575759, + "grad_norm": 0.035474907606840134, + "learning_rate": 0.00016296711137161535, + "loss": 0.00034273902419954536, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00034, + "step": 1830, + "tokens/total": 29995008, + "tokens/train_per_sec_per_gpu": 14.78, + "tokens/trainable": 9496432 + }, + { + "epoch": 1.7854545454545454, + "grad_norm": 0.0042278701439499855, + "learning_rate": 0.00016243968432658605, + "loss": 0.0004896576981991529, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00049, + "step": 1840, + "tokens/total": 30158848, + "tokens/train_per_sec_per_gpu": 15.01, + "tokens/trainable": 9547913 + }, + { + "epoch": 1.7951515151515152, + "grad_norm": 0.008337569423019886, + "learning_rate": 0.00016190939493098344, + "loss": 0.0003711160738021135, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00037, + "step": 1850, + "tokens/total": 30322688, + "tokens/train_per_sec_per_gpu": 14.24, + "tokens/trainable": 9599023 + }, + { + "epoch": 1.8048484848484847, + "grad_norm": 0.033457424491643906, + "learning_rate": 0.00016137626749425377, + "loss": 0.0005191094242036343, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00052, + "step": 1860, + "tokens/total": 30486528, + "tokens/train_per_sec_per_gpu": 14.35, + "tokens/trainable": 9651048 + }, + { + "epoch": 1.8145454545454545, + "grad_norm": 0.014811063185334206, + "learning_rate": 0.0001608403264559445, + "loss": 0.0002689486602321267, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00027, + "step": 1870, + "tokens/total": 30650368, + "tokens/train_per_sec_per_gpu": 14.52, + "tokens/trainable": 9703354 + }, + { + "epoch": 1.8242424242424242, + "grad_norm": 0.011829032562673092, + "learning_rate": 0.00016030159638458376, + "loss": 0.0003055253764614463, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00031, + "step": 1880, + "tokens/total": 30814208, + "tokens/train_per_sec_per_gpu": 14.05, + "tokens/trainable": 9755371 + }, + { + "epoch": 1.833939393939394, + "grad_norm": 0.003898326540365815, + "learning_rate": 0.00015976010197655397, + "loss": 0.00023026440758258104, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00023, + "step": 1890, + "tokens/total": 30978048, + "tokens/train_per_sec_per_gpu": 13.89, + "tokens/trainable": 9807011 + }, + { + "epoch": 1.8436363636363637, + "grad_norm": 0.00993694830685854, + "learning_rate": 0.00015921586805496004, + "loss": 0.000414779270067811, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00041, + "step": 1900, + "tokens/total": 31141888, + "tokens/train_per_sec_per_gpu": 14.42, + "tokens/trainable": 9859849 + }, + { + "epoch": 1.8533333333333335, + "grad_norm": 0.00715588079765439, + "learning_rate": 0.0001586689195684911, + "loss": 0.0004666011780500412, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00047, + "step": 1910, + "tokens/total": 31305728, + "tokens/train_per_sec_per_gpu": 14.16, + "tokens/trainable": 9911712 + }, + { + "epoch": 1.863030303030303, + "grad_norm": 0.021137356758117676, + "learning_rate": 0.000158119281590277, + "loss": 0.00046254890039563177, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00046, + "step": 1920, + "tokens/total": 31469568, + "tokens/train_per_sec_per_gpu": 14.81, + "tokens/trainable": 9963813 + }, + { + "epoch": 1.8727272727272726, + "grad_norm": 0.0023340010084211826, + "learning_rate": 0.000157566979316739, + "loss": 0.0004919813480228185, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00049, + "step": 1930, + "tokens/total": 31633408, + "tokens/train_per_sec_per_gpu": 15.8, + "tokens/trainable": 10015724 + }, + { + "epoch": 1.8824242424242423, + "grad_norm": 0.01151804905384779, + "learning_rate": 0.00015701203806643433, + "loss": 0.00023937469813972712, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00024, + "step": 1940, + "tokens/total": 31797248, + "tokens/train_per_sec_per_gpu": 14.32, + "tokens/trainable": 10067073 + }, + { + "epoch": 1.892121212121212, + "grad_norm": 0.016535570845007896, + "learning_rate": 0.00015645448327889603, + "loss": 0.00021827330347150563, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00022, + "step": 1950, + "tokens/total": 31961088, + "tokens/train_per_sec_per_gpu": 14.48, + "tokens/trainable": 10119393 + }, + { + "epoch": 1.9018181818181819, + "grad_norm": 0.0034130853600800037, + "learning_rate": 0.00015589434051346634, + "loss": 0.00017861993983387948, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00018, + "step": 1960, + "tokens/total": 32124928, + "tokens/train_per_sec_per_gpu": 14.23, + "tokens/trainable": 10171930 + }, + { + "epoch": 1.9115151515151516, + "grad_norm": 0.02398502826690674, + "learning_rate": 0.0001553316354481253, + "loss": 0.00014141426654532552, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00014, + "step": 1970, + "tokens/total": 32288768, + "tokens/train_per_sec_per_gpu": 15.59, + "tokens/trainable": 10223639 + }, + { + "epoch": 1.9212121212121214, + "grad_norm": 0.0007365989149548113, + "learning_rate": 0.00015476639387831343, + "loss": 0.00011406640987843275, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00011, + "step": 1980, + "tokens/total": 32452608, + "tokens/train_per_sec_per_gpu": 13.45, + "tokens/trainable": 10275019 + }, + { + "epoch": 1.930909090909091, + "grad_norm": 0.028317851945757866, + "learning_rate": 0.00015419864171574944, + "loss": 0.0004076042678207159, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00041, + "step": 1990, + "tokens/total": 32616448, + "tokens/train_per_sec_per_gpu": 14.68, + "tokens/trainable": 10327234 + }, + { + "epoch": 1.9406060606060604, + "grad_norm": 0.0007216805825009942, + "learning_rate": 0.00015362840498724215, + "loss": 0.0002287053968757391, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00023, + "step": 2000, + "tokens/total": 32780288, + "tokens/train_per_sec_per_gpu": 14.77, + "tokens/trainable": 10379906 + }, + { + "epoch": 1.9503030303030302, + "grad_norm": 0.021391045302152634, + "learning_rate": 0.00015305570983349743, + "loss": 0.0006855262909084558, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00069, + "step": 2010, + "tokens/total": 32944128, + "tokens/train_per_sec_per_gpu": 13.75, + "tokens/trainable": 10431864 + }, + { + "epoch": 1.96, + "grad_norm": 0.014411289244890213, + "learning_rate": 0.00015248058250792008, + "loss": 0.00020992583595216274, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00021, + "step": 2020, + "tokens/total": 33107968, + "tokens/train_per_sec_per_gpu": 14.32, + "tokens/trainable": 10483503 + }, + { + "epoch": 1.9696969696969697, + "grad_norm": 0.0019180785166099668, + "learning_rate": 0.00015190304937540993, + "loss": 0.000295165297575295, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0003, + "step": 2030, + "tokens/total": 33271808, + "tokens/train_per_sec_per_gpu": 15.32, + "tokens/trainable": 10534682 + }, + { + "epoch": 1.9793939393939395, + "grad_norm": 0.027906686067581177, + "learning_rate": 0.00015132313691115367, + "loss": 0.00030230602715164423, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0003, + "step": 2040, + "tokens/total": 33435648, + "tokens/train_per_sec_per_gpu": 13.52, + "tokens/trainable": 10586848 + }, + { + "epoch": 1.9890909090909092, + "grad_norm": 0.030775317922234535, + "learning_rate": 0.00015074087169941085, + "loss": 0.00011671001557260752, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00012, + "step": 2050, + "tokens/total": 33599488, + "tokens/train_per_sec_per_gpu": 14.23, + "tokens/trainable": 10638485 + }, + { + "epoch": 1.9987878787878788, + "grad_norm": 0.054577309638261795, + "learning_rate": 0.00015015628043229523, + "loss": 0.0003703285474330187, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00037, + "step": 2060, + "tokens/total": 33763328, + "tokens/train_per_sec_per_gpu": 14.81, + "tokens/trainable": 10689855 + }, + { + "epoch": 2.003878787878788, + "eval_loss": 0.00032737868605181575, + "eval_ppl": 1.00033, + "eval_runtime": 12.1345, + "eval_samples_per_second": 16.482, + "eval_steps_per_second": 8.241, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.73, + "memory/max_allocated (GiB)": 16.73, + "step": 2064 + }, + { + "epoch": 2.0096969696969698, + "grad_norm": 0.02574228309094906, + "learning_rate": 0.00014956938990855139, + "loss": 0.0006258985958993435, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00063, + "step": 2070, + "tokens/total": 33939456, + "tokens/train_per_sec_per_gpu": 15.27, + "tokens/trainable": 10745674 + }, + { + "epoch": 2.0193939393939395, + "grad_norm": 0.0003698334621731192, + "learning_rate": 0.00014898022703232604, + "loss": 0.00025913610588759186, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00026, + "step": 2080, + "tokens/total": 34103296, + "tokens/train_per_sec_per_gpu": 14.61, + "tokens/trainable": 10797792 + }, + { + "epoch": 2.0290909090909093, + "grad_norm": 0.0033025413285940886, + "learning_rate": 0.00014838881881193468, + "loss": 0.0001973774516955018, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0002, + "step": 2090, + "tokens/total": 34267136, + "tokens/train_per_sec_per_gpu": 14.68, + "tokens/trainable": 10849439 + }, + { + "epoch": 2.0387878787878786, + "grad_norm": 0.0001970751181943342, + "learning_rate": 0.00014779519235862365, + "loss": 0.00029088449664413927, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00029, + "step": 2100, + "tokens/total": 34430976, + "tokens/train_per_sec_per_gpu": 14.21, + "tokens/trainable": 10902278 + }, + { + "epoch": 2.0484848484848484, + "grad_norm": 0.0011533941142261028, + "learning_rate": 0.00014719937488532706, + "loss": 0.00021680027712136506, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00022, + "step": 2110, + "tokens/total": 34594816, + "tokens/train_per_sec_per_gpu": 14.83, + "tokens/trainable": 10954337 + }, + { + "epoch": 2.058181818181818, + "grad_norm": 0.0012934933183714747, + "learning_rate": 0.00014660139370541953, + "loss": 0.00015767107252031564, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00016, + "step": 2120, + "tokens/total": 34758656, + "tokens/train_per_sec_per_gpu": 14.22, + "tokens/trainable": 11006253 + }, + { + "epoch": 2.067878787878788, + "grad_norm": 0.00458933599293232, + "learning_rate": 0.00014600127623146388, + "loss": 0.0001101671252399683, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00011, + "step": 2130, + "tokens/total": 34922496, + "tokens/train_per_sec_per_gpu": 14.53, + "tokens/trainable": 11058062 + }, + { + "epoch": 2.0775757575757576, + "grad_norm": 0.0032617889810353518, + "learning_rate": 0.00014539904997395468, + "loss": 0.00019488829420879483, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00019, + "step": 2140, + "tokens/total": 35086336, + "tokens/train_per_sec_per_gpu": 14.33, + "tokens/trainable": 11109942 + }, + { + "epoch": 2.0872727272727274, + "grad_norm": 0.007860329002141953, + "learning_rate": 0.00014479474254005707, + "loss": 9.439463028684258e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00009, + "step": 2150, + "tokens/total": 35250176, + "tokens/train_per_sec_per_gpu": 15.29, + "tokens/trainable": 11161699 + }, + { + "epoch": 2.096969696969697, + "grad_norm": 0.0008931563934311271, + "learning_rate": 0.0001441883816323411, + "loss": 0.00016972824232652783, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00017, + "step": 2160, + "tokens/total": 35414016, + "tokens/train_per_sec_per_gpu": 15.51, + "tokens/trainable": 11213741 + }, + { + "epoch": 2.1066666666666665, + "grad_norm": 0.006945727858692408, + "learning_rate": 0.00014357999504751182, + "loss": 9.466245537623764e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00009, + "step": 2170, + "tokens/total": 35577856, + "tokens/train_per_sec_per_gpu": 14.46, + "tokens/trainable": 11265729 + }, + { + "epoch": 2.1163636363636362, + "grad_norm": 0.009756731800734997, + "learning_rate": 0.0001429696106751352, + "loss": 7.116884225979447e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00007, + "step": 2180, + "tokens/total": 35741696, + "tokens/train_per_sec_per_gpu": 14.9, + "tokens/trainable": 11318089 + }, + { + "epoch": 2.126060606060606, + "grad_norm": 0.003617421491071582, + "learning_rate": 0.00014235725649635933, + "loss": 0.00017703230259940027, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00018, + "step": 2190, + "tokens/total": 35905536, + "tokens/train_per_sec_per_gpu": 16.31, + "tokens/trainable": 11370159 + }, + { + "epoch": 2.1357575757575757, + "grad_norm": 0.0008388167480006814, + "learning_rate": 0.00014174296058263195, + "loss": 0.0002220547990873456, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00022, + "step": 2200, + "tokens/total": 36069376, + "tokens/train_per_sec_per_gpu": 15.31, + "tokens/trainable": 11422568 + }, + { + "epoch": 2.1454545454545455, + "grad_norm": 0.03691717982292175, + "learning_rate": 0.00014112675109441352, + "loss": 0.00018518726574257016, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00019, + "step": 2210, + "tokens/total": 36233216, + "tokens/train_per_sec_per_gpu": 14.81, + "tokens/trainable": 11473971 + }, + { + "epoch": 2.1551515151515153, + "grad_norm": 0.0008130021742545068, + "learning_rate": 0.0001405086562798863, + "loss": 0.0001568903331644833, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00016, + "step": 2220, + "tokens/total": 36397056, + "tokens/train_per_sec_per_gpu": 15.22, + "tokens/trainable": 11526106 + }, + { + "epoch": 2.164848484848485, + "grad_norm": 0.0014426361303776503, + "learning_rate": 0.00013988870447365933, + "loss": 0.00027461207937449215, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00027, + "step": 2230, + "tokens/total": 36560896, + "tokens/train_per_sec_per_gpu": 15.95, + "tokens/trainable": 11578483 + }, + { + "epoch": 2.174545454545455, + "grad_norm": 0.029341408982872963, + "learning_rate": 0.00013926692409546964, + "loss": 0.0003196842735633254, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00032, + "step": 2240, + "tokens/total": 36724736, + "tokens/train_per_sec_per_gpu": 16.15, + "tokens/trainable": 11630965 + }, + { + "epoch": 2.184242424242424, + "grad_norm": 0.00210795970633626, + "learning_rate": 0.00013864334364887943, + "loss": 0.0004162232857197523, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00042, + "step": 2250, + "tokens/total": 36888576, + "tokens/train_per_sec_per_gpu": 15.03, + "tokens/trainable": 11682642 + }, + { + "epoch": 2.193939393939394, + "grad_norm": 0.003121949266642332, + "learning_rate": 0.0001380179917199692, + "loss": 0.00042150220833718776, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00042, + "step": 2260, + "tokens/total": 37052416, + "tokens/train_per_sec_per_gpu": 15.47, + "tokens/trainable": 11734687 + }, + { + "epoch": 2.2036363636363636, + "grad_norm": 0.009584403596818447, + "learning_rate": 0.00013739089697602764, + "loss": 0.0003333257278427482, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00033, + "step": 2270, + "tokens/total": 37216256, + "tokens/train_per_sec_per_gpu": 14.88, + "tokens/trainable": 11786194 + }, + { + "epoch": 2.2133333333333334, + "grad_norm": 0.0031741419807076454, + "learning_rate": 0.00013676208816423724, + "loss": 0.00011245617642998695, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00011, + "step": 2280, + "tokens/total": 37380096, + "tokens/train_per_sec_per_gpu": 14.73, + "tokens/trainable": 11837550 + }, + { + "epoch": 2.223030303030303, + "grad_norm": 0.03865548223257065, + "learning_rate": 0.00013613159411035648, + "loss": 0.00020037838257849216, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0002, + "step": 2290, + "tokens/total": 37543936, + "tokens/train_per_sec_per_gpu": 15.33, + "tokens/trainable": 11889401 + }, + { + "epoch": 2.232727272727273, + "grad_norm": 0.012145821005105972, + "learning_rate": 0.00013549944371739854, + "loss": 0.00011074641952291131, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00011, + "step": 2300, + "tokens/total": 37707776, + "tokens/train_per_sec_per_gpu": 14.18, + "tokens/trainable": 11941616 + }, + { + "epoch": 2.242424242424242, + "grad_norm": 0.0009741581161506474, + "learning_rate": 0.00013486566596430623, + "loss": 0.00024885197635740044, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00025, + "step": 2310, + "tokens/total": 37871616, + "tokens/train_per_sec_per_gpu": 14.86, + "tokens/trainable": 11993896 + }, + { + "epoch": 2.252121212121212, + "grad_norm": 0.011996032670140266, + "learning_rate": 0.00013423028990462344, + "loss": 0.0003463976550847292, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00035, + "step": 2320, + "tokens/total": 38035456, + "tokens/train_per_sec_per_gpu": 14.09, + "tokens/trainable": 12045275 + }, + { + "epoch": 2.2618181818181817, + "grad_norm": 0.021751079708337784, + "learning_rate": 0.0001335933446651636, + "loss": 0.0008397232741117477, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00084, + "step": 2330, + "tokens/total": 38199296, + "tokens/train_per_sec_per_gpu": 15.05, + "tokens/trainable": 12096897 + }, + { + "epoch": 2.2715151515151515, + "grad_norm": 0.02025892771780491, + "learning_rate": 0.00013295485944467405, + "loss": 0.0005815276876091957, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00058, + "step": 2340, + "tokens/total": 38363136, + "tokens/train_per_sec_per_gpu": 14.28, + "tokens/trainable": 12148443 + }, + { + "epoch": 2.2812121212121212, + "grad_norm": 0.028191884979605675, + "learning_rate": 0.0001323148635124978, + "loss": 0.00035780200269073246, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00036, + "step": 2350, + "tokens/total": 38526976, + "tokens/train_per_sec_per_gpu": 14.5, + "tokens/trainable": 12200260 + }, + { + "epoch": 2.290909090909091, + "grad_norm": 0.018472714349627495, + "learning_rate": 0.00013167338620723165, + "loss": 0.0006046999711543322, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0006, + "step": 2360, + "tokens/total": 38690816, + "tokens/train_per_sec_per_gpu": 13.44, + "tokens/trainable": 12252405 + }, + { + "epoch": 2.3006060606060608, + "grad_norm": 0.018522929400205612, + "learning_rate": 0.00013103045693538135, + "loss": 0.000294373813085258, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00029, + "step": 2370, + "tokens/total": 38854656, + "tokens/train_per_sec_per_gpu": 15.42, + "tokens/trainable": 12304241 + }, + { + "epoch": 2.3103030303030305, + "grad_norm": 0.024094371125102043, + "learning_rate": 0.00013038610517001332, + "loss": 0.00027109310030937195, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00027, + "step": 2380, + "tokens/total": 39018496, + "tokens/train_per_sec_per_gpu": 14.73, + "tokens/trainable": 12356446 + }, + { + "epoch": 2.32, + "grad_norm": 0.019156360998749733, + "learning_rate": 0.0001297403604494039, + "loss": 0.00016260554548352957, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00016, + "step": 2390, + "tokens/total": 39182336, + "tokens/train_per_sec_per_gpu": 15.4, + "tokens/trainable": 12408205 + }, + { + "epoch": 2.3296969696969696, + "grad_norm": 0.030154094099998474, + "learning_rate": 0.00012909325237568496, + "loss": 0.0001862394856289029, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00019, + "step": 2400, + "tokens/total": 39346176, + "tokens/train_per_sec_per_gpu": 14.29, + "tokens/trainable": 12460035 + }, + { + "epoch": 2.3393939393939394, + "grad_norm": 0.0018396849045529962, + "learning_rate": 0.00012844481061348708, + "loss": 0.00013985306723043322, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00014, + "step": 2410, + "tokens/total": 39510016, + "tokens/train_per_sec_per_gpu": 14.37, + "tokens/trainable": 12512793 + }, + { + "epoch": 2.349090909090909, + "grad_norm": 0.007293887436389923, + "learning_rate": 0.00012779506488857945, + "loss": 0.0004945728462189436, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00049, + "step": 2420, + "tokens/total": 39673856, + "tokens/train_per_sec_per_gpu": 14.49, + "tokens/trainable": 12564678 + }, + { + "epoch": 2.358787878787879, + "grad_norm": 0.0013043258804827929, + "learning_rate": 0.00012714404498650743, + "loss": 0.0002628775080665946, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00026, + "step": 2430, + "tokens/total": 39837696, + "tokens/train_per_sec_per_gpu": 14.31, + "tokens/trainable": 12616633 + }, + { + "epoch": 2.3684848484848486, + "grad_norm": 0.00601148558780551, + "learning_rate": 0.00012649178075122702, + "loss": 0.0005043975077569484, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0005, + "step": 2440, + "tokens/total": 40001536, + "tokens/train_per_sec_per_gpu": 15.51, + "tokens/trainable": 12669042 + }, + { + "epoch": 2.378181818181818, + "grad_norm": 0.004092884249985218, + "learning_rate": 0.00012583830208373674, + "loss": 0.00020396907348185778, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0002, + "step": 2450, + "tokens/total": 40165376, + "tokens/train_per_sec_per_gpu": 14.54, + "tokens/trainable": 12720635 + }, + { + "epoch": 2.3878787878787877, + "grad_norm": 0.004112009424716234, + "learning_rate": 0.00012518363894070683, + "loss": 0.00010208101011812686, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0001, + "step": 2460, + "tokens/total": 40329216, + "tokens/train_per_sec_per_gpu": 15.1, + "tokens/trainable": 12772667 + }, + { + "epoch": 2.3975757575757575, + "grad_norm": 0.005660182796418667, + "learning_rate": 0.00012452782133310624, + "loss": 0.0001985645852982998, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0002, + "step": 2470, + "tokens/total": 40493056, + "tokens/train_per_sec_per_gpu": 14.85, + "tokens/trainable": 12824689 + }, + { + "epoch": 2.4072727272727272, + "grad_norm": 0.014492900110781193, + "learning_rate": 0.00012387087932482665, + "loss": 0.00014933901838958262, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00015, + "step": 2480, + "tokens/total": 40656896, + "tokens/train_per_sec_per_gpu": 15.28, + "tokens/trainable": 12876411 + }, + { + "epoch": 2.416969696969697, + "grad_norm": 0.0019427158404141665, + "learning_rate": 0.00012321284303130426, + "loss": 7.200292311608792e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00007, + "step": 2490, + "tokens/total": 40820736, + "tokens/train_per_sec_per_gpu": 15.06, + "tokens/trainable": 12928897 + }, + { + "epoch": 2.4266666666666667, + "grad_norm": 0.02509615570306778, + "learning_rate": 0.00012255374261813944, + "loss": 0.00043660206720232966, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00044, + "step": 2500, + "tokens/total": 40984576, + "tokens/train_per_sec_per_gpu": 14.48, + "tokens/trainable": 12980466 + }, + { + "epoch": 2.4363636363636365, + "grad_norm": 0.007349422667175531, + "learning_rate": 0.00012189360829971371, + "loss": 0.0001283957506529987, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00013, + "step": 2510, + "tokens/total": 41148416, + "tokens/train_per_sec_per_gpu": 15.15, + "tokens/trainable": 13032069 + }, + { + "epoch": 2.4460606060606063, + "grad_norm": 0.0029069455340504646, + "learning_rate": 0.00012123247033780476, + "loss": 6.898010615259409e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00007, + "step": 2520, + "tokens/total": 41312256, + "tokens/train_per_sec_per_gpu": 14.23, + "tokens/trainable": 13084418 + }, + { + "epoch": 2.4557575757575756, + "grad_norm": 0.010700283572077751, + "learning_rate": 0.00012057035904019913, + "loss": 0.00011750553967431188, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00012, + "step": 2530, + "tokens/total": 41476096, + "tokens/train_per_sec_per_gpu": 13.79, + "tokens/trainable": 13136606 + }, + { + "epoch": 2.4654545454545453, + "grad_norm": 0.002509322250261903, + "learning_rate": 0.00011990730475930288, + "loss": 0.0003227895824238658, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00032, + "step": 2540, + "tokens/total": 41639936, + "tokens/train_per_sec_per_gpu": 14.24, + "tokens/trainable": 13188322 + }, + { + "epoch": 2.475151515151515, + "grad_norm": 0.009015699848532677, + "learning_rate": 0.00011924333789075013, + "loss": 0.00032298346050083635, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00032, + "step": 2550, + "tokens/total": 41803776, + "tokens/train_per_sec_per_gpu": 14.52, + "tokens/trainable": 13240187 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.3949132561683655, + "learning_rate": 0.00011857848887200973, + "loss": 0.0007695606444031, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00077, + "step": 2560, + "tokens/total": 41967616, + "tokens/train_per_sec_per_gpu": 14.32, + "tokens/trainable": 13291657 + }, + { + "epoch": 2.4945454545454546, + "grad_norm": 0.0052930801175534725, + "learning_rate": 0.00011791278818098994, + "loss": 0.0016795439645648003, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00168, + "step": 2570, + "tokens/total": 42131456, + "tokens/train_per_sec_per_gpu": 14.83, + "tokens/trainable": 13343711 + }, + { + "epoch": 2.5042424242424244, + "grad_norm": 0.0013836952857673168, + "learning_rate": 0.00011724626633464127, + "loss": 0.0001935441978275776, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00019, + "step": 2580, + "tokens/total": 42295296, + "tokens/train_per_sec_per_gpu": 13.84, + "tokens/trainable": 13396092 + }, + { + "epoch": 2.5042424242424244, + "eval_loss": 8.247328514698893e-05, + "eval_ppl": 1.00008, + "eval_runtime": 12.3103, + "eval_samples_per_second": 16.247, + "eval_steps_per_second": 8.123, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 14.2, + "memory/max_allocated (GiB)": 14.2, + "step": 2580 + }, + { + "epoch": 2.5139393939393937, + "grad_norm": 0.003271307796239853, + "learning_rate": 0.00011657895388755742, + "loss": 8.508508908562362e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00009, + "step": 2590, + "tokens/total": 42459136, + "tokens/train_per_sec_per_gpu": 13.47, + "tokens/trainable": 13448209 + }, + { + "epoch": 2.5236363636363635, + "grad_norm": 0.0009727279539220035, + "learning_rate": 0.00011591088143057483, + "loss": 3.968240635003895e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00004, + "step": 2600, + "tokens/total": 42622976, + "tokens/train_per_sec_per_gpu": 13.54, + "tokens/trainable": 13499718 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 0.0010527002159506083, + "learning_rate": 0.00011524207958937001, + "loss": 0.00018399815307930113, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00018, + "step": 2610, + "tokens/total": 42786816, + "tokens/train_per_sec_per_gpu": 15.62, + "tokens/trainable": 13551815 + }, + { + "epoch": 2.543030303030303, + "grad_norm": 0.023774035274982452, + "learning_rate": 0.00011457257902305598, + "loss": 0.0003953744191676378, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0004, + "step": 2620, + "tokens/total": 42950656, + "tokens/train_per_sec_per_gpu": 14.46, + "tokens/trainable": 13603678 + }, + { + "epoch": 2.5527272727272727, + "grad_norm": 0.013190316036343575, + "learning_rate": 0.00011390241042277654, + "loss": 0.0005875382572412491, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00059, + "step": 2630, + "tokens/total": 43114496, + "tokens/train_per_sec_per_gpu": 13.34, + "tokens/trainable": 13655501 + }, + { + "epoch": 2.5624242424242425, + "grad_norm": 0.019383637234568596, + "learning_rate": 0.00011323160451029932, + "loss": 0.0002609423128888011, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00026, + "step": 2640, + "tokens/total": 43278336, + "tokens/train_per_sec_per_gpu": 15.43, + "tokens/trainable": 13707528 + }, + { + "epoch": 2.5721212121212123, + "grad_norm": 0.002637348370626569, + "learning_rate": 0.00011256019203660764, + "loss": 0.0003633877262473106, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00036, + "step": 2650, + "tokens/total": 43442176, + "tokens/train_per_sec_per_gpu": 13.95, + "tokens/trainable": 13758801 + }, + { + "epoch": 2.581818181818182, + "grad_norm": 0.008525248616933823, + "learning_rate": 0.00011188820378049065, + "loss": 0.000345646683126688, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00035, + "step": 2660, + "tokens/total": 43606016, + "tokens/train_per_sec_per_gpu": 16.61, + "tokens/trainable": 13810541 + }, + { + "epoch": 2.5915151515151518, + "grad_norm": 0.003398684086278081, + "learning_rate": 0.00011121567054713244, + "loss": 0.00010743099264800548, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00011, + "step": 2670, + "tokens/total": 43769856, + "tokens/train_per_sec_per_gpu": 13.59, + "tokens/trainable": 13861683 + }, + { + "epoch": 2.601212121212121, + "grad_norm": 0.048622433096170425, + "learning_rate": 0.00011054262316669986, + "loss": 0.0006771612912416458, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00068, + "step": 2680, + "tokens/total": 43933696, + "tokens/train_per_sec_per_gpu": 13.89, + "tokens/trainable": 13913157 + }, + { + "epoch": 2.610909090909091, + "grad_norm": 0.015018350444734097, + "learning_rate": 0.00010986909249292922, + "loss": 0.00019932850264012814, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0002, + "step": 2690, + "tokens/total": 44097536, + "tokens/train_per_sec_per_gpu": 14.55, + "tokens/trainable": 13965160 + }, + { + "epoch": 2.6206060606060606, + "grad_norm": 0.0012435365933924913, + "learning_rate": 0.00010919510940171189, + "loss": 5.868576117791235e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00006, + "step": 2700, + "tokens/total": 44261376, + "tokens/train_per_sec_per_gpu": 14.73, + "tokens/trainable": 14017024 + }, + { + "epoch": 2.6303030303030304, + "grad_norm": 0.0019523982191458344, + "learning_rate": 0.00010852070478967889, + "loss": 0.0001263051643036306, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00013, + "step": 2710, + "tokens/total": 44425216, + "tokens/train_per_sec_per_gpu": 13.44, + "tokens/trainable": 14068784 + }, + { + "epoch": 2.64, + "grad_norm": 0.007224493194371462, + "learning_rate": 0.0001078459095727845, + "loss": 0.0001929138321429491, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00019, + "step": 2720, + "tokens/total": 44589056, + "tokens/train_per_sec_per_gpu": 14.47, + "tokens/trainable": 14120598 + }, + { + "epoch": 2.6496969696969694, + "grad_norm": 0.047363366931676865, + "learning_rate": 0.00010717075468488913, + "loss": 0.00019309332128614187, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00019, + "step": 2730, + "tokens/total": 44752896, + "tokens/train_per_sec_per_gpu": 15.31, + "tokens/trainable": 14172560 + }, + { + "epoch": 2.659393939393939, + "grad_norm": 0.001373408129438758, + "learning_rate": 0.00010649527107634108, + "loss": 9.99198411591351e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0001, + "step": 2740, + "tokens/total": 44916736, + "tokens/train_per_sec_per_gpu": 14.44, + "tokens/trainable": 14223646 + }, + { + "epoch": 2.669090909090909, + "grad_norm": 0.0005223056650720537, + "learning_rate": 0.00010581948971255788, + "loss": 0.0001228376990184188, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00012, + "step": 2750, + "tokens/total": 45080576, + "tokens/train_per_sec_per_gpu": 14.26, + "tokens/trainable": 14275006 + }, + { + "epoch": 2.6787878787878787, + "grad_norm": 0.0011381276417523623, + "learning_rate": 0.00010514344157260673, + "loss": 5.9981108643114565e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00006, + "step": 2760, + "tokens/total": 45244416, + "tokens/train_per_sec_per_gpu": 13.74, + "tokens/trainable": 14327112 + }, + { + "epoch": 2.6884848484848485, + "grad_norm": 0.0028999936766922474, + "learning_rate": 0.00010446715764778423, + "loss": 0.0001589686726219952, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00016, + "step": 2770, + "tokens/total": 45408256, + "tokens/train_per_sec_per_gpu": 15.54, + "tokens/trainable": 14378961 + }, + { + "epoch": 2.6981818181818182, + "grad_norm": 0.0008248479571193457, + "learning_rate": 0.00010379066894019589, + "loss": 0.00013254316290840508, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00013, + "step": 2780, + "tokens/total": 45572096, + "tokens/train_per_sec_per_gpu": 14.41, + "tokens/trainable": 14430803 + }, + { + "epoch": 2.707878787878788, + "grad_norm": 0.00010997291246894747, + "learning_rate": 0.00010311400646133482, + "loss": 0.0001163567416369915, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00012, + "step": 2790, + "tokens/total": 45735936, + "tokens/train_per_sec_per_gpu": 14.58, + "tokens/trainable": 14482749 + }, + { + "epoch": 2.7175757575757578, + "grad_norm": 0.004438972566276789, + "learning_rate": 0.00010243720123066011, + "loss": 0.0008217763155698776, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00082, + "step": 2800, + "tokens/total": 45899776, + "tokens/train_per_sec_per_gpu": 13.59, + "tokens/trainable": 14534668 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 0.0006182301440276206, + "learning_rate": 0.0001017602842741749, + "loss": 0.00021976977586746216, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00022, + "step": 2810, + "tokens/total": 46063616, + "tokens/train_per_sec_per_gpu": 15.1, + "tokens/trainable": 14586586 + }, + { + "epoch": 2.736969696969697, + "grad_norm": 0.003250017762184143, + "learning_rate": 0.000101083286623004, + "loss": 0.00012328216107562184, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00012, + "step": 2820, + "tokens/total": 46227456, + "tokens/train_per_sec_per_gpu": 13.96, + "tokens/trainable": 14638808 + }, + { + "epoch": 2.7466666666666666, + "grad_norm": 0.010098662227392197, + "learning_rate": 0.00010040623931197144, + "loss": 7.462603389285505e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00007, + "step": 2830, + "tokens/total": 46391296, + "tokens/train_per_sec_per_gpu": 13.8, + "tokens/trainable": 14690613 + }, + { + "epoch": 2.7563636363636363, + "grad_norm": 0.002696437295526266, + "learning_rate": 9.972917337817771e-05, + "loss": 4.609748430084437e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00005, + "step": 2840, + "tokens/total": 46555136, + "tokens/train_per_sec_per_gpu": 14.39, + "tokens/trainable": 14742309 + }, + { + "epoch": 2.766060606060606, + "grad_norm": 0.0002640737220644951, + "learning_rate": 9.905211985957706e-05, + "loss": 9.76522103883326e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0001, + "step": 2850, + "tokens/total": 46718976, + "tokens/train_per_sec_per_gpu": 14.89, + "tokens/trainable": 14794633 + }, + { + "epoch": 2.775757575757576, + "grad_norm": 0.0023825804237276316, + "learning_rate": 9.837510979355457e-05, + "loss": 9.005467290990055e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00009, + "step": 2860, + "tokens/total": 46882816, + "tokens/train_per_sec_per_gpu": 13.85, + "tokens/trainable": 14846334 + }, + { + "epoch": 2.785454545454545, + "grad_norm": 0.007719525136053562, + "learning_rate": 9.769817421550335e-05, + "loss": 0.00035368206445127723, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00035, + "step": 2870, + "tokens/total": 47046656, + "tokens/train_per_sec_per_gpu": 14.04, + "tokens/trainable": 14898484 + }, + { + "epoch": 2.795151515151515, + "grad_norm": 0.0010807636426761746, + "learning_rate": 9.702134415740192e-05, + "loss": 9.26341162994504e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00009, + "step": 2880, + "tokens/total": 47210496, + "tokens/train_per_sec_per_gpu": 14.7, + "tokens/trainable": 14950418 + }, + { + "epoch": 2.8048484848484847, + "grad_norm": 0.02270282432436943, + "learning_rate": 9.634465064639153e-05, + "loss": 0.00013720652787014843, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00014, + "step": 2890, + "tokens/total": 47374336, + "tokens/train_per_sec_per_gpu": 13.75, + "tokens/trainable": 15002347 + }, + { + "epoch": 2.8145454545454545, + "grad_norm": 0.05273193120956421, + "learning_rate": 9.56681247033538e-05, + "loss": 0.0002461188472807407, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00025, + "step": 2900, + "tokens/total": 47538176, + "tokens/train_per_sec_per_gpu": 13.76, + "tokens/trainable": 15054325 + }, + { + "epoch": 2.824242424242424, + "grad_norm": 0.021871395409107208, + "learning_rate": 9.499179734148883e-05, + "loss": 9.564256761223078e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0001, + "step": 2910, + "tokens/total": 47702016, + "tokens/train_per_sec_per_gpu": 14.34, + "tokens/trainable": 15105722 + }, + { + "epoch": 2.833939393939394, + "grad_norm": 0.0173841193318367, + "learning_rate": 9.431569956489331e-05, + "loss": 0.00014969281619414687, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00015, + "step": 2920, + "tokens/total": 47865856, + "tokens/train_per_sec_per_gpu": 14.11, + "tokens/trainable": 15157622 + }, + { + "epoch": 2.8436363636363637, + "grad_norm": 0.015775226056575775, + "learning_rate": 9.363986236713933e-05, + "loss": 0.00022732678335160016, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00023, + "step": 2930, + "tokens/total": 48029696, + "tokens/train_per_sec_per_gpu": 13.86, + "tokens/trainable": 15208749 + }, + { + "epoch": 2.8533333333333335, + "grad_norm": 0.0024653058499097824, + "learning_rate": 9.296431672985363e-05, + "loss": 0.0001259389566257596, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00013, + "step": 2940, + "tokens/total": 48193536, + "tokens/train_per_sec_per_gpu": 14.64, + "tokens/trainable": 15260098 + }, + { + "epoch": 2.8630303030303033, + "grad_norm": 0.000619547616224736, + "learning_rate": 9.228909362129722e-05, + "loss": 7.931838044896721e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00008, + "step": 2950, + "tokens/total": 48357376, + "tokens/train_per_sec_per_gpu": 13.87, + "tokens/trainable": 15311590 + }, + { + "epoch": 2.8727272727272726, + "grad_norm": 0.016801398247480392, + "learning_rate": 9.16142239949458e-05, + "loss": 0.00022562453523278236, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00023, + "step": 2960, + "tokens/total": 48521216, + "tokens/train_per_sec_per_gpu": 14.63, + "tokens/trainable": 15363058 + }, + { + "epoch": 2.8824242424242423, + "grad_norm": 0.0022071890998631716, + "learning_rate": 9.093973878807072e-05, + "loss": 0.00012458593118935823, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00012, + "step": 2970, + "tokens/total": 48685056, + "tokens/train_per_sec_per_gpu": 14.62, + "tokens/trainable": 15415260 + }, + { + "epoch": 2.892121212121212, + "grad_norm": 0.004443019162863493, + "learning_rate": 9.026566892032105e-05, + "loss": 0.0001334903878159821, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00013, + "step": 2980, + "tokens/total": 48848896, + "tokens/train_per_sec_per_gpu": 14.95, + "tokens/trainable": 15466621 + }, + { + "epoch": 2.901818181818182, + "grad_norm": 0.0007753843092359602, + "learning_rate": 8.959204529230569e-05, + "loss": 0.00028287877794355156, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00028, + "step": 2990, + "tokens/total": 49012736, + "tokens/train_per_sec_per_gpu": 14.32, + "tokens/trainable": 15517914 + }, + { + "epoch": 2.9115151515151516, + "grad_norm": 0.0011952788336202502, + "learning_rate": 8.891889878417724e-05, + "loss": 0.000494527630507946, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00049, + "step": 3000, + "tokens/total": 49176576, + "tokens/train_per_sec_per_gpu": 14.87, + "tokens/trainable": 15569029 + }, + { + "epoch": 2.9212121212121214, + "grad_norm": 0.0041526807472109795, + "learning_rate": 8.824626025421626e-05, + "loss": 0.00010177484946325422, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0001, + "step": 3010, + "tokens/total": 49340416, + "tokens/train_per_sec_per_gpu": 15.38, + "tokens/trainable": 15620849 + }, + { + "epoch": 2.9309090909090907, + "grad_norm": 0.00011553156218724325, + "learning_rate": 8.757416053741649e-05, + "loss": 0.00010593911865726113, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00011, + "step": 3020, + "tokens/total": 49504256, + "tokens/train_per_sec_per_gpu": 14.3, + "tokens/trainable": 15671996 + }, + { + "epoch": 2.9406060606060604, + "grad_norm": 0.0024511946830898523, + "learning_rate": 8.690263044407168e-05, + "loss": 0.0001637642504647374, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00016, + "step": 3030, + "tokens/total": 49668096, + "tokens/train_per_sec_per_gpu": 14.01, + "tokens/trainable": 15723682 + }, + { + "epoch": 2.95030303030303, + "grad_norm": 0.007446048315614462, + "learning_rate": 8.62317007583628e-05, + "loss": 5.339759518392384e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00005, + "step": 3040, + "tokens/total": 49831936, + "tokens/train_per_sec_per_gpu": 15.18, + "tokens/trainable": 15775385 + }, + { + "epoch": 2.96, + "grad_norm": 0.0077268267050385475, + "learning_rate": 8.556140223694718e-05, + "loss": 0.00031895393040031194, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00032, + "step": 3050, + "tokens/total": 49995776, + "tokens/train_per_sec_per_gpu": 15.09, + "tokens/trainable": 15827301 + }, + { + "epoch": 2.9696969696969697, + "grad_norm": 0.038065724074840546, + "learning_rate": 8.489176560754834e-05, + "loss": 0.00015137892914935948, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00015, + "step": 3060, + "tokens/total": 50159616, + "tokens/train_per_sec_per_gpu": 15.25, + "tokens/trainable": 15879448 + }, + { + "epoch": 2.9793939393939395, + "grad_norm": 0.01792677491903305, + "learning_rate": 8.422282156754741e-05, + "loss": 0.00016337501583620905, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00016, + "step": 3070, + "tokens/total": 50323456, + "tokens/train_per_sec_per_gpu": 14.83, + "tokens/trainable": 15930723 + }, + { + "epoch": 2.9890909090909092, + "grad_norm": 0.03399665653705597, + "learning_rate": 8.355460078257607e-05, + "loss": 0.0003045425517484546, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0003, + "step": 3080, + "tokens/total": 50487296, + "tokens/train_per_sec_per_gpu": 14.33, + "tokens/trainable": 15981910 + }, + { + "epoch": 2.998787878787879, + "grad_norm": 0.029494913294911385, + "learning_rate": 8.288713388511047e-05, + "loss": 0.0003337380010634661, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00033, + "step": 3090, + "tokens/total": 50651136, + "tokens/train_per_sec_per_gpu": 15.43, + "tokens/trainable": 16034665 + } + ], + "logging_steps": 10, + "max_steps": 5155, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 1031, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1250980409971835e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}