diff --git "a/checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/trainer_state.json" "b/checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoints/math_operations/lora_sft_primitive_atomic_50k/checkpoint-4124/trainer_state.json" @@ -0,0 +1,5898 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.001939393939394, + "eval_steps": 516, + "global_step": 4124, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0, + "eval_loss": 0.8898435831069946, + "eval_ppl": 2.43475, + "eval_runtime": 12.6383, + "eval_samples_per_second": 15.825, + "eval_steps_per_second": 7.912, + "memory/device_reserved (GiB)": 13.84, + "memory/max_active (GiB)": 13.69, + "memory/max_allocated (GiB)": 13.69, + "step": 0 + }, + { + "epoch": 0.009696969696969697, + "grad_norm": 2.995619058609009, + "learning_rate": 3.4951456310679615e-06, + "loss": 0.8680612564086914, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 2.38229, + "step": 10, + "tokens/total": 163840, + "tokens/train_per_sec_per_gpu": 14.27, + "tokens/trainable": 51990 + }, + { + "epoch": 0.019393939393939394, + "grad_norm": 2.1244935989379883, + "learning_rate": 7.378640776699029e-06, + "loss": 0.7699687004089355, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 2.1597, + "step": 20, + "tokens/total": 327680, + "tokens/train_per_sec_per_gpu": 16.06, + "tokens/trainable": 104391 + }, + { + "epoch": 0.02909090909090909, + "grad_norm": 0.9706138372421265, + "learning_rate": 1.1262135922330098e-05, + "loss": 0.5319457054138184, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.70224, + "step": 30, + "tokens/total": 491520, + "tokens/train_per_sec_per_gpu": 16.48, + "tokens/trainable": 156787 + }, + { + "epoch": 0.03878787878787879, + "grad_norm": 0.7689842581748962, + "learning_rate": 1.5145631067961166e-05, + "loss": 0.30234951972961427, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.35303, + "step": 40, + "tokens/total": 655360, + "tokens/train_per_sec_per_gpu": 14.84, + "tokens/trainable": 208924 + }, + { + "epoch": 0.048484848484848485, + "grad_norm": 0.45850396156311035, + "learning_rate": 1.9029126213592234e-05, + "loss": 0.1519382953643799, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.16409, + "step": 50, + "tokens/total": 819200, + "tokens/train_per_sec_per_gpu": 14.61, + "tokens/trainable": 261170 + }, + { + "epoch": 0.05818181818181818, + "grad_norm": 0.41381561756134033, + "learning_rate": 2.29126213592233e-05, + "loss": 0.062263429164886475, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.06424, + "step": 60, + "tokens/total": 983040, + "tokens/train_per_sec_per_gpu": 14.19, + "tokens/trainable": 313808 + }, + { + "epoch": 0.06787878787878789, + "grad_norm": 0.4865979254245758, + "learning_rate": 2.6796116504854367e-05, + "loss": 0.018695920705795288, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.01887, + "step": 70, + "tokens/total": 1146880, + "tokens/train_per_sec_per_gpu": 14.62, + "tokens/trainable": 366068 + }, + { + "epoch": 0.07757575757575758, + "grad_norm": 0.39099738001823425, + "learning_rate": 3.067961165048544e-05, + "loss": 0.006136053055524826, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00615, + "step": 80, + "tokens/total": 1310720, + "tokens/train_per_sec_per_gpu": 13.81, + "tokens/trainable": 418120 + }, + { + "epoch": 0.08727272727272728, + "grad_norm": 0.08230593055486679, + "learning_rate": 3.456310679611651e-05, + "loss": 0.004204501211643219, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00421, + "step": 90, + "tokens/total": 1474560, + "tokens/train_per_sec_per_gpu": 15.07, + "tokens/trainable": 470244 + }, + { + "epoch": 0.09696969696969697, + "grad_norm": 0.13297680020332336, + "learning_rate": 3.844660194174757e-05, + "loss": 0.0036250378936529158, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00363, + "step": 100, + "tokens/total": 1638400, + "tokens/train_per_sec_per_gpu": 14.91, + "tokens/trainable": 522666 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 0.2430051565170288, + "learning_rate": 4.2330097087378647e-05, + "loss": 0.003873714804649353, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00388, + "step": 110, + "tokens/total": 1802240, + "tokens/train_per_sec_per_gpu": 14.17, + "tokens/trainable": 574329 + }, + { + "epoch": 0.11636363636363636, + "grad_norm": 0.09347938001155853, + "learning_rate": 4.621359223300971e-05, + "loss": 0.00237951148301363, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00238, + "step": 120, + "tokens/total": 1966080, + "tokens/train_per_sec_per_gpu": 14.33, + "tokens/trainable": 626194 + }, + { + "epoch": 0.12606060606060607, + "grad_norm": 0.13388365507125854, + "learning_rate": 5.0097087378640786e-05, + "loss": 0.0015400107949972153, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00154, + "step": 130, + "tokens/total": 2129920, + "tokens/train_per_sec_per_gpu": 14.01, + "tokens/trainable": 678140 + }, + { + "epoch": 0.13575757575757577, + "grad_norm": 0.13342970609664917, + "learning_rate": 5.398058252427185e-05, + "loss": 0.001996887102723122, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.002, + "step": 140, + "tokens/total": 2293760, + "tokens/train_per_sec_per_gpu": 14.41, + "tokens/trainable": 730201 + }, + { + "epoch": 0.14545454545454545, + "grad_norm": 0.0299234539270401, + "learning_rate": 5.786407766990292e-05, + "loss": 0.0015132850036025046, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00151, + "step": 150, + "tokens/total": 2457600, + "tokens/train_per_sec_per_gpu": 15.8, + "tokens/trainable": 782196 + }, + { + "epoch": 0.15515151515151515, + "grad_norm": 0.04437975212931633, + "learning_rate": 6.174757281553398e-05, + "loss": 0.0012883609160780907, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00129, + "step": 160, + "tokens/total": 2621440, + "tokens/train_per_sec_per_gpu": 14.64, + "tokens/trainable": 833614 + }, + { + "epoch": 0.16484848484848486, + "grad_norm": 0.014039761386811733, + "learning_rate": 6.563106796116505e-05, + "loss": 0.0011639594100415706, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00116, + "step": 170, + "tokens/total": 2785280, + "tokens/train_per_sec_per_gpu": 13.95, + "tokens/trainable": 885591 + }, + { + "epoch": 0.17454545454545456, + "grad_norm": 0.0033261056523770094, + "learning_rate": 6.951456310679612e-05, + "loss": 0.0007388167083263397, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00074, + "step": 180, + "tokens/total": 2949120, + "tokens/train_per_sec_per_gpu": 14.37, + "tokens/trainable": 937712 + }, + { + "epoch": 0.18424242424242424, + "grad_norm": 0.010476192459464073, + "learning_rate": 7.339805825242719e-05, + "loss": 0.0008642122149467469, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00086, + "step": 190, + "tokens/total": 3112960, + "tokens/train_per_sec_per_gpu": 15.52, + "tokens/trainable": 989913 + }, + { + "epoch": 0.19393939393939394, + "grad_norm": 0.01253255270421505, + "learning_rate": 7.728155339805826e-05, + "loss": 0.0007610846310853958, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00076, + "step": 200, + "tokens/total": 3276800, + "tokens/train_per_sec_per_gpu": 14.17, + "tokens/trainable": 1041978 + }, + { + "epoch": 0.20363636363636364, + "grad_norm": 0.01779557578265667, + "learning_rate": 8.116504854368933e-05, + "loss": 0.0007697530556470156, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00077, + "step": 210, + "tokens/total": 3440640, + "tokens/train_per_sec_per_gpu": 14.12, + "tokens/trainable": 1093395 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 0.16895800828933716, + "learning_rate": 8.504854368932039e-05, + "loss": 0.0006535804830491542, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00065, + "step": 220, + "tokens/total": 3604480, + "tokens/train_per_sec_per_gpu": 14.72, + "tokens/trainable": 1145329 + }, + { + "epoch": 0.22303030303030302, + "grad_norm": 0.08973463624715805, + "learning_rate": 8.893203883495146e-05, + "loss": 0.0009510296396911145, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00095, + "step": 230, + "tokens/total": 3768320, + "tokens/train_per_sec_per_gpu": 14.67, + "tokens/trainable": 1197537 + }, + { + "epoch": 0.23272727272727273, + "grad_norm": 0.044939588755369186, + "learning_rate": 9.281553398058253e-05, + "loss": 0.001187363639473915, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00119, + "step": 240, + "tokens/total": 3932160, + "tokens/train_per_sec_per_gpu": 15.39, + "tokens/trainable": 1249924 + }, + { + "epoch": 0.24242424242424243, + "grad_norm": 0.08850465714931488, + "learning_rate": 9.66990291262136e-05, + "loss": 0.0013382930308580398, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00134, + "step": 250, + "tokens/total": 4096000, + "tokens/train_per_sec_per_gpu": 15.06, + "tokens/trainable": 1301558 + }, + { + "epoch": 0.25212121212121213, + "grad_norm": 0.101528100669384, + "learning_rate": 0.00010058252427184467, + "loss": 0.0008709387853741646, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00087, + "step": 260, + "tokens/total": 4259840, + "tokens/train_per_sec_per_gpu": 15.16, + "tokens/trainable": 1353706 + }, + { + "epoch": 0.26181818181818184, + "grad_norm": 0.08298433572053909, + "learning_rate": 0.00010446601941747574, + "loss": 0.0013300922699272632, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00133, + "step": 270, + "tokens/total": 4423680, + "tokens/train_per_sec_per_gpu": 15.11, + "tokens/trainable": 1405519 + }, + { + "epoch": 0.27151515151515154, + "grad_norm": 0.03734389320015907, + "learning_rate": 0.00010834951456310681, + "loss": 0.0006868645548820495, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00069, + "step": 280, + "tokens/total": 4587520, + "tokens/train_per_sec_per_gpu": 15.07, + "tokens/trainable": 1457494 + }, + { + "epoch": 0.2812121212121212, + "grad_norm": 0.07898428291082382, + "learning_rate": 0.00011223300970873786, + "loss": 0.0013550779782235622, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00136, + "step": 290, + "tokens/total": 4751360, + "tokens/train_per_sec_per_gpu": 14.75, + "tokens/trainable": 1509320 + }, + { + "epoch": 0.2909090909090909, + "grad_norm": 0.06320006400346756, + "learning_rate": 0.00011611650485436893, + "loss": 0.0010121697559952736, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00101, + "step": 300, + "tokens/total": 4915200, + "tokens/train_per_sec_per_gpu": 14.19, + "tokens/trainable": 1561332 + }, + { + "epoch": 0.3006060606060606, + "grad_norm": 0.013749867677688599, + "learning_rate": 0.00012, + "loss": 0.0006499682553112507, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00065, + "step": 310, + "tokens/total": 5079040, + "tokens/train_per_sec_per_gpu": 14.84, + "tokens/trainable": 1613189 + }, + { + "epoch": 0.3103030303030303, + "grad_norm": 0.033964402973651886, + "learning_rate": 0.00012388349514563107, + "loss": 0.0008866124786436558, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00089, + "step": 320, + "tokens/total": 5242880, + "tokens/train_per_sec_per_gpu": 15.78, + "tokens/trainable": 1665681 + }, + { + "epoch": 0.32, + "grad_norm": 0.04327597841620445, + "learning_rate": 0.00012776699029126213, + "loss": 0.0005569641944020987, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00056, + "step": 330, + "tokens/total": 5406720, + "tokens/train_per_sec_per_gpu": 14.92, + "tokens/trainable": 1718317 + }, + { + "epoch": 0.3296969696969697, + "grad_norm": 0.02717934548854828, + "learning_rate": 0.0001316504854368932, + "loss": 0.0003776244120672345, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00038, + "step": 340, + "tokens/total": 5570560, + "tokens/train_per_sec_per_gpu": 14.42, + "tokens/trainable": 1770210 + }, + { + "epoch": 0.3393939393939394, + "grad_norm": 0.0028237912338227034, + "learning_rate": 0.0001355339805825243, + "loss": 0.0005292522720992566, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00053, + "step": 350, + "tokens/total": 5734400, + "tokens/train_per_sec_per_gpu": 16.4, + "tokens/trainable": 1821987 + }, + { + "epoch": 0.3490909090909091, + "grad_norm": 0.0310799703001976, + "learning_rate": 0.00013941747572815535, + "loss": 0.0006786303594708443, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00068, + "step": 360, + "tokens/total": 5898240, + "tokens/train_per_sec_per_gpu": 14.72, + "tokens/trainable": 1874266 + }, + { + "epoch": 0.35878787878787877, + "grad_norm": 0.17325043678283691, + "learning_rate": 0.0001433009708737864, + "loss": 0.0013975565321743487, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0014, + "step": 370, + "tokens/total": 6062080, + "tokens/train_per_sec_per_gpu": 13.73, + "tokens/trainable": 1926124 + }, + { + "epoch": 0.36848484848484847, + "grad_norm": 0.07738752663135529, + "learning_rate": 0.0001471844660194175, + "loss": 0.0006820175796747208, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00068, + "step": 380, + "tokens/total": 6225920, + "tokens/train_per_sec_per_gpu": 14.04, + "tokens/trainable": 1978693 + }, + { + "epoch": 0.3781818181818182, + "grad_norm": 0.10022349655628204, + "learning_rate": 0.00015106796116504855, + "loss": 0.00063879219815135, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00064, + "step": 390, + "tokens/total": 6389760, + "tokens/train_per_sec_per_gpu": 13.34, + "tokens/trainable": 2030378 + }, + { + "epoch": 0.3878787878787879, + "grad_norm": 0.0495997779071331, + "learning_rate": 0.00015495145631067963, + "loss": 0.0021283581852912905, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00213, + "step": 400, + "tokens/total": 6553600, + "tokens/train_per_sec_per_gpu": 15.34, + "tokens/trainable": 2083047 + }, + { + "epoch": 0.3975757575757576, + "grad_norm": 0.07361701130867004, + "learning_rate": 0.0001588349514563107, + "loss": 0.001862115040421486, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00186, + "step": 410, + "tokens/total": 6717440, + "tokens/train_per_sec_per_gpu": 14.23, + "tokens/trainable": 2135527 + }, + { + "epoch": 0.4072727272727273, + "grad_norm": 0.05466209724545479, + "learning_rate": 0.00016271844660194174, + "loss": 0.0011581303551793098, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00116, + "step": 420, + "tokens/total": 6881280, + "tokens/train_per_sec_per_gpu": 14.77, + "tokens/trainable": 2187636 + }, + { + "epoch": 0.416969696969697, + "grad_norm": 0.04331392049789429, + "learning_rate": 0.00016660194174757283, + "loss": 0.0051729224622249605, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00519, + "step": 430, + "tokens/total": 7045120, + "tokens/train_per_sec_per_gpu": 13.76, + "tokens/trainable": 2239006 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 0.05931795388460159, + "learning_rate": 0.00017048543689320388, + "loss": 0.00242764875292778, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00243, + "step": 440, + "tokens/total": 7208960, + "tokens/train_per_sec_per_gpu": 14.59, + "tokens/trainable": 2290540 + }, + { + "epoch": 0.43636363636363634, + "grad_norm": 0.04634418711066246, + "learning_rate": 0.00017436893203883494, + "loss": 0.001389546226710081, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00139, + "step": 450, + "tokens/total": 7372800, + "tokens/train_per_sec_per_gpu": 14.78, + "tokens/trainable": 2341852 + }, + { + "epoch": 0.44606060606060605, + "grad_norm": 0.04817213863134384, + "learning_rate": 0.00017825242718446602, + "loss": 0.001370794139802456, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00137, + "step": 460, + "tokens/total": 7536640, + "tokens/train_per_sec_per_gpu": 13.77, + "tokens/trainable": 2393320 + }, + { + "epoch": 0.45575757575757575, + "grad_norm": 0.011335949413478374, + "learning_rate": 0.00018213592233009708, + "loss": 0.0009715131483972073, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00097, + "step": 470, + "tokens/total": 7700480, + "tokens/train_per_sec_per_gpu": 14.52, + "tokens/trainable": 2445170 + }, + { + "epoch": 0.46545454545454545, + "grad_norm": 0.05298445746302605, + "learning_rate": 0.00018601941747572816, + "loss": 0.0008222623728215694, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00082, + "step": 480, + "tokens/total": 7864320, + "tokens/train_per_sec_per_gpu": 13.87, + "tokens/trainable": 2497473 + }, + { + "epoch": 0.47515151515151516, + "grad_norm": 0.061686884611845016, + "learning_rate": 0.00018990291262135925, + "loss": 0.000748783303424716, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00075, + "step": 490, + "tokens/total": 8028160, + "tokens/train_per_sec_per_gpu": 15.41, + "tokens/trainable": 2549206 + }, + { + "epoch": 0.48484848484848486, + "grad_norm": 0.03281249850988388, + "learning_rate": 0.0001937864077669903, + "loss": 0.0006062469445168972, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00061, + "step": 500, + "tokens/total": 8192000, + "tokens/train_per_sec_per_gpu": 14.49, + "tokens/trainable": 2600583 + }, + { + "epoch": 0.49454545454545457, + "grad_norm": 0.008482079952955246, + "learning_rate": 0.0001976699029126214, + "loss": 0.0008583014830946922, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00086, + "step": 510, + "tokens/total": 8355840, + "tokens/train_per_sec_per_gpu": 13.86, + "tokens/trainable": 2652927 + }, + { + "epoch": 0.5003636363636363, + "eval_loss": 0.0009036393603309989, + "eval_ppl": 1.0009, + "eval_runtime": 12.7872, + "eval_samples_per_second": 15.641, + "eval_steps_per_second": 7.82, + "memory/device_reserved (GiB)": 18.85, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "step": 516 + }, + { + "epoch": 0.5042424242424243, + "grad_norm": 0.04333305358886719, + "learning_rate": 0.0001999996332640321, + "loss": 0.0005093200132250785, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00051, + "step": 520, + "tokens/total": 8519680, + "tokens/train_per_sec_per_gpu": 14.09, + "tokens/trainable": 2705083 + }, + { + "epoch": 0.5139393939393939, + "grad_norm": 0.02485118806362152, + "learning_rate": 0.00019999550751528488, + "loss": 0.0006649125367403031, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00067, + "step": 530, + "tokens/total": 8683520, + "tokens/train_per_sec_per_gpu": 14.44, + "tokens/trainable": 2756975 + }, + { + "epoch": 0.5236363636363637, + "grad_norm": 0.03736363351345062, + "learning_rate": 0.00019998679778759294, + "loss": 0.0006726076360791921, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00067, + "step": 540, + "tokens/total": 8847360, + "tokens/train_per_sec_per_gpu": 14.16, + "tokens/trainable": 2808076 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.05156765505671501, + "learning_rate": 0.0001999735044802263, + "loss": 0.000789718609303236, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00079, + "step": 550, + "tokens/total": 9011200, + "tokens/train_per_sec_per_gpu": 16.36, + "tokens/trainable": 2859893 + }, + { + "epoch": 0.5430303030303031, + "grad_norm": 0.647550106048584, + "learning_rate": 0.00019995562820257474, + "loss": 0.003008325584232807, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00301, + "step": 560, + "tokens/total": 9175040, + "tokens/train_per_sec_per_gpu": 14.21, + "tokens/trainable": 2911399 + }, + { + "epoch": 0.5527272727272727, + "grad_norm": 0.185165673494339, + "learning_rate": 0.00019993316977411993, + "loss": 0.013715097308158874, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.01381, + "step": 570, + "tokens/total": 9338880, + "tokens/train_per_sec_per_gpu": 13.85, + "tokens/trainable": 2962403 + }, + { + "epoch": 0.5624242424242424, + "grad_norm": 0.2401553839445114, + "learning_rate": 0.0001999061302243977, + "loss": 0.009026474505662917, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00907, + "step": 580, + "tokens/total": 9502720, + "tokens/train_per_sec_per_gpu": 14.38, + "tokens/trainable": 3015083 + }, + { + "epoch": 0.5721212121212121, + "grad_norm": 0.08092579245567322, + "learning_rate": 0.000199874510792951, + "loss": 0.005716494470834732, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00573, + "step": 590, + "tokens/total": 9666560, + "tokens/train_per_sec_per_gpu": 16.38, + "tokens/trainable": 3066501 + }, + { + "epoch": 0.5818181818181818, + "grad_norm": 3.418715476989746, + "learning_rate": 0.00019983831292927305, + "loss": 0.048504295945167544, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0497, + "step": 600, + "tokens/total": 9830400, + "tokens/train_per_sec_per_gpu": 14.23, + "tokens/trainable": 3118633 + }, + { + "epoch": 0.5915151515151515, + "grad_norm": 0.2194036841392517, + "learning_rate": 0.00019979753829274085, + "loss": 0.03429323434829712, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.03489, + "step": 610, + "tokens/total": 9994240, + "tokens/train_per_sec_per_gpu": 13.14, + "tokens/trainable": 3170577 + }, + { + "epoch": 0.6012121212121212, + "grad_norm": 0.022929901257157326, + "learning_rate": 0.0001997521887525391, + "loss": 0.0015171168372035027, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00152, + "step": 620, + "tokens/total": 10158080, + "tokens/train_per_sec_per_gpu": 14.24, + "tokens/trainable": 3221696 + }, + { + "epoch": 0.610909090909091, + "grad_norm": 0.10083670169115067, + "learning_rate": 0.00019970226638757458, + "loss": 0.0025377947837114333, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00254, + "step": 630, + "tokens/total": 10321920, + "tokens/train_per_sec_per_gpu": 14.7, + "tokens/trainable": 3273775 + }, + { + "epoch": 0.6206060606060606, + "grad_norm": 0.01761380024254322, + "learning_rate": 0.00019964777348638083, + "loss": 0.002281896211206913, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00228, + "step": 640, + "tokens/total": 10485760, + "tokens/train_per_sec_per_gpu": 14.89, + "tokens/trainable": 3325516 + }, + { + "epoch": 0.6303030303030303, + "grad_norm": 0.004510029684752226, + "learning_rate": 0.00019958871254701315, + "loss": 0.0009477110579609871, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00095, + "step": 650, + "tokens/total": 10649600, + "tokens/train_per_sec_per_gpu": 16.46, + "tokens/trainable": 3377214 + }, + { + "epoch": 0.64, + "grad_norm": 0.05332477018237114, + "learning_rate": 0.0001995250862769342, + "loss": 0.0005660496186465025, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00057, + "step": 660, + "tokens/total": 10813440, + "tokens/train_per_sec_per_gpu": 14.52, + "tokens/trainable": 3428627 + }, + { + "epoch": 0.6496969696969697, + "grad_norm": 0.03861689195036888, + "learning_rate": 0.0001994568975928899, + "loss": 0.0008976863697171211, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0009, + "step": 670, + "tokens/total": 10977280, + "tokens/train_per_sec_per_gpu": 15.66, + "tokens/trainable": 3480170 + }, + { + "epoch": 0.6593939393939394, + "grad_norm": 0.021123304963111877, + "learning_rate": 0.00019938414962077553, + "loss": 0.0009612766094505787, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00096, + "step": 680, + "tokens/total": 11141120, + "tokens/train_per_sec_per_gpu": 15.15, + "tokens/trainable": 3532037 + }, + { + "epoch": 0.6690909090909091, + "grad_norm": 0.02421347238123417, + "learning_rate": 0.00019930684569549264, + "loss": 0.001021684519946575, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00102, + "step": 690, + "tokens/total": 11304960, + "tokens/train_per_sec_per_gpu": 14.16, + "tokens/trainable": 3583461 + }, + { + "epoch": 0.6787878787878788, + "grad_norm": 0.05008835345506668, + "learning_rate": 0.00019922498936079613, + "loss": 0.0007617876864969731, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00076, + "step": 700, + "tokens/total": 11468800, + "tokens/train_per_sec_per_gpu": 14.08, + "tokens/trainable": 3634649 + }, + { + "epoch": 0.6884848484848485, + "grad_norm": 0.035733792930841446, + "learning_rate": 0.00019913858436913171, + "loss": 0.0012347914278507232, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00124, + "step": 710, + "tokens/total": 11632640, + "tokens/train_per_sec_per_gpu": 14.45, + "tokens/trainable": 3685786 + }, + { + "epoch": 0.6981818181818182, + "grad_norm": 0.010948767885565758, + "learning_rate": 0.00019904763468146393, + "loss": 0.0008165687322616577, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00082, + "step": 720, + "tokens/total": 11796480, + "tokens/train_per_sec_per_gpu": 15.77, + "tokens/trainable": 3737566 + }, + { + "epoch": 0.7078787878787879, + "grad_norm": 0.03577027469873428, + "learning_rate": 0.00019895214446709463, + "loss": 0.001333119161427021, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00133, + "step": 730, + "tokens/total": 11960320, + "tokens/train_per_sec_per_gpu": 13.98, + "tokens/trainable": 3789817 + }, + { + "epoch": 0.7175757575757575, + "grad_norm": 0.03971279785037041, + "learning_rate": 0.00019885211810347184, + "loss": 0.0011184611357748508, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00112, + "step": 740, + "tokens/total": 12124160, + "tokens/train_per_sec_per_gpu": 14.67, + "tokens/trainable": 3841912 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 0.06546575576066971, + "learning_rate": 0.00019874756017598894, + "loss": 0.0012452728115022182, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00125, + "step": 750, + "tokens/total": 12288000, + "tokens/train_per_sec_per_gpu": 14.58, + "tokens/trainable": 3893725 + }, + { + "epoch": 0.7369696969696969, + "grad_norm": 0.047058816999197006, + "learning_rate": 0.00019863847547777467, + "loss": 0.0008146104402840138, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00081, + "step": 760, + "tokens/total": 12451840, + "tokens/train_per_sec_per_gpu": 13.49, + "tokens/trainable": 3945033 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 0.028811641037464142, + "learning_rate": 0.00019852486900947327, + "loss": 0.0008652995340526104, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00087, + "step": 770, + "tokens/total": 12615680, + "tokens/train_per_sec_per_gpu": 15.12, + "tokens/trainable": 3996749 + }, + { + "epoch": 0.7563636363636363, + "grad_norm": 0.012203546240925789, + "learning_rate": 0.0001984067459790153, + "loss": 0.000670672720298171, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00067, + "step": 780, + "tokens/total": 12779520, + "tokens/train_per_sec_per_gpu": 13.71, + "tokens/trainable": 4048173 + }, + { + "epoch": 0.7660606060606061, + "grad_norm": 0.016218814998865128, + "learning_rate": 0.0001982841118013789, + "loss": 0.00046353964135050776, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00046, + "step": 790, + "tokens/total": 12943360, + "tokens/train_per_sec_per_gpu": 15.1, + "tokens/trainable": 4099789 + }, + { + "epoch": 0.7757575757575758, + "grad_norm": 0.034673016518354416, + "learning_rate": 0.00019815697209834147, + "loss": 0.000707306619733572, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00071, + "step": 800, + "tokens/total": 13107200, + "tokens/train_per_sec_per_gpu": 14.45, + "tokens/trainable": 4150960 + }, + { + "epoch": 0.7854545454545454, + "grad_norm": 0.0022127812262624502, + "learning_rate": 0.00019802533269822208, + "loss": 0.00021896373946219682, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00022, + "step": 810, + "tokens/total": 13271040, + "tokens/train_per_sec_per_gpu": 14.75, + "tokens/trainable": 4202984 + }, + { + "epoch": 0.7951515151515152, + "grad_norm": 0.000919274752959609, + "learning_rate": 0.00019788919963561422, + "loss": 0.00043264860287308695, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00043, + "step": 820, + "tokens/total": 13434880, + "tokens/train_per_sec_per_gpu": 14.06, + "tokens/trainable": 4254907 + }, + { + "epoch": 0.8048484848484848, + "grad_norm": 0.007699873298406601, + "learning_rate": 0.00019774857915110913, + "loss": 0.0003196246922016144, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00032, + "step": 830, + "tokens/total": 13598720, + "tokens/train_per_sec_per_gpu": 14.75, + "tokens/trainable": 4306095 + }, + { + "epoch": 0.8145454545454546, + "grad_norm": 0.015523642301559448, + "learning_rate": 0.00019760347769100987, + "loss": 0.0004476988688111305, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00045, + "step": 840, + "tokens/total": 13762560, + "tokens/train_per_sec_per_gpu": 14.14, + "tokens/trainable": 4357442 + }, + { + "epoch": 0.8242424242424242, + "grad_norm": 0.013460986316204071, + "learning_rate": 0.00019745390190703565, + "loss": 0.0004673306830227375, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00047, + "step": 850, + "tokens/total": 13926400, + "tokens/train_per_sec_per_gpu": 14.1, + "tokens/trainable": 4409277 + }, + { + "epoch": 0.833939393939394, + "grad_norm": 0.0014691110700368881, + "learning_rate": 0.0001972998586560169, + "loss": 0.0003277578856796026, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00033, + "step": 860, + "tokens/total": 14090240, + "tokens/train_per_sec_per_gpu": 14.28, + "tokens/trainable": 4460714 + }, + { + "epoch": 0.8436363636363636, + "grad_norm": 0.001358041656203568, + "learning_rate": 0.00019714135499958112, + "loss": 0.00032470382284373046, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00032, + "step": 870, + "tokens/total": 14254080, + "tokens/train_per_sec_per_gpu": 13.85, + "tokens/trainable": 4511989 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 0.04510723799467087, + "learning_rate": 0.0001969783982038289, + "loss": 0.00023182881996035575, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00023, + "step": 880, + "tokens/total": 14417920, + "tokens/train_per_sec_per_gpu": 15.41, + "tokens/trainable": 4563354 + }, + { + "epoch": 0.863030303030303, + "grad_norm": 0.14508692920207977, + "learning_rate": 0.00019681099573900113, + "loss": 0.00026136748492717744, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00026, + "step": 890, + "tokens/total": 14581760, + "tokens/train_per_sec_per_gpu": 13.85, + "tokens/trainable": 4615691 + }, + { + "epoch": 0.8727272727272727, + "grad_norm": 0.010969490744173527, + "learning_rate": 0.00019663915527913625, + "loss": 0.00016044279327616097, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00016, + "step": 900, + "tokens/total": 14745600, + "tokens/train_per_sec_per_gpu": 15.76, + "tokens/trainable": 4667433 + }, + { + "epoch": 0.8824242424242424, + "grad_norm": 0.03874114155769348, + "learning_rate": 0.00019646288470171868, + "loss": 0.0004159804433584213, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00042, + "step": 910, + "tokens/total": 14909440, + "tokens/train_per_sec_per_gpu": 16.01, + "tokens/trainable": 4719807 + }, + { + "epoch": 0.8921212121212121, + "grad_norm": 0.044620465487241745, + "learning_rate": 0.00019628219208731756, + "loss": 0.0006739750038832426, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00067, + "step": 920, + "tokens/total": 15073280, + "tokens/train_per_sec_per_gpu": 15.05, + "tokens/trainable": 4771772 + }, + { + "epoch": 0.9018181818181819, + "grad_norm": 0.024856949225068092, + "learning_rate": 0.00019609708571921645, + "loss": 0.00039347023703157903, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00039, + "step": 930, + "tokens/total": 15237120, + "tokens/train_per_sec_per_gpu": 15.16, + "tokens/trainable": 4823415 + }, + { + "epoch": 0.9115151515151515, + "grad_norm": 0.022198157384991646, + "learning_rate": 0.0001959075740830335, + "loss": 0.0005907822400331497, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00059, + "step": 940, + "tokens/total": 15400960, + "tokens/train_per_sec_per_gpu": 15.36, + "tokens/trainable": 4875269 + }, + { + "epoch": 0.9212121212121213, + "grad_norm": 0.01670038513839245, + "learning_rate": 0.00019571366586633245, + "loss": 0.00027316866908222437, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00027, + "step": 950, + "tokens/total": 15564800, + "tokens/train_per_sec_per_gpu": 15.11, + "tokens/trainable": 4927244 + }, + { + "epoch": 0.9309090909090909, + "grad_norm": 0.021392742171883583, + "learning_rate": 0.00019551536995822454, + "loss": 0.0004320886451750994, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00043, + "step": 960, + "tokens/total": 15728640, + "tokens/train_per_sec_per_gpu": 14.16, + "tokens/trainable": 4979068 + }, + { + "epoch": 0.9406060606060606, + "grad_norm": 0.028143158182501793, + "learning_rate": 0.00019531269544896076, + "loss": 0.0005637989845126868, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00056, + "step": 970, + "tokens/total": 15892480, + "tokens/train_per_sec_per_gpu": 14.26, + "tokens/trainable": 5030980 + }, + { + "epoch": 0.9503030303030303, + "grad_norm": 0.077091746032238, + "learning_rate": 0.00019510565162951537, + "loss": 0.0010597245767712594, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00106, + "step": 980, + "tokens/total": 16056320, + "tokens/train_per_sec_per_gpu": 14.04, + "tokens/trainable": 5082759 + }, + { + "epoch": 0.96, + "grad_norm": 0.04455556347966194, + "learning_rate": 0.00019489424799115984, + "loss": 0.0009517236612737179, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00095, + "step": 990, + "tokens/total": 16220160, + "tokens/train_per_sec_per_gpu": 13.04, + "tokens/trainable": 5134379 + }, + { + "epoch": 0.9696969696969697, + "grad_norm": 0.03573840856552124, + "learning_rate": 0.00019467849422502784, + "loss": 0.0008812972344458103, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00088, + "step": 1000, + "tokens/total": 16384000, + "tokens/train_per_sec_per_gpu": 15.23, + "tokens/trainable": 5186184 + }, + { + "epoch": 0.9793939393939394, + "grad_norm": 0.0006549305398948491, + "learning_rate": 0.0001944584002216709, + "loss": 0.0006358013488352299, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00064, + "step": 1010, + "tokens/total": 16547840, + "tokens/train_per_sec_per_gpu": 16.1, + "tokens/trainable": 5238320 + }, + { + "epoch": 0.9890909090909091, + "grad_norm": 0.021742813289165497, + "learning_rate": 0.00019423397607060507, + "loss": 0.000400003744289279, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0004, + "step": 1020, + "tokens/total": 16711680, + "tokens/train_per_sec_per_gpu": 14.53, + "tokens/trainable": 5290445 + }, + { + "epoch": 0.9987878787878788, + "grad_norm": 0.04323820024728775, + "learning_rate": 0.00019400523205984833, + "loss": 0.0002954686991870403, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0003, + "step": 1030, + "tokens/total": 16875520, + "tokens/train_per_sec_per_gpu": 14.98, + "tokens/trainable": 5342720 + }, + { + "epoch": 1.001939393939394, + "eval_loss": 0.00047458006883971393, + "eval_ppl": 1.00047, + "eval_runtime": 11.7938, + "eval_samples_per_second": 16.958, + "eval_steps_per_second": 8.479, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.73, + "memory/max_allocated (GiB)": 16.73, + "step": 1032 + }, + { + "epoch": 1.0096969696969698, + "grad_norm": 0.000988126266747713, + "learning_rate": 0.00019377217867544907, + "loss": 0.0004762394353747368, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00048, + "step": 1040, + "tokens/total": 17051648, + "tokens/train_per_sec_per_gpu": 14.47, + "tokens/trainable": 5398184 + }, + { + "epoch": 1.0193939393939393, + "grad_norm": 0.0011711094994097948, + "learning_rate": 0.00019353482660100537, + "loss": 0.00022675264626741408, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00023, + "step": 1050, + "tokens/total": 17215488, + "tokens/train_per_sec_per_gpu": 14.05, + "tokens/trainable": 5450329 + }, + { + "epoch": 1.029090909090909, + "grad_norm": 0.007319436874240637, + "learning_rate": 0.0001932931867171751, + "loss": 0.0003059083363041282, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00031, + "step": 1060, + "tokens/total": 17379328, + "tokens/train_per_sec_per_gpu": 13.66, + "tokens/trainable": 5502706 + }, + { + "epoch": 1.0387878787878788, + "grad_norm": 0.00967186689376831, + "learning_rate": 0.0001930472701011773, + "loss": 0.0003639918984845281, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00036, + "step": 1070, + "tokens/total": 17543168, + "tokens/train_per_sec_per_gpu": 15.36, + "tokens/trainable": 5554957 + }, + { + "epoch": 1.0484848484848486, + "grad_norm": 0.0018478024285286665, + "learning_rate": 0.00019279708802628437, + "loss": 0.0002576910424977541, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00026, + "step": 1080, + "tokens/total": 17707008, + "tokens/train_per_sec_per_gpu": 14.73, + "tokens/trainable": 5607534 + }, + { + "epoch": 1.0581818181818181, + "grad_norm": 0.018235478550195694, + "learning_rate": 0.00019254265196130517, + "loss": 0.0003647733014076948, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00036, + "step": 1090, + "tokens/total": 17870848, + "tokens/train_per_sec_per_gpu": 14.24, + "tokens/trainable": 5659689 + }, + { + "epoch": 1.0678787878787879, + "grad_norm": 0.024314021691679955, + "learning_rate": 0.0001922839735700593, + "loss": 0.00030459570698440077, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0003, + "step": 1100, + "tokens/total": 18034688, + "tokens/train_per_sec_per_gpu": 13.67, + "tokens/trainable": 5711346 + }, + { + "epoch": 1.0775757575757576, + "grad_norm": 0.0177497286349535, + "learning_rate": 0.0001920210647108425, + "loss": 0.00023341022897511722, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00023, + "step": 1110, + "tokens/total": 18198528, + "tokens/train_per_sec_per_gpu": 14.13, + "tokens/trainable": 5763094 + }, + { + "epoch": 1.0872727272727274, + "grad_norm": 0.005781313870102167, + "learning_rate": 0.00019175393743588295, + "loss": 0.0002974884817376733, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0003, + "step": 1120, + "tokens/total": 18362368, + "tokens/train_per_sec_per_gpu": 14.55, + "tokens/trainable": 5815101 + }, + { + "epoch": 1.096969696969697, + "grad_norm": 0.0026403339579701424, + "learning_rate": 0.00019148260399078887, + "loss": 0.00010604445124045015, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00011, + "step": 1130, + "tokens/total": 18526208, + "tokens/train_per_sec_per_gpu": 13.87, + "tokens/trainable": 5866763 + }, + { + "epoch": 1.1066666666666667, + "grad_norm": 0.03586777299642563, + "learning_rate": 0.000191207076813987, + "loss": 0.00027820770628750324, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00028, + "step": 1140, + "tokens/total": 18690048, + "tokens/train_per_sec_per_gpu": 13.83, + "tokens/trainable": 5918322 + }, + { + "epoch": 1.1163636363636364, + "grad_norm": 0.007715190295130014, + "learning_rate": 0.00019092736853615257, + "loss": 0.00029321699403226373, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00029, + "step": 1150, + "tokens/total": 18853888, + "tokens/train_per_sec_per_gpu": 13.95, + "tokens/trainable": 5970153 + }, + { + "epoch": 1.126060606060606, + "grad_norm": 0.05122547224164009, + "learning_rate": 0.00019064349197963013, + "loss": 0.0005070990417152643, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00051, + "step": 1160, + "tokens/total": 19017728, + "tokens/train_per_sec_per_gpu": 15.51, + "tokens/trainable": 6021741 + }, + { + "epoch": 1.1357575757575757, + "grad_norm": 0.032420564442873, + "learning_rate": 0.000190355460157846, + "loss": 0.00031497194431722163, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00032, + "step": 1170, + "tokens/total": 19181568, + "tokens/train_per_sec_per_gpu": 16.05, + "tokens/trainable": 6074092 + }, + { + "epoch": 1.1454545454545455, + "grad_norm": 0.03688061609864235, + "learning_rate": 0.00019006328627471132, + "loss": 0.0003225028282031417, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00032, + "step": 1180, + "tokens/total": 19345408, + "tokens/train_per_sec_per_gpu": 14.1, + "tokens/trainable": 6126315 + }, + { + "epoch": 1.1551515151515153, + "grad_norm": 0.03359396383166313, + "learning_rate": 0.00018976698372401716, + "loss": 0.0004557626787573099, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00046, + "step": 1190, + "tokens/total": 19509248, + "tokens/train_per_sec_per_gpu": 14.6, + "tokens/trainable": 6178392 + }, + { + "epoch": 1.1648484848484848, + "grad_norm": 0.020522581413388252, + "learning_rate": 0.0001894665660888202, + "loss": 0.0006435967981815339, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00064, + "step": 1200, + "tokens/total": 19673088, + "tokens/train_per_sec_per_gpu": 15.47, + "tokens/trainable": 6230984 + }, + { + "epoch": 1.1745454545454546, + "grad_norm": 0.0025893959682434797, + "learning_rate": 0.00018916204714082034, + "loss": 0.0005178887862712145, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00052, + "step": 1210, + "tokens/total": 19836928, + "tokens/train_per_sec_per_gpu": 14.13, + "tokens/trainable": 6282713 + }, + { + "epoch": 1.1842424242424243, + "grad_norm": 0.017288153991103172, + "learning_rate": 0.00018885344083972914, + "loss": 0.0005050559528172016, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00051, + "step": 1220, + "tokens/total": 20000768, + "tokens/train_per_sec_per_gpu": 14.31, + "tokens/trainable": 6334555 + }, + { + "epoch": 1.1939393939393939, + "grad_norm": 0.00206086877733469, + "learning_rate": 0.00018854076133263003, + "loss": 0.00020185327157378196, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0002, + "step": 1230, + "tokens/total": 20164608, + "tokens/train_per_sec_per_gpu": 14.72, + "tokens/trainable": 6386137 + }, + { + "epoch": 1.2036363636363636, + "grad_norm": 0.02184407040476799, + "learning_rate": 0.0001882240229533297, + "loss": 0.00048260441981256007, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00048, + "step": 1240, + "tokens/total": 20328448, + "tokens/train_per_sec_per_gpu": 14.35, + "tokens/trainable": 6437493 + }, + { + "epoch": 1.2133333333333334, + "grad_norm": 0.04215926304459572, + "learning_rate": 0.00018790324022170118, + "loss": 0.0003190681803971529, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00032, + "step": 1250, + "tokens/total": 20492288, + "tokens/train_per_sec_per_gpu": 14.51, + "tokens/trainable": 6488834 + }, + { + "epoch": 1.2230303030303031, + "grad_norm": 0.006890668533742428, + "learning_rate": 0.00018757842784301784, + "loss": 0.0005027144681662322, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0005, + "step": 1260, + "tokens/total": 20656128, + "tokens/train_per_sec_per_gpu": 14.26, + "tokens/trainable": 6540606 + }, + { + "epoch": 1.2327272727272727, + "grad_norm": 0.005489532835781574, + "learning_rate": 0.00018724960070727972, + "loss": 0.0006080259568989277, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00061, + "step": 1270, + "tokens/total": 20819968, + "tokens/train_per_sec_per_gpu": 13.92, + "tokens/trainable": 6592727 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.005877023097127676, + "learning_rate": 0.00018691677388853068, + "loss": 0.0006749071180820465, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00068, + "step": 1280, + "tokens/total": 20983808, + "tokens/train_per_sec_per_gpu": 14.93, + "tokens/trainable": 6645179 + }, + { + "epoch": 1.2521212121212122, + "grad_norm": 0.0061390516348183155, + "learning_rate": 0.00018657996264416745, + "loss": 0.0002642946550622582, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00026, + "step": 1290, + "tokens/total": 21147648, + "tokens/train_per_sec_per_gpu": 14.92, + "tokens/trainable": 6697406 + }, + { + "epoch": 1.2618181818181817, + "grad_norm": 0.03444842994213104, + "learning_rate": 0.0001862391824142402, + "loss": 0.0004464905709028244, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00045, + "step": 1300, + "tokens/total": 21311488, + "tokens/train_per_sec_per_gpu": 15.07, + "tokens/trainable": 6749589 + }, + { + "epoch": 1.2715151515151515, + "grad_norm": 0.0036635284777730703, + "learning_rate": 0.00018589444882074474, + "loss": 0.0002096141455695033, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00021, + "step": 1310, + "tokens/total": 21475328, + "tokens/train_per_sec_per_gpu": 13.69, + "tokens/trainable": 6801799 + }, + { + "epoch": 1.2812121212121212, + "grad_norm": 0.003200239036232233, + "learning_rate": 0.00018554577766690636, + "loss": 0.00026335257571190595, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00026, + "step": 1320, + "tokens/total": 21639168, + "tokens/train_per_sec_per_gpu": 14.58, + "tokens/trainable": 6854205 + }, + { + "epoch": 1.290909090909091, + "grad_norm": 0.00109296350274235, + "learning_rate": 0.0001851931849364554, + "loss": 0.0003910743165761232, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00039, + "step": 1330, + "tokens/total": 21803008, + "tokens/train_per_sec_per_gpu": 14.96, + "tokens/trainable": 6906145 + }, + { + "epoch": 1.3006060606060605, + "grad_norm": 0.0006913666147738695, + "learning_rate": 0.00018483668679289452, + "loss": 0.0003079640679061413, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00031, + "step": 1340, + "tokens/total": 21966848, + "tokens/train_per_sec_per_gpu": 15.13, + "tokens/trainable": 6957405 + }, + { + "epoch": 1.3103030303030303, + "grad_norm": 0.03036116063594818, + "learning_rate": 0.00018447629957875776, + "loss": 0.0003281526267528534, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00033, + "step": 1350, + "tokens/total": 22130688, + "tokens/train_per_sec_per_gpu": 15.08, + "tokens/trainable": 7009256 + }, + { + "epoch": 1.32, + "grad_norm": 0.012580045498907566, + "learning_rate": 0.00018411203981486134, + "loss": 0.0006514057982712984, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00065, + "step": 1360, + "tokens/total": 22294528, + "tokens/train_per_sec_per_gpu": 14.66, + "tokens/trainable": 7060734 + }, + { + "epoch": 1.3296969696969696, + "grad_norm": 0.00828342791646719, + "learning_rate": 0.00018374392419954628, + "loss": 0.0003020781092345715, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0003, + "step": 1370, + "tokens/total": 22458368, + "tokens/train_per_sec_per_gpu": 15.09, + "tokens/trainable": 7112415 + }, + { + "epoch": 1.3393939393939394, + "grad_norm": 0.09482505917549133, + "learning_rate": 0.00018337196960791302, + "loss": 0.0006797847803682089, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00068, + "step": 1380, + "tokens/total": 22622208, + "tokens/train_per_sec_per_gpu": 15.03, + "tokens/trainable": 7164110 + }, + { + "epoch": 1.3490909090909091, + "grad_norm": 0.04534842446446419, + "learning_rate": 0.00018299619309104773, + "loss": 0.000729580270126462, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00073, + "step": 1390, + "tokens/total": 22786048, + "tokens/train_per_sec_per_gpu": 15.49, + "tokens/trainable": 7215797 + }, + { + "epoch": 1.3587878787878789, + "grad_norm": 0.010737202130258083, + "learning_rate": 0.00018261661187524072, + "loss": 0.0007514740340411663, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00075, + "step": 1400, + "tokens/total": 22949888, + "tokens/train_per_sec_per_gpu": 14.14, + "tokens/trainable": 7267691 + }, + { + "epoch": 1.3684848484848484, + "grad_norm": 0.05600081756711006, + "learning_rate": 0.00018223324336119672, + "loss": 0.001420076284557581, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00142, + "step": 1410, + "tokens/total": 23113728, + "tokens/train_per_sec_per_gpu": 15.3, + "tokens/trainable": 7319876 + }, + { + "epoch": 1.3781818181818182, + "grad_norm": 0.019460471346974373, + "learning_rate": 0.00018184610512323718, + "loss": 0.0022406818345189093, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00224, + "step": 1420, + "tokens/total": 23277568, + "tokens/train_per_sec_per_gpu": 14.38, + "tokens/trainable": 7371762 + }, + { + "epoch": 1.387878787878788, + "grad_norm": 0.03277068957686424, + "learning_rate": 0.00018145521490849477, + "loss": 0.000915923435240984, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00092, + "step": 1430, + "tokens/total": 23441408, + "tokens/train_per_sec_per_gpu": 14.66, + "tokens/trainable": 7423685 + }, + { + "epoch": 1.3975757575757575, + "grad_norm": 0.0156385600566864, + "learning_rate": 0.0001810605906360996, + "loss": 0.000897888746112585, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0009, + "step": 1440, + "tokens/total": 23605248, + "tokens/train_per_sec_per_gpu": 13.99, + "tokens/trainable": 7476266 + }, + { + "epoch": 1.4072727272727272, + "grad_norm": 0.01643913984298706, + "learning_rate": 0.00018066225039635794, + "loss": 0.000922933965921402, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00092, + "step": 1450, + "tokens/total": 23769088, + "tokens/train_per_sec_per_gpu": 14.57, + "tokens/trainable": 7528208 + }, + { + "epoch": 1.416969696969697, + "grad_norm": 0.024322666227817535, + "learning_rate": 0.00018026021244992287, + "loss": 0.0011652217246592045, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00117, + "step": 1460, + "tokens/total": 23932928, + "tokens/train_per_sec_per_gpu": 13.91, + "tokens/trainable": 7580038 + }, + { + "epoch": 1.4266666666666667, + "grad_norm": 0.05165834724903107, + "learning_rate": 0.0001798544952269572, + "loss": 0.0009731135331094265, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00097, + "step": 1470, + "tokens/total": 24096768, + "tokens/train_per_sec_per_gpu": 14.56, + "tokens/trainable": 7631772 + }, + { + "epoch": 1.4363636363636363, + "grad_norm": 0.02529827691614628, + "learning_rate": 0.0001794451173262885, + "loss": 0.0005802253726869822, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00058, + "step": 1480, + "tokens/total": 24260608, + "tokens/train_per_sec_per_gpu": 13.72, + "tokens/trainable": 7683048 + }, + { + "epoch": 1.446060606060606, + "grad_norm": 0.0670745000243187, + "learning_rate": 0.00017903209751455665, + "loss": 0.000642474414780736, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00064, + "step": 1490, + "tokens/total": 24424448, + "tokens/train_per_sec_per_gpu": 14.33, + "tokens/trainable": 7735332 + }, + { + "epoch": 1.4557575757575758, + "grad_norm": 0.02367187850177288, + "learning_rate": 0.00017861545472535348, + "loss": 0.00032834114972501993, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00033, + "step": 1500, + "tokens/total": 24588288, + "tokens/train_per_sec_per_gpu": 16.37, + "tokens/trainable": 7787186 + }, + { + "epoch": 1.4654545454545453, + "grad_norm": 0.011678172275424004, + "learning_rate": 0.00017819520805835475, + "loss": 0.0009690596722066403, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00097, + "step": 1510, + "tokens/total": 24752128, + "tokens/train_per_sec_per_gpu": 13.55, + "tokens/trainable": 7838878 + }, + { + "epoch": 1.475151515151515, + "grad_norm": 0.05298800393939018, + "learning_rate": 0.00017777137677844461, + "loss": 0.0009098535403609276, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00091, + "step": 1520, + "tokens/total": 24915968, + "tokens/train_per_sec_per_gpu": 14.33, + "tokens/trainable": 7890631 + }, + { + "epoch": 1.4848484848484849, + "grad_norm": 0.037918779999017715, + "learning_rate": 0.00017734398031483265, + "loss": 0.0006457697600126266, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00065, + "step": 1530, + "tokens/total": 25079808, + "tokens/train_per_sec_per_gpu": 13.25, + "tokens/trainable": 7942366 + }, + { + "epoch": 1.4945454545454546, + "grad_norm": 0.02729674056172371, + "learning_rate": 0.0001769130382601629, + "loss": 0.0009943137876689434, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00099, + "step": 1540, + "tokens/total": 25243648, + "tokens/train_per_sec_per_gpu": 14.37, + "tokens/trainable": 7994307 + }, + { + "epoch": 1.5023030303030303, + "eval_loss": 0.0006865999894216657, + "eval_ppl": 1.00069, + "eval_runtime": 12.127, + "eval_samples_per_second": 16.492, + "eval_steps_per_second": 8.246, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "step": 1548 + }, + { + "epoch": 1.5042424242424244, + "grad_norm": 0.053267233073711395, + "learning_rate": 0.00017647857036961592, + "loss": 0.0006284893956035375, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00063, + "step": 1550, + "tokens/total": 25407488, + "tokens/train_per_sec_per_gpu": 14.87, + "tokens/trainable": 8046124 + }, + { + "epoch": 1.513939393939394, + "grad_norm": 0.05232734978199005, + "learning_rate": 0.0001760405965600031, + "loss": 0.0005064161494374275, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00051, + "step": 1560, + "tokens/total": 25571328, + "tokens/train_per_sec_per_gpu": 14.39, + "tokens/trainable": 8098367 + }, + { + "epoch": 1.5236363636363637, + "grad_norm": 0.015440079383552074, + "learning_rate": 0.00017559913690885364, + "loss": 0.0004742793273180723, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00047, + "step": 1570, + "tokens/total": 25735168, + "tokens/train_per_sec_per_gpu": 14.19, + "tokens/trainable": 8150005 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.005799058359116316, + "learning_rate": 0.00017515421165349414, + "loss": 0.0005522690713405609, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00055, + "step": 1580, + "tokens/total": 25899008, + "tokens/train_per_sec_per_gpu": 14.94, + "tokens/trainable": 8201985 + }, + { + "epoch": 1.543030303030303, + "grad_norm": 0.025745827704668045, + "learning_rate": 0.00017470584119012094, + "loss": 0.0004415466450154781, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00044, + "step": 1590, + "tokens/total": 26062848, + "tokens/train_per_sec_per_gpu": 14.76, + "tokens/trainable": 8253407 + }, + { + "epoch": 1.5527272727272727, + "grad_norm": 0.006111942231655121, + "learning_rate": 0.00017425404607286508, + "loss": 0.0004033858887851238, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0004, + "step": 1600, + "tokens/total": 26226688, + "tokens/train_per_sec_per_gpu": 13.45, + "tokens/trainable": 8305596 + }, + { + "epoch": 1.5624242424242425, + "grad_norm": 0.01315031573176384, + "learning_rate": 0.00017379884701285, + "loss": 0.0006456051021814346, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00065, + "step": 1610, + "tokens/total": 26390528, + "tokens/train_per_sec_per_gpu": 15.34, + "tokens/trainable": 8357648 + }, + { + "epoch": 1.5721212121212123, + "grad_norm": 0.002383842132985592, + "learning_rate": 0.00017334026487724225, + "loss": 0.00028960562776774167, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00029, + "step": 1620, + "tokens/total": 26554368, + "tokens/train_per_sec_per_gpu": 14.29, + "tokens/trainable": 8410056 + }, + { + "epoch": 1.5818181818181818, + "grad_norm": 0.006294222082942724, + "learning_rate": 0.0001728783206882948, + "loss": 0.00025043871719390156, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00025, + "step": 1630, + "tokens/total": 26718208, + "tokens/train_per_sec_per_gpu": 15.1, + "tokens/trainable": 8461798 + }, + { + "epoch": 1.5915151515151515, + "grad_norm": 8.702854393050075e-05, + "learning_rate": 0.00017241303562238336, + "loss": 0.00012461008736863732, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00012, + "step": 1640, + "tokens/total": 26882048, + "tokens/train_per_sec_per_gpu": 15.61, + "tokens/trainable": 8514035 + }, + { + "epoch": 1.601212121212121, + "grad_norm": 0.07624056935310364, + "learning_rate": 0.00017194443100903558, + "loss": 0.00024855402298271654, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00025, + "step": 1650, + "tokens/total": 27045888, + "tokens/train_per_sec_per_gpu": 14.48, + "tokens/trainable": 8565875 + }, + { + "epoch": 1.6109090909090908, + "grad_norm": 0.02497026138007641, + "learning_rate": 0.00017147252832995337, + "loss": 0.00044286823831498625, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00044, + "step": 1660, + "tokens/total": 27209728, + "tokens/train_per_sec_per_gpu": 14.47, + "tokens/trainable": 8617912 + }, + { + "epoch": 1.6206060606060606, + "grad_norm": 0.0016530955908820033, + "learning_rate": 0.00017099734921802802, + "loss": 0.0003104714211076498, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00031, + "step": 1670, + "tokens/total": 27373568, + "tokens/train_per_sec_per_gpu": 13.53, + "tokens/trainable": 8669875 + }, + { + "epoch": 1.6303030303030304, + "grad_norm": 0.02621961385011673, + "learning_rate": 0.00017051891545634854, + "loss": 0.0004010321106761694, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0004, + "step": 1680, + "tokens/total": 27537408, + "tokens/train_per_sec_per_gpu": 16.09, + "tokens/trainable": 8721709 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.043721288442611694, + "learning_rate": 0.00017003724897720316, + "loss": 0.00042473864741623404, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00042, + "step": 1690, + "tokens/total": 27701248, + "tokens/train_per_sec_per_gpu": 14.84, + "tokens/trainable": 8773762 + }, + { + "epoch": 1.6496969696969697, + "grad_norm": 0.01791808009147644, + "learning_rate": 0.00016955237186107387, + "loss": 0.0003858121577650309, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00039, + "step": 1700, + "tokens/total": 27865088, + "tokens/train_per_sec_per_gpu": 14.87, + "tokens/trainable": 8825435 + }, + { + "epoch": 1.6593939393939394, + "grad_norm": 0.017175329849123955, + "learning_rate": 0.0001690643063356241, + "loss": 0.0003785108681768179, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00038, + "step": 1710, + "tokens/total": 28028928, + "tokens/train_per_sec_per_gpu": 13.63, + "tokens/trainable": 8877227 + }, + { + "epoch": 1.669090909090909, + "grad_norm": 0.03429865464568138, + "learning_rate": 0.0001685730747746799, + "loss": 0.0003128159558400512, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00031, + "step": 1720, + "tokens/total": 28192768, + "tokens/train_per_sec_per_gpu": 13.42, + "tokens/trainable": 8928835 + }, + { + "epoch": 1.6787878787878787, + "grad_norm": 0.008623798377811909, + "learning_rate": 0.0001680786996972043, + "loss": 0.0008884714916348457, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00089, + "step": 1730, + "tokens/total": 28356608, + "tokens/train_per_sec_per_gpu": 14.8, + "tokens/trainable": 8979863 + }, + { + "epoch": 1.6884848484848485, + "grad_norm": 0.007137796841561794, + "learning_rate": 0.00016758120376626488, + "loss": 0.000342932902276516, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00034, + "step": 1740, + "tokens/total": 28520448, + "tokens/train_per_sec_per_gpu": 13.64, + "tokens/trainable": 9031317 + }, + { + "epoch": 1.6981818181818182, + "grad_norm": 0.006754934322088957, + "learning_rate": 0.00016708060978799493, + "loss": 0.00031610706355422735, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00032, + "step": 1750, + "tokens/total": 28684288, + "tokens/train_per_sec_per_gpu": 16.63, + "tokens/trainable": 9082925 + }, + { + "epoch": 1.707878787878788, + "grad_norm": 0.012158721685409546, + "learning_rate": 0.00016657694071054794, + "loss": 0.00039324900135397913, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00039, + "step": 1760, + "tokens/total": 28848128, + "tokens/train_per_sec_per_gpu": 14.31, + "tokens/trainable": 9134535 + }, + { + "epoch": 1.7175757575757575, + "grad_norm": 0.04653792828321457, + "learning_rate": 0.00016607021962304565, + "loss": 0.0003617320442572236, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00036, + "step": 1770, + "tokens/total": 29011968, + "tokens/train_per_sec_per_gpu": 14.01, + "tokens/trainable": 9186666 + }, + { + "epoch": 1.7272727272727273, + "grad_norm": 0.009638557210564613, + "learning_rate": 0.00016556046975451963, + "loss": 0.00031410730443894865, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00031, + "step": 1780, + "tokens/total": 29175808, + "tokens/train_per_sec_per_gpu": 14.23, + "tokens/trainable": 9238529 + }, + { + "epoch": 1.7369696969696968, + "grad_norm": 0.017064686864614487, + "learning_rate": 0.0001650477144728462, + "loss": 0.00043909624218940735, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00044, + "step": 1790, + "tokens/total": 29339648, + "tokens/train_per_sec_per_gpu": 14.08, + "tokens/trainable": 9290289 + }, + { + "epoch": 1.7466666666666666, + "grad_norm": 0.0022802259773015976, + "learning_rate": 0.00016453197728367563, + "loss": 0.00032380607444792986, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00032, + "step": 1800, + "tokens/total": 29503488, + "tokens/train_per_sec_per_gpu": 13.73, + "tokens/trainable": 9341953 + }, + { + "epoch": 1.7563636363636363, + "grad_norm": 0.0036841712426394224, + "learning_rate": 0.00016401328182935417, + "loss": 0.0006712255533784627, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00067, + "step": 1810, + "tokens/total": 29667328, + "tokens/train_per_sec_per_gpu": 16.36, + "tokens/trainable": 9393126 + }, + { + "epoch": 1.766060606060606, + "grad_norm": 0.0006454121321439743, + "learning_rate": 0.0001634916518878404, + "loss": 0.00010477005271241069, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0001, + "step": 1820, + "tokens/total": 29831168, + "tokens/train_per_sec_per_gpu": 14.7, + "tokens/trainable": 9444494 + }, + { + "epoch": 1.7757575757575759, + "grad_norm": 0.035474907606840134, + "learning_rate": 0.00016296711137161535, + "loss": 0.00034273902419954536, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00034, + "step": 1830, + "tokens/total": 29995008, + "tokens/train_per_sec_per_gpu": 14.78, + "tokens/trainable": 9496432 + }, + { + "epoch": 1.7854545454545454, + "grad_norm": 0.0042278701439499855, + "learning_rate": 0.00016243968432658605, + "loss": 0.0004896576981991529, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00049, + "step": 1840, + "tokens/total": 30158848, + "tokens/train_per_sec_per_gpu": 15.01, + "tokens/trainable": 9547913 + }, + { + "epoch": 1.7951515151515152, + "grad_norm": 0.008337569423019886, + "learning_rate": 0.00016190939493098344, + "loss": 0.0003711160738021135, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00037, + "step": 1850, + "tokens/total": 30322688, + "tokens/train_per_sec_per_gpu": 14.24, + "tokens/trainable": 9599023 + }, + { + "epoch": 1.8048484848484847, + "grad_norm": 0.033457424491643906, + "learning_rate": 0.00016137626749425377, + "loss": 0.0005191094242036343, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00052, + "step": 1860, + "tokens/total": 30486528, + "tokens/train_per_sec_per_gpu": 14.35, + "tokens/trainable": 9651048 + }, + { + "epoch": 1.8145454545454545, + "grad_norm": 0.014811063185334206, + "learning_rate": 0.0001608403264559445, + "loss": 0.0002689486602321267, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00027, + "step": 1870, + "tokens/total": 30650368, + "tokens/train_per_sec_per_gpu": 14.52, + "tokens/trainable": 9703354 + }, + { + "epoch": 1.8242424242424242, + "grad_norm": 0.011829032562673092, + "learning_rate": 0.00016030159638458376, + "loss": 0.0003055253764614463, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00031, + "step": 1880, + "tokens/total": 30814208, + "tokens/train_per_sec_per_gpu": 14.05, + "tokens/trainable": 9755371 + }, + { + "epoch": 1.833939393939394, + "grad_norm": 0.003898326540365815, + "learning_rate": 0.00015976010197655397, + "loss": 0.00023026440758258104, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00023, + "step": 1890, + "tokens/total": 30978048, + "tokens/train_per_sec_per_gpu": 13.89, + "tokens/trainable": 9807011 + }, + { + "epoch": 1.8436363636363637, + "grad_norm": 0.00993694830685854, + "learning_rate": 0.00015921586805496004, + "loss": 0.000414779270067811, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00041, + "step": 1900, + "tokens/total": 31141888, + "tokens/train_per_sec_per_gpu": 14.42, + "tokens/trainable": 9859849 + }, + { + "epoch": 1.8533333333333335, + "grad_norm": 0.00715588079765439, + "learning_rate": 0.0001586689195684911, + "loss": 0.0004666011780500412, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00047, + "step": 1910, + "tokens/total": 31305728, + "tokens/train_per_sec_per_gpu": 14.16, + "tokens/trainable": 9911712 + }, + { + "epoch": 1.863030303030303, + "grad_norm": 0.021137356758117676, + "learning_rate": 0.000158119281590277, + "loss": 0.00046254890039563177, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00046, + "step": 1920, + "tokens/total": 31469568, + "tokens/train_per_sec_per_gpu": 14.81, + "tokens/trainable": 9963813 + }, + { + "epoch": 1.8727272727272726, + "grad_norm": 0.0023340010084211826, + "learning_rate": 0.000157566979316739, + "loss": 0.0004919813480228185, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00049, + "step": 1930, + "tokens/total": 31633408, + "tokens/train_per_sec_per_gpu": 15.8, + "tokens/trainable": 10015724 + }, + { + "epoch": 1.8824242424242423, + "grad_norm": 0.01151804905384779, + "learning_rate": 0.00015701203806643433, + "loss": 0.00023937469813972712, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00024, + "step": 1940, + "tokens/total": 31797248, + "tokens/train_per_sec_per_gpu": 14.32, + "tokens/trainable": 10067073 + }, + { + "epoch": 1.892121212121212, + "grad_norm": 0.016535570845007896, + "learning_rate": 0.00015645448327889603, + "loss": 0.00021827330347150563, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00022, + "step": 1950, + "tokens/total": 31961088, + "tokens/train_per_sec_per_gpu": 14.48, + "tokens/trainable": 10119393 + }, + { + "epoch": 1.9018181818181819, + "grad_norm": 0.0034130853600800037, + "learning_rate": 0.00015589434051346634, + "loss": 0.00017861993983387948, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00018, + "step": 1960, + "tokens/total": 32124928, + "tokens/train_per_sec_per_gpu": 14.23, + "tokens/trainable": 10171930 + }, + { + "epoch": 1.9115151515151516, + "grad_norm": 0.02398502826690674, + "learning_rate": 0.0001553316354481253, + "loss": 0.00014141426654532552, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00014, + "step": 1970, + "tokens/total": 32288768, + "tokens/train_per_sec_per_gpu": 15.59, + "tokens/trainable": 10223639 + }, + { + "epoch": 1.9212121212121214, + "grad_norm": 0.0007365989149548113, + "learning_rate": 0.00015476639387831343, + "loss": 0.00011406640987843275, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00011, + "step": 1980, + "tokens/total": 32452608, + "tokens/train_per_sec_per_gpu": 13.45, + "tokens/trainable": 10275019 + }, + { + "epoch": 1.930909090909091, + "grad_norm": 0.028317851945757866, + "learning_rate": 0.00015419864171574944, + "loss": 0.0004076042678207159, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00041, + "step": 1990, + "tokens/total": 32616448, + "tokens/train_per_sec_per_gpu": 14.68, + "tokens/trainable": 10327234 + }, + { + "epoch": 1.9406060606060604, + "grad_norm": 0.0007216805825009942, + "learning_rate": 0.00015362840498724215, + "loss": 0.0002287053968757391, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00023, + "step": 2000, + "tokens/total": 32780288, + "tokens/train_per_sec_per_gpu": 14.77, + "tokens/trainable": 10379906 + }, + { + "epoch": 1.9503030303030302, + "grad_norm": 0.021391045302152634, + "learning_rate": 0.00015305570983349743, + "loss": 0.0006855262909084558, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00069, + "step": 2010, + "tokens/total": 32944128, + "tokens/train_per_sec_per_gpu": 13.75, + "tokens/trainable": 10431864 + }, + { + "epoch": 1.96, + "grad_norm": 0.014411289244890213, + "learning_rate": 0.00015248058250792008, + "loss": 0.00020992583595216274, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00021, + "step": 2020, + "tokens/total": 33107968, + "tokens/train_per_sec_per_gpu": 14.32, + "tokens/trainable": 10483503 + }, + { + "epoch": 1.9696969696969697, + "grad_norm": 0.0019180785166099668, + "learning_rate": 0.00015190304937540993, + "loss": 0.000295165297575295, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0003, + "step": 2030, + "tokens/total": 33271808, + "tokens/train_per_sec_per_gpu": 15.32, + "tokens/trainable": 10534682 + }, + { + "epoch": 1.9793939393939395, + "grad_norm": 0.027906686067581177, + "learning_rate": 0.00015132313691115367, + "loss": 0.00030230602715164423, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0003, + "step": 2040, + "tokens/total": 33435648, + "tokens/train_per_sec_per_gpu": 13.52, + "tokens/trainable": 10586848 + }, + { + "epoch": 1.9890909090909092, + "grad_norm": 0.030775317922234535, + "learning_rate": 0.00015074087169941085, + "loss": 0.00011671001557260752, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00012, + "step": 2050, + "tokens/total": 33599488, + "tokens/train_per_sec_per_gpu": 14.23, + "tokens/trainable": 10638485 + }, + { + "epoch": 1.9987878787878788, + "grad_norm": 0.054577309638261795, + "learning_rate": 0.00015015628043229523, + "loss": 0.0003703285474330187, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00037, + "step": 2060, + "tokens/total": 33763328, + "tokens/train_per_sec_per_gpu": 14.81, + "tokens/trainable": 10689855 + }, + { + "epoch": 2.003878787878788, + "eval_loss": 0.00032737868605181575, + "eval_ppl": 1.00033, + "eval_runtime": 12.1345, + "eval_samples_per_second": 16.482, + "eval_steps_per_second": 8.241, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.73, + "memory/max_allocated (GiB)": 16.73, + "step": 2064 + }, + { + "epoch": 2.0096969696969698, + "grad_norm": 0.02574228309094906, + "learning_rate": 0.00014956938990855139, + "loss": 0.0006258985958993435, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00063, + "step": 2070, + "tokens/total": 33939456, + "tokens/train_per_sec_per_gpu": 15.27, + "tokens/trainable": 10745674 + }, + { + "epoch": 2.0193939393939395, + "grad_norm": 0.0003698334621731192, + "learning_rate": 0.00014898022703232604, + "loss": 0.00025913610588759186, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00026, + "step": 2080, + "tokens/total": 34103296, + "tokens/train_per_sec_per_gpu": 14.61, + "tokens/trainable": 10797792 + }, + { + "epoch": 2.0290909090909093, + "grad_norm": 0.0033025413285940886, + "learning_rate": 0.00014838881881193468, + "loss": 0.0001973774516955018, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0002, + "step": 2090, + "tokens/total": 34267136, + "tokens/train_per_sec_per_gpu": 14.68, + "tokens/trainable": 10849439 + }, + { + "epoch": 2.0387878787878786, + "grad_norm": 0.0001970751181943342, + "learning_rate": 0.00014779519235862365, + "loss": 0.00029088449664413927, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00029, + "step": 2100, + "tokens/total": 34430976, + "tokens/train_per_sec_per_gpu": 14.21, + "tokens/trainable": 10902278 + }, + { + "epoch": 2.0484848484848484, + "grad_norm": 0.0011533941142261028, + "learning_rate": 0.00014719937488532706, + "loss": 0.00021680027712136506, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00022, + "step": 2110, + "tokens/total": 34594816, + "tokens/train_per_sec_per_gpu": 14.83, + "tokens/trainable": 10954337 + }, + { + "epoch": 2.058181818181818, + "grad_norm": 0.0012934933183714747, + "learning_rate": 0.00014660139370541953, + "loss": 0.00015767107252031564, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00016, + "step": 2120, + "tokens/total": 34758656, + "tokens/train_per_sec_per_gpu": 14.22, + "tokens/trainable": 11006253 + }, + { + "epoch": 2.067878787878788, + "grad_norm": 0.00458933599293232, + "learning_rate": 0.00014600127623146388, + "loss": 0.0001101671252399683, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00011, + "step": 2130, + "tokens/total": 34922496, + "tokens/train_per_sec_per_gpu": 14.53, + "tokens/trainable": 11058062 + }, + { + "epoch": 2.0775757575757576, + "grad_norm": 0.0032617889810353518, + "learning_rate": 0.00014539904997395468, + "loss": 0.00019488829420879483, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00019, + "step": 2140, + "tokens/total": 35086336, + "tokens/train_per_sec_per_gpu": 14.33, + "tokens/trainable": 11109942 + }, + { + "epoch": 2.0872727272727274, + "grad_norm": 0.007860329002141953, + "learning_rate": 0.00014479474254005707, + "loss": 9.439463028684258e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00009, + "step": 2150, + "tokens/total": 35250176, + "tokens/train_per_sec_per_gpu": 15.29, + "tokens/trainable": 11161699 + }, + { + "epoch": 2.096969696969697, + "grad_norm": 0.0008931563934311271, + "learning_rate": 0.0001441883816323411, + "loss": 0.00016972824232652783, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00017, + "step": 2160, + "tokens/total": 35414016, + "tokens/train_per_sec_per_gpu": 15.51, + "tokens/trainable": 11213741 + }, + { + "epoch": 2.1066666666666665, + "grad_norm": 0.006945727858692408, + "learning_rate": 0.00014357999504751182, + "loss": 9.466245537623764e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00009, + "step": 2170, + "tokens/total": 35577856, + "tokens/train_per_sec_per_gpu": 14.46, + "tokens/trainable": 11265729 + }, + { + "epoch": 2.1163636363636362, + "grad_norm": 0.009756731800734997, + "learning_rate": 0.0001429696106751352, + "loss": 7.116884225979447e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00007, + "step": 2180, + "tokens/total": 35741696, + "tokens/train_per_sec_per_gpu": 14.9, + "tokens/trainable": 11318089 + }, + { + "epoch": 2.126060606060606, + "grad_norm": 0.003617421491071582, + "learning_rate": 0.00014235725649635933, + "loss": 0.00017703230259940027, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00018, + "step": 2190, + "tokens/total": 35905536, + "tokens/train_per_sec_per_gpu": 16.31, + "tokens/trainable": 11370159 + }, + { + "epoch": 2.1357575757575757, + "grad_norm": 0.0008388167480006814, + "learning_rate": 0.00014174296058263195, + "loss": 0.0002220547990873456, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00022, + "step": 2200, + "tokens/total": 36069376, + "tokens/train_per_sec_per_gpu": 15.31, + "tokens/trainable": 11422568 + }, + { + "epoch": 2.1454545454545455, + "grad_norm": 0.03691717982292175, + "learning_rate": 0.00014112675109441352, + "loss": 0.00018518726574257016, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00019, + "step": 2210, + "tokens/total": 36233216, + "tokens/train_per_sec_per_gpu": 14.81, + "tokens/trainable": 11473971 + }, + { + "epoch": 2.1551515151515153, + "grad_norm": 0.0008130021742545068, + "learning_rate": 0.0001405086562798863, + "loss": 0.0001568903331644833, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00016, + "step": 2220, + "tokens/total": 36397056, + "tokens/train_per_sec_per_gpu": 15.22, + "tokens/trainable": 11526106 + }, + { + "epoch": 2.164848484848485, + "grad_norm": 0.0014426361303776503, + "learning_rate": 0.00013988870447365933, + "loss": 0.00027461207937449215, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00027, + "step": 2230, + "tokens/total": 36560896, + "tokens/train_per_sec_per_gpu": 15.95, + "tokens/trainable": 11578483 + }, + { + "epoch": 2.174545454545455, + "grad_norm": 0.029341408982872963, + "learning_rate": 0.00013926692409546964, + "loss": 0.0003196842735633254, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00032, + "step": 2240, + "tokens/total": 36724736, + "tokens/train_per_sec_per_gpu": 16.15, + "tokens/trainable": 11630965 + }, + { + "epoch": 2.184242424242424, + "grad_norm": 0.00210795970633626, + "learning_rate": 0.00013864334364887943, + "loss": 0.0004162232857197523, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00042, + "step": 2250, + "tokens/total": 36888576, + "tokens/train_per_sec_per_gpu": 15.03, + "tokens/trainable": 11682642 + }, + { + "epoch": 2.193939393939394, + "grad_norm": 0.003121949266642332, + "learning_rate": 0.0001380179917199692, + "loss": 0.00042150220833718776, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00042, + "step": 2260, + "tokens/total": 37052416, + "tokens/train_per_sec_per_gpu": 15.47, + "tokens/trainable": 11734687 + }, + { + "epoch": 2.2036363636363636, + "grad_norm": 0.009584403596818447, + "learning_rate": 0.00013739089697602764, + "loss": 0.0003333257278427482, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00033, + "step": 2270, + "tokens/total": 37216256, + "tokens/train_per_sec_per_gpu": 14.88, + "tokens/trainable": 11786194 + }, + { + "epoch": 2.2133333333333334, + "grad_norm": 0.0031741419807076454, + "learning_rate": 0.00013676208816423724, + "loss": 0.00011245617642998695, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00011, + "step": 2280, + "tokens/total": 37380096, + "tokens/train_per_sec_per_gpu": 14.73, + "tokens/trainable": 11837550 + }, + { + "epoch": 2.223030303030303, + "grad_norm": 0.03865548223257065, + "learning_rate": 0.00013613159411035648, + "loss": 0.00020037838257849216, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0002, + "step": 2290, + "tokens/total": 37543936, + "tokens/train_per_sec_per_gpu": 15.33, + "tokens/trainable": 11889401 + }, + { + "epoch": 2.232727272727273, + "grad_norm": 0.012145821005105972, + "learning_rate": 0.00013549944371739854, + "loss": 0.00011074641952291131, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00011, + "step": 2300, + "tokens/total": 37707776, + "tokens/train_per_sec_per_gpu": 14.18, + "tokens/trainable": 11941616 + }, + { + "epoch": 2.242424242424242, + "grad_norm": 0.0009741581161506474, + "learning_rate": 0.00013486566596430623, + "loss": 0.00024885197635740044, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00025, + "step": 2310, + "tokens/total": 37871616, + "tokens/train_per_sec_per_gpu": 14.86, + "tokens/trainable": 11993896 + }, + { + "epoch": 2.252121212121212, + "grad_norm": 0.011996032670140266, + "learning_rate": 0.00013423028990462344, + "loss": 0.0003463976550847292, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00035, + "step": 2320, + "tokens/total": 38035456, + "tokens/train_per_sec_per_gpu": 14.09, + "tokens/trainable": 12045275 + }, + { + "epoch": 2.2618181818181817, + "grad_norm": 0.021751079708337784, + "learning_rate": 0.0001335933446651636, + "loss": 0.0008397232741117477, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00084, + "step": 2330, + "tokens/total": 38199296, + "tokens/train_per_sec_per_gpu": 15.05, + "tokens/trainable": 12096897 + }, + { + "epoch": 2.2715151515151515, + "grad_norm": 0.02025892771780491, + "learning_rate": 0.00013295485944467405, + "loss": 0.0005815276876091957, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00058, + "step": 2340, + "tokens/total": 38363136, + "tokens/train_per_sec_per_gpu": 14.28, + "tokens/trainable": 12148443 + }, + { + "epoch": 2.2812121212121212, + "grad_norm": 0.028191884979605675, + "learning_rate": 0.0001323148635124978, + "loss": 0.00035780200269073246, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00036, + "step": 2350, + "tokens/total": 38526976, + "tokens/train_per_sec_per_gpu": 14.5, + "tokens/trainable": 12200260 + }, + { + "epoch": 2.290909090909091, + "grad_norm": 0.018472714349627495, + "learning_rate": 0.00013167338620723165, + "loss": 0.0006046999711543322, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0006, + "step": 2360, + "tokens/total": 38690816, + "tokens/train_per_sec_per_gpu": 13.44, + "tokens/trainable": 12252405 + }, + { + "epoch": 2.3006060606060608, + "grad_norm": 0.018522929400205612, + "learning_rate": 0.00013103045693538135, + "loss": 0.000294373813085258, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00029, + "step": 2370, + "tokens/total": 38854656, + "tokens/train_per_sec_per_gpu": 15.42, + "tokens/trainable": 12304241 + }, + { + "epoch": 2.3103030303030305, + "grad_norm": 0.024094371125102043, + "learning_rate": 0.00013038610517001332, + "loss": 0.00027109310030937195, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00027, + "step": 2380, + "tokens/total": 39018496, + "tokens/train_per_sec_per_gpu": 14.73, + "tokens/trainable": 12356446 + }, + { + "epoch": 2.32, + "grad_norm": 0.019156360998749733, + "learning_rate": 0.0001297403604494039, + "loss": 0.00016260554548352957, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00016, + "step": 2390, + "tokens/total": 39182336, + "tokens/train_per_sec_per_gpu": 15.4, + "tokens/trainable": 12408205 + }, + { + "epoch": 2.3296969696969696, + "grad_norm": 0.030154094099998474, + "learning_rate": 0.00012909325237568496, + "loss": 0.0001862394856289029, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00019, + "step": 2400, + "tokens/total": 39346176, + "tokens/train_per_sec_per_gpu": 14.29, + "tokens/trainable": 12460035 + }, + { + "epoch": 2.3393939393939394, + "grad_norm": 0.0018396849045529962, + "learning_rate": 0.00012844481061348708, + "loss": 0.00013985306723043322, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00014, + "step": 2410, + "tokens/total": 39510016, + "tokens/train_per_sec_per_gpu": 14.37, + "tokens/trainable": 12512793 + }, + { + "epoch": 2.349090909090909, + "grad_norm": 0.007293887436389923, + "learning_rate": 0.00012779506488857945, + "loss": 0.0004945728462189436, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00049, + "step": 2420, + "tokens/total": 39673856, + "tokens/train_per_sec_per_gpu": 14.49, + "tokens/trainable": 12564678 + }, + { + "epoch": 2.358787878787879, + "grad_norm": 0.0013043258804827929, + "learning_rate": 0.00012714404498650743, + "loss": 0.0002628775080665946, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00026, + "step": 2430, + "tokens/total": 39837696, + "tokens/train_per_sec_per_gpu": 14.31, + "tokens/trainable": 12616633 + }, + { + "epoch": 2.3684848484848486, + "grad_norm": 0.00601148558780551, + "learning_rate": 0.00012649178075122702, + "loss": 0.0005043975077569484, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0005, + "step": 2440, + "tokens/total": 40001536, + "tokens/train_per_sec_per_gpu": 15.51, + "tokens/trainable": 12669042 + }, + { + "epoch": 2.378181818181818, + "grad_norm": 0.004092884249985218, + "learning_rate": 0.00012583830208373674, + "loss": 0.00020396907348185778, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0002, + "step": 2450, + "tokens/total": 40165376, + "tokens/train_per_sec_per_gpu": 14.54, + "tokens/trainable": 12720635 + }, + { + "epoch": 2.3878787878787877, + "grad_norm": 0.004112009424716234, + "learning_rate": 0.00012518363894070683, + "loss": 0.00010208101011812686, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0001, + "step": 2460, + "tokens/total": 40329216, + "tokens/train_per_sec_per_gpu": 15.1, + "tokens/trainable": 12772667 + }, + { + "epoch": 2.3975757575757575, + "grad_norm": 0.005660182796418667, + "learning_rate": 0.00012452782133310624, + "loss": 0.0001985645852982998, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0002, + "step": 2470, + "tokens/total": 40493056, + "tokens/train_per_sec_per_gpu": 14.85, + "tokens/trainable": 12824689 + }, + { + "epoch": 2.4072727272727272, + "grad_norm": 0.014492900110781193, + "learning_rate": 0.00012387087932482665, + "loss": 0.00014933901838958262, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00015, + "step": 2480, + "tokens/total": 40656896, + "tokens/train_per_sec_per_gpu": 15.28, + "tokens/trainable": 12876411 + }, + { + "epoch": 2.416969696969697, + "grad_norm": 0.0019427158404141665, + "learning_rate": 0.00012321284303130426, + "loss": 7.200292311608792e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00007, + "step": 2490, + "tokens/total": 40820736, + "tokens/train_per_sec_per_gpu": 15.06, + "tokens/trainable": 12928897 + }, + { + "epoch": 2.4266666666666667, + "grad_norm": 0.02509615570306778, + "learning_rate": 0.00012255374261813944, + "loss": 0.00043660206720232966, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00044, + "step": 2500, + "tokens/total": 40984576, + "tokens/train_per_sec_per_gpu": 14.48, + "tokens/trainable": 12980466 + }, + { + "epoch": 2.4363636363636365, + "grad_norm": 0.007349422667175531, + "learning_rate": 0.00012189360829971371, + "loss": 0.0001283957506529987, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00013, + "step": 2510, + "tokens/total": 41148416, + "tokens/train_per_sec_per_gpu": 15.15, + "tokens/trainable": 13032069 + }, + { + "epoch": 2.4460606060606063, + "grad_norm": 0.0029069455340504646, + "learning_rate": 0.00012123247033780476, + "loss": 6.898010615259409e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00007, + "step": 2520, + "tokens/total": 41312256, + "tokens/train_per_sec_per_gpu": 14.23, + "tokens/trainable": 13084418 + }, + { + "epoch": 2.4557575757575756, + "grad_norm": 0.010700283572077751, + "learning_rate": 0.00012057035904019913, + "loss": 0.00011750553967431188, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00012, + "step": 2530, + "tokens/total": 41476096, + "tokens/train_per_sec_per_gpu": 13.79, + "tokens/trainable": 13136606 + }, + { + "epoch": 2.4654545454545453, + "grad_norm": 0.002509322250261903, + "learning_rate": 0.00011990730475930288, + "loss": 0.0003227895824238658, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00032, + "step": 2540, + "tokens/total": 41639936, + "tokens/train_per_sec_per_gpu": 14.24, + "tokens/trainable": 13188322 + }, + { + "epoch": 2.475151515151515, + "grad_norm": 0.009015699848532677, + "learning_rate": 0.00011924333789075013, + "loss": 0.00032298346050083635, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00032, + "step": 2550, + "tokens/total": 41803776, + "tokens/train_per_sec_per_gpu": 14.52, + "tokens/trainable": 13240187 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.3949132561683655, + "learning_rate": 0.00011857848887200973, + "loss": 0.0007695606444031, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00077, + "step": 2560, + "tokens/total": 41967616, + "tokens/train_per_sec_per_gpu": 14.32, + "tokens/trainable": 13291657 + }, + { + "epoch": 2.4945454545454546, + "grad_norm": 0.0052930801175534725, + "learning_rate": 0.00011791278818098994, + "loss": 0.0016795439645648003, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00168, + "step": 2570, + "tokens/total": 42131456, + "tokens/train_per_sec_per_gpu": 14.83, + "tokens/trainable": 13343711 + }, + { + "epoch": 2.5042424242424244, + "grad_norm": 0.0013836952857673168, + "learning_rate": 0.00011724626633464127, + "loss": 0.0001935441978275776, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00019, + "step": 2580, + "tokens/total": 42295296, + "tokens/train_per_sec_per_gpu": 13.84, + "tokens/trainable": 13396092 + }, + { + "epoch": 2.5042424242424244, + "eval_loss": 8.247328514698893e-05, + "eval_ppl": 1.00008, + "eval_runtime": 12.3103, + "eval_samples_per_second": 16.247, + "eval_steps_per_second": 8.123, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 14.2, + "memory/max_allocated (GiB)": 14.2, + "step": 2580 + }, + { + "epoch": 2.5139393939393937, + "grad_norm": 0.003271307796239853, + "learning_rate": 0.00011657895388755742, + "loss": 8.508508908562362e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00009, + "step": 2590, + "tokens/total": 42459136, + "tokens/train_per_sec_per_gpu": 13.47, + "tokens/trainable": 13448209 + }, + { + "epoch": 2.5236363636363635, + "grad_norm": 0.0009727279539220035, + "learning_rate": 0.00011591088143057483, + "loss": 3.968240635003895e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00004, + "step": 2600, + "tokens/total": 42622976, + "tokens/train_per_sec_per_gpu": 13.54, + "tokens/trainable": 13499718 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 0.0010527002159506083, + "learning_rate": 0.00011524207958937001, + "loss": 0.00018399815307930113, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00018, + "step": 2610, + "tokens/total": 42786816, + "tokens/train_per_sec_per_gpu": 15.62, + "tokens/trainable": 13551815 + }, + { + "epoch": 2.543030303030303, + "grad_norm": 0.023774035274982452, + "learning_rate": 0.00011457257902305598, + "loss": 0.0003953744191676378, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0004, + "step": 2620, + "tokens/total": 42950656, + "tokens/train_per_sec_per_gpu": 14.46, + "tokens/trainable": 13603678 + }, + { + "epoch": 2.5527272727272727, + "grad_norm": 0.013190316036343575, + "learning_rate": 0.00011390241042277654, + "loss": 0.0005875382572412491, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00059, + "step": 2630, + "tokens/total": 43114496, + "tokens/train_per_sec_per_gpu": 13.34, + "tokens/trainable": 13655501 + }, + { + "epoch": 2.5624242424242425, + "grad_norm": 0.019383637234568596, + "learning_rate": 0.00011323160451029932, + "loss": 0.0002609423128888011, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00026, + "step": 2640, + "tokens/total": 43278336, + "tokens/train_per_sec_per_gpu": 15.43, + "tokens/trainable": 13707528 + }, + { + "epoch": 2.5721212121212123, + "grad_norm": 0.002637348370626569, + "learning_rate": 0.00011256019203660764, + "loss": 0.0003633877262473106, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00036, + "step": 2650, + "tokens/total": 43442176, + "tokens/train_per_sec_per_gpu": 13.95, + "tokens/trainable": 13758801 + }, + { + "epoch": 2.581818181818182, + "grad_norm": 0.008525248616933823, + "learning_rate": 0.00011188820378049065, + "loss": 0.000345646683126688, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00035, + "step": 2660, + "tokens/total": 43606016, + "tokens/train_per_sec_per_gpu": 16.61, + "tokens/trainable": 13810541 + }, + { + "epoch": 2.5915151515151518, + "grad_norm": 0.003398684086278081, + "learning_rate": 0.00011121567054713244, + "loss": 0.00010743099264800548, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00011, + "step": 2670, + "tokens/total": 43769856, + "tokens/train_per_sec_per_gpu": 13.59, + "tokens/trainable": 13861683 + }, + { + "epoch": 2.601212121212121, + "grad_norm": 0.048622433096170425, + "learning_rate": 0.00011054262316669986, + "loss": 0.0006771612912416458, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00068, + "step": 2680, + "tokens/total": 43933696, + "tokens/train_per_sec_per_gpu": 13.89, + "tokens/trainable": 13913157 + }, + { + "epoch": 2.610909090909091, + "grad_norm": 0.015018350444734097, + "learning_rate": 0.00010986909249292922, + "loss": 0.00019932850264012814, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0002, + "step": 2690, + "tokens/total": 44097536, + "tokens/train_per_sec_per_gpu": 14.55, + "tokens/trainable": 13965160 + }, + { + "epoch": 2.6206060606060606, + "grad_norm": 0.0012435365933924913, + "learning_rate": 0.00010919510940171189, + "loss": 5.868576117791235e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00006, + "step": 2700, + "tokens/total": 44261376, + "tokens/train_per_sec_per_gpu": 14.73, + "tokens/trainable": 14017024 + }, + { + "epoch": 2.6303030303030304, + "grad_norm": 0.0019523982191458344, + "learning_rate": 0.00010852070478967889, + "loss": 0.0001263051643036306, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00013, + "step": 2710, + "tokens/total": 44425216, + "tokens/train_per_sec_per_gpu": 13.44, + "tokens/trainable": 14068784 + }, + { + "epoch": 2.64, + "grad_norm": 0.007224493194371462, + "learning_rate": 0.0001078459095727845, + "loss": 0.0001929138321429491, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00019, + "step": 2720, + "tokens/total": 44589056, + "tokens/train_per_sec_per_gpu": 14.47, + "tokens/trainable": 14120598 + }, + { + "epoch": 2.6496969696969694, + "grad_norm": 0.047363366931676865, + "learning_rate": 0.00010717075468488913, + "loss": 0.00019309332128614187, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00019, + "step": 2730, + "tokens/total": 44752896, + "tokens/train_per_sec_per_gpu": 15.31, + "tokens/trainable": 14172560 + }, + { + "epoch": 2.659393939393939, + "grad_norm": 0.001373408129438758, + "learning_rate": 0.00010649527107634108, + "loss": 9.99198411591351e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0001, + "step": 2740, + "tokens/total": 44916736, + "tokens/train_per_sec_per_gpu": 14.44, + "tokens/trainable": 14223646 + }, + { + "epoch": 2.669090909090909, + "grad_norm": 0.0005223056650720537, + "learning_rate": 0.00010581948971255788, + "loss": 0.0001228376990184188, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00012, + "step": 2750, + "tokens/total": 45080576, + "tokens/train_per_sec_per_gpu": 14.26, + "tokens/trainable": 14275006 + }, + { + "epoch": 2.6787878787878787, + "grad_norm": 0.0011381276417523623, + "learning_rate": 0.00010514344157260673, + "loss": 5.9981108643114565e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00006, + "step": 2760, + "tokens/total": 45244416, + "tokens/train_per_sec_per_gpu": 13.74, + "tokens/trainable": 14327112 + }, + { + "epoch": 2.6884848484848485, + "grad_norm": 0.0028999936766922474, + "learning_rate": 0.00010446715764778423, + "loss": 0.0001589686726219952, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00016, + "step": 2770, + "tokens/total": 45408256, + "tokens/train_per_sec_per_gpu": 15.54, + "tokens/trainable": 14378961 + }, + { + "epoch": 2.6981818181818182, + "grad_norm": 0.0008248479571193457, + "learning_rate": 0.00010379066894019589, + "loss": 0.00013254316290840508, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00013, + "step": 2780, + "tokens/total": 45572096, + "tokens/train_per_sec_per_gpu": 14.41, + "tokens/trainable": 14430803 + }, + { + "epoch": 2.707878787878788, + "grad_norm": 0.00010997291246894747, + "learning_rate": 0.00010311400646133482, + "loss": 0.0001163567416369915, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00012, + "step": 2790, + "tokens/total": 45735936, + "tokens/train_per_sec_per_gpu": 14.58, + "tokens/trainable": 14482749 + }, + { + "epoch": 2.7175757575757578, + "grad_norm": 0.004438972566276789, + "learning_rate": 0.00010243720123066011, + "loss": 0.0008217763155698776, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00082, + "step": 2800, + "tokens/total": 45899776, + "tokens/train_per_sec_per_gpu": 13.59, + "tokens/trainable": 14534668 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 0.0006182301440276206, + "learning_rate": 0.0001017602842741749, + "loss": 0.00021976977586746216, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00022, + "step": 2810, + "tokens/total": 46063616, + "tokens/train_per_sec_per_gpu": 15.1, + "tokens/trainable": 14586586 + }, + { + "epoch": 2.736969696969697, + "grad_norm": 0.003250017762184143, + "learning_rate": 0.000101083286623004, + "loss": 0.00012328216107562184, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00012, + "step": 2820, + "tokens/total": 46227456, + "tokens/train_per_sec_per_gpu": 13.96, + "tokens/trainable": 14638808 + }, + { + "epoch": 2.7466666666666666, + "grad_norm": 0.010098662227392197, + "learning_rate": 0.00010040623931197144, + "loss": 7.462603389285505e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00007, + "step": 2830, + "tokens/total": 46391296, + "tokens/train_per_sec_per_gpu": 13.8, + "tokens/trainable": 14690613 + }, + { + "epoch": 2.7563636363636363, + "grad_norm": 0.002696437295526266, + "learning_rate": 9.972917337817771e-05, + "loss": 4.609748430084437e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00005, + "step": 2840, + "tokens/total": 46555136, + "tokens/train_per_sec_per_gpu": 14.39, + "tokens/trainable": 14742309 + }, + { + "epoch": 2.766060606060606, + "grad_norm": 0.0002640737220644951, + "learning_rate": 9.905211985957706e-05, + "loss": 9.76522103883326e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0001, + "step": 2850, + "tokens/total": 46718976, + "tokens/train_per_sec_per_gpu": 14.89, + "tokens/trainable": 14794633 + }, + { + "epoch": 2.775757575757576, + "grad_norm": 0.0023825804237276316, + "learning_rate": 9.837510979355457e-05, + "loss": 9.005467290990055e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00009, + "step": 2860, + "tokens/total": 46882816, + "tokens/train_per_sec_per_gpu": 13.85, + "tokens/trainable": 14846334 + }, + { + "epoch": 2.785454545454545, + "grad_norm": 0.007719525136053562, + "learning_rate": 9.769817421550335e-05, + "loss": 0.00035368206445127723, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00035, + "step": 2870, + "tokens/total": 47046656, + "tokens/train_per_sec_per_gpu": 14.04, + "tokens/trainable": 14898484 + }, + { + "epoch": 2.795151515151515, + "grad_norm": 0.0010807636426761746, + "learning_rate": 9.702134415740192e-05, + "loss": 9.26341162994504e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00009, + "step": 2880, + "tokens/total": 47210496, + "tokens/train_per_sec_per_gpu": 14.7, + "tokens/trainable": 14950418 + }, + { + "epoch": 2.8048484848484847, + "grad_norm": 0.02270282432436943, + "learning_rate": 9.634465064639153e-05, + "loss": 0.00013720652787014843, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00014, + "step": 2890, + "tokens/total": 47374336, + "tokens/train_per_sec_per_gpu": 13.75, + "tokens/trainable": 15002347 + }, + { + "epoch": 2.8145454545454545, + "grad_norm": 0.05273193120956421, + "learning_rate": 9.56681247033538e-05, + "loss": 0.0002461188472807407, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00025, + "step": 2900, + "tokens/total": 47538176, + "tokens/train_per_sec_per_gpu": 13.76, + "tokens/trainable": 15054325 + }, + { + "epoch": 2.824242424242424, + "grad_norm": 0.021871395409107208, + "learning_rate": 9.499179734148883e-05, + "loss": 9.564256761223078e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0001, + "step": 2910, + "tokens/total": 47702016, + "tokens/train_per_sec_per_gpu": 14.34, + "tokens/trainable": 15105722 + }, + { + "epoch": 2.833939393939394, + "grad_norm": 0.0173841193318367, + "learning_rate": 9.431569956489331e-05, + "loss": 0.00014969281619414687, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00015, + "step": 2920, + "tokens/total": 47865856, + "tokens/train_per_sec_per_gpu": 14.11, + "tokens/trainable": 15157622 + }, + { + "epoch": 2.8436363636363637, + "grad_norm": 0.015775226056575775, + "learning_rate": 9.363986236713933e-05, + "loss": 0.00022732678335160016, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00023, + "step": 2930, + "tokens/total": 48029696, + "tokens/train_per_sec_per_gpu": 13.86, + "tokens/trainable": 15208749 + }, + { + "epoch": 2.8533333333333335, + "grad_norm": 0.0024653058499097824, + "learning_rate": 9.296431672985363e-05, + "loss": 0.0001259389566257596, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00013, + "step": 2940, + "tokens/total": 48193536, + "tokens/train_per_sec_per_gpu": 14.64, + "tokens/trainable": 15260098 + }, + { + "epoch": 2.8630303030303033, + "grad_norm": 0.000619547616224736, + "learning_rate": 9.228909362129722e-05, + "loss": 7.931838044896721e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00008, + "step": 2950, + "tokens/total": 48357376, + "tokens/train_per_sec_per_gpu": 13.87, + "tokens/trainable": 15311590 + }, + { + "epoch": 2.8727272727272726, + "grad_norm": 0.016801398247480392, + "learning_rate": 9.16142239949458e-05, + "loss": 0.00022562453523278236, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00023, + "step": 2960, + "tokens/total": 48521216, + "tokens/train_per_sec_per_gpu": 14.63, + "tokens/trainable": 15363058 + }, + { + "epoch": 2.8824242424242423, + "grad_norm": 0.0022071890998631716, + "learning_rate": 9.093973878807072e-05, + "loss": 0.00012458593118935823, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00012, + "step": 2970, + "tokens/total": 48685056, + "tokens/train_per_sec_per_gpu": 14.62, + "tokens/trainable": 15415260 + }, + { + "epoch": 2.892121212121212, + "grad_norm": 0.004443019162863493, + "learning_rate": 9.026566892032105e-05, + "loss": 0.0001334903878159821, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00013, + "step": 2980, + "tokens/total": 48848896, + "tokens/train_per_sec_per_gpu": 14.95, + "tokens/trainable": 15466621 + }, + { + "epoch": 2.901818181818182, + "grad_norm": 0.0007753843092359602, + "learning_rate": 8.959204529230569e-05, + "loss": 0.00028287877794355156, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00028, + "step": 2990, + "tokens/total": 49012736, + "tokens/train_per_sec_per_gpu": 14.32, + "tokens/trainable": 15517914 + }, + { + "epoch": 2.9115151515151516, + "grad_norm": 0.0011952788336202502, + "learning_rate": 8.891889878417724e-05, + "loss": 0.000494527630507946, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00049, + "step": 3000, + "tokens/total": 49176576, + "tokens/train_per_sec_per_gpu": 14.87, + "tokens/trainable": 15569029 + }, + { + "epoch": 2.9212121212121214, + "grad_norm": 0.0041526807472109795, + "learning_rate": 8.824626025421626e-05, + "loss": 0.00010177484946325422, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0001, + "step": 3010, + "tokens/total": 49340416, + "tokens/train_per_sec_per_gpu": 15.38, + "tokens/trainable": 15620849 + }, + { + "epoch": 2.9309090909090907, + "grad_norm": 0.00011553156218724325, + "learning_rate": 8.757416053741649e-05, + "loss": 0.00010593911865726113, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00011, + "step": 3020, + "tokens/total": 49504256, + "tokens/train_per_sec_per_gpu": 14.3, + "tokens/trainable": 15671996 + }, + { + "epoch": 2.9406060606060604, + "grad_norm": 0.0024511946830898523, + "learning_rate": 8.690263044407168e-05, + "loss": 0.0001637642504647374, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00016, + "step": 3030, + "tokens/total": 49668096, + "tokens/train_per_sec_per_gpu": 14.01, + "tokens/trainable": 15723682 + }, + { + "epoch": 2.95030303030303, + "grad_norm": 0.007446048315614462, + "learning_rate": 8.62317007583628e-05, + "loss": 5.339759518392384e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00005, + "step": 3040, + "tokens/total": 49831936, + "tokens/train_per_sec_per_gpu": 15.18, + "tokens/trainable": 15775385 + }, + { + "epoch": 2.96, + "grad_norm": 0.0077268267050385475, + "learning_rate": 8.556140223694718e-05, + "loss": 0.00031895393040031194, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00032, + "step": 3050, + "tokens/total": 49995776, + "tokens/train_per_sec_per_gpu": 15.09, + "tokens/trainable": 15827301 + }, + { + "epoch": 2.9696969696969697, + "grad_norm": 0.038065724074840546, + "learning_rate": 8.489176560754834e-05, + "loss": 0.00015137892914935948, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00015, + "step": 3060, + "tokens/total": 50159616, + "tokens/train_per_sec_per_gpu": 15.25, + "tokens/trainable": 15879448 + }, + { + "epoch": 2.9793939393939395, + "grad_norm": 0.01792677491903305, + "learning_rate": 8.422282156754741e-05, + "loss": 0.00016337501583620905, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00016, + "step": 3070, + "tokens/total": 50323456, + "tokens/train_per_sec_per_gpu": 14.83, + "tokens/trainable": 15930723 + }, + { + "epoch": 2.9890909090909092, + "grad_norm": 0.03399665653705597, + "learning_rate": 8.355460078257607e-05, + "loss": 0.0003045425517484546, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0003, + "step": 3080, + "tokens/total": 50487296, + "tokens/train_per_sec_per_gpu": 14.33, + "tokens/trainable": 15981910 + }, + { + "epoch": 2.998787878787879, + "grad_norm": 0.029494913294911385, + "learning_rate": 8.288713388511047e-05, + "loss": 0.0003337380010634661, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00033, + "step": 3090, + "tokens/total": 50651136, + "tokens/train_per_sec_per_gpu": 15.43, + "tokens/trainable": 16034665 + }, + { + "epoch": 3.003878787878788, + "eval_loss": 6.105785723775625e-05, + "eval_ppl": 1.00006, + "eval_runtime": 12.297, + "eval_samples_per_second": 16.264, + "eval_steps_per_second": 8.132, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.73, + "memory/max_allocated (GiB)": 16.73, + "step": 3096 + }, + { + "epoch": 3.0077575757575756, + "grad_norm": 0.04519466683268547, + "learning_rate": 8.222045147306733e-05, + "loss": 0.00016488047549501061, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00016, + "step": 3100, + "tokens/total": 50800640, + "tokens/train_per_sec_per_gpu": 15.13, + "tokens/trainable": 16081476 + }, + { + "epoch": 3.0174545454545454, + "grad_norm": 0.00043377449037507176, + "learning_rate": 8.155458410840097e-05, + "loss": 8.846807177178561e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00009, + "step": 3110, + "tokens/total": 50964480, + "tokens/train_per_sec_per_gpu": 14.26, + "tokens/trainable": 16133199 + }, + { + "epoch": 3.027151515151515, + "grad_norm": 0.004262726753950119, + "learning_rate": 8.088956231570255e-05, + "loss": 0.00011739074252545834, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00012, + "step": 3120, + "tokens/total": 51128320, + "tokens/train_per_sec_per_gpu": 15.31, + "tokens/trainable": 16185603 + }, + { + "epoch": 3.036848484848485, + "grad_norm": 0.037849023938179016, + "learning_rate": 8.022541658080062e-05, + "loss": 0.00018082676688209177, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00018, + "step": 3130, + "tokens/total": 51292160, + "tokens/train_per_sec_per_gpu": 15.1, + "tokens/trainable": 16237682 + }, + { + "epoch": 3.0465454545454547, + "grad_norm": 0.037954214960336685, + "learning_rate": 7.956217734936353e-05, + "loss": 8.843120886012912e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00009, + "step": 3140, + "tokens/total": 51456000, + "tokens/train_per_sec_per_gpu": 14.74, + "tokens/trainable": 16290232 + }, + { + "epoch": 3.0562424242424244, + "grad_norm": 0.0013315236428752542, + "learning_rate": 7.889987502550395e-05, + "loss": 7.704548188485205e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00008, + "step": 3150, + "tokens/total": 51619840, + "tokens/train_per_sec_per_gpu": 15.55, + "tokens/trainable": 16342640 + }, + { + "epoch": 3.0659393939393937, + "grad_norm": 0.00012197630712762475, + "learning_rate": 7.823853997038488e-05, + "loss": 8.29990312922746e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00008, + "step": 3160, + "tokens/total": 51783680, + "tokens/train_per_sec_per_gpu": 13.65, + "tokens/trainable": 16393867 + }, + { + "epoch": 3.0756363636363635, + "grad_norm": 0.0018186501692980528, + "learning_rate": 7.757820250082802e-05, + "loss": 0.00017998996190726757, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00018, + "step": 3170, + "tokens/total": 51947520, + "tokens/train_per_sec_per_gpu": 15.16, + "tokens/trainable": 16446250 + }, + { + "epoch": 3.0853333333333333, + "grad_norm": 0.0035118849482387304, + "learning_rate": 7.691889288792389e-05, + "loss": 4.4890533899888396e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00004, + "step": 3180, + "tokens/total": 52111360, + "tokens/train_per_sec_per_gpu": 15.25, + "tokens/trainable": 16498691 + }, + { + "epoch": 3.095030303030303, + "grad_norm": 0.0006739828968420625, + "learning_rate": 7.626064135564404e-05, + "loss": 8.429370354861021e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00008, + "step": 3190, + "tokens/total": 52275200, + "tokens/train_per_sec_per_gpu": 13.79, + "tokens/trainable": 16550603 + }, + { + "epoch": 3.104727272727273, + "grad_norm": 0.0013700309209525585, + "learning_rate": 7.560347807945584e-05, + "loss": 6.57537835650146e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00007, + "step": 3200, + "tokens/total": 52439040, + "tokens/train_per_sec_per_gpu": 15.66, + "tokens/trainable": 16602365 + }, + { + "epoch": 3.1144242424242425, + "grad_norm": 0.0036931221839040518, + "learning_rate": 7.494743318493882e-05, + "loss": 8.03955306764692e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00008, + "step": 3210, + "tokens/total": 52602880, + "tokens/train_per_sec_per_gpu": 14.1, + "tokens/trainable": 16654435 + }, + { + "epoch": 3.1241212121212123, + "grad_norm": 0.00026944788987748325, + "learning_rate": 7.42925367464039e-05, + "loss": 7.618146482855082e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00008, + "step": 3220, + "tokens/total": 52766720, + "tokens/train_per_sec_per_gpu": 15.9, + "tokens/trainable": 16705983 + }, + { + "epoch": 3.1338181818181816, + "grad_norm": 0.0007724022725597024, + "learning_rate": 7.363881878551473e-05, + "loss": 4.647823807317764e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00005, + "step": 3230, + "tokens/total": 52930560, + "tokens/train_per_sec_per_gpu": 14.22, + "tokens/trainable": 16757171 + }, + { + "epoch": 3.1435151515151514, + "grad_norm": 0.004681420046836138, + "learning_rate": 7.298630926991122e-05, + "loss": 0.00011213724501430988, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00011, + "step": 3240, + "tokens/total": 53094400, + "tokens/train_per_sec_per_gpu": 14.88, + "tokens/trainable": 16808844 + }, + { + "epoch": 3.153212121212121, + "grad_norm": 0.00022965454263612628, + "learning_rate": 7.233503811183598e-05, + "loss": 0.00014833036111667753, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00015, + "step": 3250, + "tokens/total": 53258240, + "tokens/train_per_sec_per_gpu": 15.03, + "tokens/trainable": 16861664 + }, + { + "epoch": 3.162909090909091, + "grad_norm": 0.00010181339894188568, + "learning_rate": 7.168503516676302e-05, + "loss": 4.449372354429215e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00004, + "step": 3260, + "tokens/total": 53422080, + "tokens/train_per_sec_per_gpu": 14.57, + "tokens/trainable": 16913224 + }, + { + "epoch": 3.1726060606060607, + "grad_norm": 0.00781994964927435, + "learning_rate": 7.103633023202916e-05, + "loss": 0.0001156279817223549, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00012, + "step": 3270, + "tokens/total": 53585920, + "tokens/train_per_sec_per_gpu": 13.99, + "tokens/trainable": 16965318 + }, + { + "epoch": 3.1823030303030304, + "grad_norm": 0.0002718314644880593, + "learning_rate": 7.038895304546795e-05, + "loss": 4.4661539141088725e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00004, + "step": 3280, + "tokens/total": 53749760, + "tokens/train_per_sec_per_gpu": 14.63, + "tokens/trainable": 17017312 + }, + { + "epoch": 3.192, + "grad_norm": 0.0014515136135742068, + "learning_rate": 6.974293328404653e-05, + "loss": 8.743933867663145e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00009, + "step": 3290, + "tokens/total": 53913600, + "tokens/train_per_sec_per_gpu": 14.89, + "tokens/trainable": 17069196 + }, + { + "epoch": 3.20169696969697, + "grad_norm": 0.00047755855484865606, + "learning_rate": 6.909830056250527e-05, + "loss": 4.3512805132195356e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00004, + "step": 3300, + "tokens/total": 54077440, + "tokens/train_per_sec_per_gpu": 14.0, + "tokens/trainable": 17120996 + }, + { + "epoch": 3.2113939393939392, + "grad_norm": 0.0019338884158059955, + "learning_rate": 6.845508443199989e-05, + "loss": 4.359671729616821e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00004, + "step": 3310, + "tokens/total": 54241280, + "tokens/train_per_sec_per_gpu": 13.79, + "tokens/trainable": 17172896 + }, + { + "epoch": 3.221090909090909, + "grad_norm": 0.0019301094580441713, + "learning_rate": 6.78133143787471e-05, + "loss": 4.4929751311428845e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00004, + "step": 3320, + "tokens/total": 54405120, + "tokens/train_per_sec_per_gpu": 14.59, + "tokens/trainable": 17223944 + }, + { + "epoch": 3.2307878787878788, + "grad_norm": 0.00026344318757764995, + "learning_rate": 6.717301982267264e-05, + "loss": 1.6775081166997553e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00002, + "step": 3330, + "tokens/total": 54568960, + "tokens/train_per_sec_per_gpu": 14.22, + "tokens/trainable": 17276000 + }, + { + "epoch": 3.2404848484848485, + "grad_norm": 0.0025252695195376873, + "learning_rate": 6.653423011606284e-05, + "loss": 2.2526683460455386e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00002, + "step": 3340, + "tokens/total": 54732800, + "tokens/train_per_sec_per_gpu": 14.55, + "tokens/trainable": 17328084 + }, + { + "epoch": 3.2501818181818183, + "grad_norm": 0.00013686020975001156, + "learning_rate": 6.58969745422189e-05, + "loss": 0.00012354745995253326, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00012, + "step": 3350, + "tokens/total": 54896640, + "tokens/train_per_sec_per_gpu": 14.84, + "tokens/trainable": 17380688 + }, + { + "epoch": 3.259878787878788, + "grad_norm": 0.021551014855504036, + "learning_rate": 6.52612823141145e-05, + "loss": 3.4375887480564415e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00003, + "step": 3360, + "tokens/total": 55060480, + "tokens/train_per_sec_per_gpu": 14.5, + "tokens/trainable": 17432020 + }, + { + "epoch": 3.2695757575757574, + "grad_norm": 5.958124529570341e-05, + "learning_rate": 6.462718257305676e-05, + "loss": 4.8505124868825075e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00005, + "step": 3370, + "tokens/total": 55224320, + "tokens/train_per_sec_per_gpu": 13.26, + "tokens/trainable": 17483384 + }, + { + "epoch": 3.279272727272727, + "grad_norm": 0.0010329332435503602, + "learning_rate": 6.399470438735014e-05, + "loss": 4.009721742477268e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00004, + "step": 3380, + "tokens/total": 55388160, + "tokens/train_per_sec_per_gpu": 15.8, + "tokens/trainable": 17535484 + }, + { + "epoch": 3.288969696969697, + "grad_norm": 0.0005560293793678284, + "learning_rate": 6.336387675096406e-05, + "loss": 6.602061912417412e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00007, + "step": 3390, + "tokens/total": 55552000, + "tokens/train_per_sec_per_gpu": 15.29, + "tokens/trainable": 17587056 + }, + { + "epoch": 3.2986666666666666, + "grad_norm": 5.3608902817359194e-05, + "learning_rate": 6.273472858220368e-05, + "loss": 1.2437945406418293e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00001, + "step": 3400, + "tokens/total": 55715840, + "tokens/train_per_sec_per_gpu": 14.49, + "tokens/trainable": 17638416 + }, + { + "epoch": 3.3083636363636364, + "grad_norm": 2.0864759790129028e-05, + "learning_rate": 6.210728872238432e-05, + "loss": 2.474188804626465e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00002, + "step": 3410, + "tokens/total": 55879680, + "tokens/train_per_sec_per_gpu": 15.35, + "tokens/trainable": 17689602 + }, + { + "epoch": 3.318060606060606, + "grad_norm": 7.073156302794814e-05, + "learning_rate": 6.148158593450921e-05, + "loss": 9.975369903258979e-06, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00001, + "step": 3420, + "tokens/total": 56043520, + "tokens/train_per_sec_per_gpu": 13.49, + "tokens/trainable": 17741844 + }, + { + "epoch": 3.327757575757576, + "grad_norm": 4.995657582185231e-05, + "learning_rate": 6.085764890195097e-05, + "loss": 2.0776886958628894e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00002, + "step": 3430, + "tokens/total": 56207360, + "tokens/train_per_sec_per_gpu": 14.21, + "tokens/trainable": 17794512 + }, + { + "epoch": 3.3374545454545457, + "grad_norm": 0.008468235842883587, + "learning_rate": 6.0235506227136804e-05, + "loss": 9.970037499442696e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0001, + "step": 3440, + "tokens/total": 56371200, + "tokens/train_per_sec_per_gpu": 14.21, + "tokens/trainable": 17846384 + }, + { + "epoch": 3.347151515151515, + "grad_norm": 0.005479085259139538, + "learning_rate": 5.961518643023714e-05, + "loss": 3.432526718825102e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00003, + "step": 3450, + "tokens/total": 56535040, + "tokens/train_per_sec_per_gpu": 13.59, + "tokens/trainable": 17898410 + }, + { + "epoch": 3.3568484848484847, + "grad_norm": 0.00012323590635787696, + "learning_rate": 5.899671794785839e-05, + "loss": 3.815153031609953e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00004, + "step": 3460, + "tokens/total": 56698880, + "tokens/train_per_sec_per_gpu": 14.15, + "tokens/trainable": 17950310 + }, + { + "epoch": 3.3665454545454545, + "grad_norm": 0.00029943216941319406, + "learning_rate": 5.8380129131739325e-05, + "loss": 3.6733318120241164e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00004, + "step": 3470, + "tokens/total": 56862720, + "tokens/train_per_sec_per_gpu": 14.24, + "tokens/trainable": 18002086 + }, + { + "epoch": 3.3762424242424243, + "grad_norm": 0.00012203674123156816, + "learning_rate": 5.776544824745117e-05, + "loss": 1.5152255946304649e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00002, + "step": 3480, + "tokens/total": 57026560, + "tokens/train_per_sec_per_gpu": 14.39, + "tokens/trainable": 18054090 + }, + { + "epoch": 3.385939393939394, + "grad_norm": 0.007876560091972351, + "learning_rate": 5.715270347310223e-05, + "loss": 4.16127557400614e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00004, + "step": 3490, + "tokens/total": 57190400, + "tokens/train_per_sec_per_gpu": 14.86, + "tokens/trainable": 18106212 + }, + { + "epoch": 3.395636363636364, + "grad_norm": 0.0004973178147338331, + "learning_rate": 5.6541922898045786e-05, + "loss": 1.3376350398175419e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00001, + "step": 3500, + "tokens/total": 57354240, + "tokens/train_per_sec_per_gpu": 14.6, + "tokens/trainable": 18157584 + }, + { + "epoch": 3.405333333333333, + "grad_norm": 1.9712582798092626e-05, + "learning_rate": 5.5933134521592726e-05, + "loss": 1.0704222950153052e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00001, + "step": 3510, + "tokens/total": 57518080, + "tokens/train_per_sec_per_gpu": 14.99, + "tokens/trainable": 18209912 + }, + { + "epoch": 3.415030303030303, + "grad_norm": 1.0385288987890817e-05, + "learning_rate": 5.532636625172777e-05, + "loss": 2.7085625333711505e-06, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0, + "step": 3520, + "tokens/total": 57681920, + "tokens/train_per_sec_per_gpu": 14.2, + "tokens/trainable": 18262090 + }, + { + "epoch": 3.4247272727272726, + "grad_norm": 0.0004951295559294522, + "learning_rate": 5.4721645903830224e-05, + "loss": 1.903311494970694e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00002, + "step": 3530, + "tokens/total": 57845760, + "tokens/train_per_sec_per_gpu": 14.19, + "tokens/trainable": 18313992 + }, + { + "epoch": 3.4344242424242424, + "grad_norm": 3.555689181666821e-05, + "learning_rate": 5.411900119939895e-05, + "loss": 1.433735596947372e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00001, + "step": 3540, + "tokens/total": 58009600, + "tokens/train_per_sec_per_gpu": 14.24, + "tokens/trainable": 18365168 + }, + { + "epoch": 3.444121212121212, + "grad_norm": 0.0002728050749283284, + "learning_rate": 5.351845976478138e-05, + "loss": 9.391235653311015e-06, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00001, + "step": 3550, + "tokens/total": 58173440, + "tokens/train_per_sec_per_gpu": 15.0, + "tokens/trainable": 18417316 + }, + { + "epoch": 3.453818181818182, + "grad_norm": 0.0004289250646252185, + "learning_rate": 5.292004912990724e-05, + "loss": 3.863255842588842e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00004, + "step": 3560, + "tokens/total": 58337280, + "tokens/train_per_sec_per_gpu": 15.74, + "tokens/trainable": 18469156 + }, + { + "epoch": 3.4635151515151517, + "grad_norm": 0.007286752574145794, + "learning_rate": 5.232379672702639e-05, + "loss": 0.00013780767330899834, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00014, + "step": 3570, + "tokens/total": 58501120, + "tokens/train_per_sec_per_gpu": 15.52, + "tokens/trainable": 18521592 + }, + { + "epoch": 3.4732121212121214, + "grad_norm": 0.020722538232803345, + "learning_rate": 5.172972988945144e-05, + "loss": 0.0003289236687123775, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00033, + "step": 3580, + "tokens/total": 58664960, + "tokens/train_per_sec_per_gpu": 16.14, + "tokens/trainable": 18573716 + }, + { + "epoch": 3.4829090909090907, + "grad_norm": 0.00703927269205451, + "learning_rate": 5.113787585030454e-05, + "loss": 0.0007538365665823221, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00075, + "step": 3590, + "tokens/total": 58828800, + "tokens/train_per_sec_per_gpu": 14.95, + "tokens/trainable": 18626456 + }, + { + "epoch": 3.4926060606060605, + "grad_norm": 0.0010445477673783898, + "learning_rate": 5.054826174126908e-05, + "loss": 0.0002355028875172138, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00024, + "step": 3600, + "tokens/total": 58992640, + "tokens/train_per_sec_per_gpu": 13.99, + "tokens/trainable": 18678168 + }, + { + "epoch": 3.5023030303030303, + "grad_norm": 0.0034678117372095585, + "learning_rate": 4.996091459134603e-05, + "loss": 0.00011191462399438024, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00011, + "step": 3610, + "tokens/total": 59156480, + "tokens/train_per_sec_per_gpu": 14.88, + "tokens/trainable": 18730292 + }, + { + "epoch": 3.5042424242424244, + "eval_loss": 0.00017824456153903157, + "eval_ppl": 1.00018, + "eval_runtime": 12.4516, + "eval_samples_per_second": 16.062, + "eval_steps_per_second": 8.031, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "step": 3612 + }, + { + "epoch": 3.512, + "grad_norm": 0.0009657443733885884, + "learning_rate": 4.9375861325614606e-05, + "loss": 8.825904224067927e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00009, + "step": 3620, + "tokens/total": 59320320, + "tokens/train_per_sec_per_gpu": 13.73, + "tokens/trainable": 18782052 + }, + { + "epoch": 3.5216969696969698, + "grad_norm": 0.00018646326498128474, + "learning_rate": 4.879312876399822e-05, + "loss": 0.00015567413065582515, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00016, + "step": 3630, + "tokens/total": 59484160, + "tokens/train_per_sec_per_gpu": 13.49, + "tokens/trainable": 18833744 + }, + { + "epoch": 3.5313939393939395, + "grad_norm": 0.013982892036437988, + "learning_rate": 4.821274362003497e-05, + "loss": 4.431642882991582e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00004, + "step": 3640, + "tokens/total": 59648000, + "tokens/train_per_sec_per_gpu": 15.38, + "tokens/trainable": 18886264 + }, + { + "epoch": 3.541090909090909, + "grad_norm": 0.00012099265586584806, + "learning_rate": 4.763473249965288e-05, + "loss": 0.00011523300781846046, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00012, + "step": 3650, + "tokens/total": 59811840, + "tokens/train_per_sec_per_gpu": 13.5, + "tokens/trainable": 18938164 + }, + { + "epoch": 3.5507878787878786, + "grad_norm": 0.0007786125643178821, + "learning_rate": 4.7059121899950364e-05, + "loss": 0.00019468108657747508, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00019, + "step": 3660, + "tokens/total": 59975680, + "tokens/train_per_sec_per_gpu": 14.05, + "tokens/trainable": 18989852 + }, + { + "epoch": 3.5604848484848484, + "grad_norm": 0.025333423167467117, + "learning_rate": 4.6485938207981526e-05, + "loss": 9.758002706803382e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0001, + "step": 3670, + "tokens/total": 60139520, + "tokens/train_per_sec_per_gpu": 13.79, + "tokens/trainable": 19041522 + }, + { + "epoch": 3.570181818181818, + "grad_norm": 0.001490296795964241, + "learning_rate": 4.5915207699546625e-05, + "loss": 3.943985793739557e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00004, + "step": 3680, + "tokens/total": 60303360, + "tokens/train_per_sec_per_gpu": 14.86, + "tokens/trainable": 19093162 + }, + { + "epoch": 3.579878787878788, + "grad_norm": 0.0007833060226403177, + "learning_rate": 4.5346956537987316e-05, + "loss": 1.0386841313447804e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00001, + "step": 3690, + "tokens/total": 60467200, + "tokens/train_per_sec_per_gpu": 14.54, + "tokens/trainable": 19145046 + }, + { + "epoch": 3.5895757575757576, + "grad_norm": 0.00012790538312401623, + "learning_rate": 4.4781210772987514e-05, + "loss": 3.011857916135341e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00003, + "step": 3700, + "tokens/total": 60631040, + "tokens/train_per_sec_per_gpu": 15.42, + "tokens/trainable": 19196986 + }, + { + "epoch": 3.5992727272727274, + "grad_norm": 0.0004471437423489988, + "learning_rate": 4.4217996339379165e-05, + "loss": 8.84202163433656e-06, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00001, + "step": 3710, + "tokens/total": 60794880, + "tokens/train_per_sec_per_gpu": 14.14, + "tokens/trainable": 19248428 + }, + { + "epoch": 3.608969696969697, + "grad_norm": 0.0009804401779547334, + "learning_rate": 4.365733905595305e-05, + "loss": 4.249412741046399e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00004, + "step": 3720, + "tokens/total": 60958720, + "tokens/train_per_sec_per_gpu": 12.95, + "tokens/trainable": 19300280 + }, + { + "epoch": 3.618666666666667, + "grad_norm": 0.00024300174845848233, + "learning_rate": 4.309926462427577e-05, + "loss": 5.587518680840731e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00006, + "step": 3730, + "tokens/total": 61122560, + "tokens/train_per_sec_per_gpu": 14.31, + "tokens/trainable": 19352472 + }, + { + "epoch": 3.6283636363636362, + "grad_norm": 0.010901122353971004, + "learning_rate": 4.2543798627511e-05, + "loss": 2.8631213353946805e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00003, + "step": 3740, + "tokens/total": 61286400, + "tokens/train_per_sec_per_gpu": 14.3, + "tokens/trainable": 19404152 + }, + { + "epoch": 3.638060606060606, + "grad_norm": 0.0031345924362540245, + "learning_rate": 4.199096652924707e-05, + "loss": 1.8156676378566772e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00002, + "step": 3750, + "tokens/total": 61450240, + "tokens/train_per_sec_per_gpu": 15.25, + "tokens/trainable": 19455468 + }, + { + "epoch": 3.6477575757575758, + "grad_norm": 0.0003150990523863584, + "learning_rate": 4.144079367232953e-05, + "loss": 1.3762562593910843e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00001, + "step": 3760, + "tokens/total": 61614080, + "tokens/train_per_sec_per_gpu": 13.11, + "tokens/trainable": 19506824 + }, + { + "epoch": 3.6574545454545455, + "grad_norm": 0.00012636656174436212, + "learning_rate": 4.089330527769928e-05, + "loss": 8.082169370027258e-06, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00001, + "step": 3770, + "tokens/total": 61777920, + "tokens/train_per_sec_per_gpu": 15.74, + "tokens/trainable": 19558428 + }, + { + "epoch": 3.6671515151515153, + "grad_norm": 5.4544212616747245e-05, + "learning_rate": 4.034852644323661e-05, + "loss": 8.598688145866617e-06, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00001, + "step": 3780, + "tokens/total": 61941760, + "tokens/train_per_sec_per_gpu": 14.98, + "tokens/trainable": 19610278 + }, + { + "epoch": 3.6768484848484846, + "grad_norm": 0.0006127552478574216, + "learning_rate": 3.980648214261047e-05, + "loss": 9.496800485067069e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00009, + "step": 3790, + "tokens/total": 62105600, + "tokens/train_per_sec_per_gpu": 14.16, + "tokens/trainable": 19662032 + }, + { + "epoch": 3.6865454545454543, + "grad_norm": 0.00015887692279648036, + "learning_rate": 3.926719722413382e-05, + "loss": 7.704031304456294e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00008, + "step": 3800, + "tokens/total": 62269440, + "tokens/train_per_sec_per_gpu": 14.29, + "tokens/trainable": 19713576 + }, + { + "epoch": 3.696242424242424, + "grad_norm": 0.001004945719614625, + "learning_rate": 3.8730696409624436e-05, + "loss": 9.208643314195797e-06, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00001, + "step": 3810, + "tokens/total": 62433280, + "tokens/train_per_sec_per_gpu": 14.5, + "tokens/trainable": 19766120 + }, + { + "epoch": 3.705939393939394, + "grad_norm": 0.001090078498236835, + "learning_rate": 3.81970042932715e-05, + "loss": 2.173678658436984e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00002, + "step": 3820, + "tokens/total": 62597120, + "tokens/train_per_sec_per_gpu": 16.34, + "tokens/trainable": 19818266 + }, + { + "epoch": 3.7156363636363636, + "grad_norm": 0.010169378481805325, + "learning_rate": 3.766614534050845e-05, + "loss": 0.00013178124791011215, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00013, + "step": 3830, + "tokens/total": 62760960, + "tokens/train_per_sec_per_gpu": 15.74, + "tokens/trainable": 19870036 + }, + { + "epoch": 3.7253333333333334, + "grad_norm": 0.0007077799527905881, + "learning_rate": 3.713814388689113e-05, + "loss": 0.00010515267495065927, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00011, + "step": 3840, + "tokens/total": 62924800, + "tokens/train_per_sec_per_gpu": 14.39, + "tokens/trainable": 19921268 + }, + { + "epoch": 3.735030303030303, + "grad_norm": 0.0038877541664987803, + "learning_rate": 3.661302413698239e-05, + "loss": 6.733826594427229e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00007, + "step": 3850, + "tokens/total": 63088640, + "tokens/train_per_sec_per_gpu": 13.4, + "tokens/trainable": 19973524 + }, + { + "epoch": 3.744727272727273, + "grad_norm": 0.002190250437706709, + "learning_rate": 3.609081016324243e-05, + "loss": 9.266241977456958e-06, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00001, + "step": 3860, + "tokens/total": 63252480, + "tokens/train_per_sec_per_gpu": 15.06, + "tokens/trainable": 20025408 + }, + { + "epoch": 3.7544242424242427, + "grad_norm": 0.00012944771151524037, + "learning_rate": 3.557152590492533e-05, + "loss": 4.890719428658485e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00005, + "step": 3870, + "tokens/total": 63416320, + "tokens/train_per_sec_per_gpu": 14.61, + "tokens/trainable": 20077088 + }, + { + "epoch": 3.764121212121212, + "grad_norm": 2.297574064868968e-05, + "learning_rate": 3.5055195166981645e-05, + "loss": 8.172105299308896e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00008, + "step": 3880, + "tokens/total": 63580160, + "tokens/train_per_sec_per_gpu": 13.28, + "tokens/trainable": 20128520 + }, + { + "epoch": 3.7738181818181817, + "grad_norm": 0.0004974680487066507, + "learning_rate": 3.4541841618966955e-05, + "loss": 2.541634894441813e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00003, + "step": 3890, + "tokens/total": 63744000, + "tokens/train_per_sec_per_gpu": 14.54, + "tokens/trainable": 20180720 + }, + { + "epoch": 3.7835151515151515, + "grad_norm": 0.015625184401869774, + "learning_rate": 3.403148879395711e-05, + "loss": 4.029588599223644e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00004, + "step": 3900, + "tokens/total": 63907840, + "tokens/train_per_sec_per_gpu": 14.4, + "tokens/trainable": 20232340 + }, + { + "epoch": 3.7932121212121213, + "grad_norm": 0.0002272177516715601, + "learning_rate": 3.352416008746916e-05, + "loss": 1.8363635172136127e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00002, + "step": 3910, + "tokens/total": 64071680, + "tokens/train_per_sec_per_gpu": 14.92, + "tokens/trainable": 20283830 + }, + { + "epoch": 3.802909090909091, + "grad_norm": 0.0011279061436653137, + "learning_rate": 3.301987875638897e-05, + "loss": 2.048533642664552e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00002, + "step": 3920, + "tokens/total": 64235520, + "tokens/train_per_sec_per_gpu": 15.78, + "tokens/trainable": 20335268 + }, + { + "epoch": 3.8126060606060603, + "grad_norm": 0.0009211613796651363, + "learning_rate": 3.2518667917905164e-05, + "loss": 1.1079616524511948e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00001, + "step": 3930, + "tokens/total": 64399360, + "tokens/train_per_sec_per_gpu": 14.71, + "tokens/trainable": 20386774 + }, + { + "epoch": 3.82230303030303, + "grad_norm": 1.010528467304539e-05, + "learning_rate": 3.202055054844921e-05, + "loss": 1.9800318113993853e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00002, + "step": 3940, + "tokens/total": 64563200, + "tokens/train_per_sec_per_gpu": 13.35, + "tokens/trainable": 20438552 + }, + { + "epoch": 3.832, + "grad_norm": 0.0012298497604206204, + "learning_rate": 3.1525549482642344e-05, + "loss": 3.893448229064233e-06, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0, + "step": 3950, + "tokens/total": 64727040, + "tokens/train_per_sec_per_gpu": 15.1, + "tokens/trainable": 20490752 + }, + { + "epoch": 3.8416969696969696, + "grad_norm": 7.130308949854225e-05, + "learning_rate": 3.103368741224857e-05, + "loss": 1.607610465725884e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00002, + "step": 3960, + "tokens/total": 64890880, + "tokens/train_per_sec_per_gpu": 15.38, + "tokens/trainable": 20542748 + }, + { + "epoch": 3.8513939393939394, + "grad_norm": 5.47428717254661e-05, + "learning_rate": 3.054498688513464e-05, + "loss": 7.034661393845454e-06, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00001, + "step": 3970, + "tokens/total": 65054720, + "tokens/train_per_sec_per_gpu": 15.39, + "tokens/trainable": 20594764 + }, + { + "epoch": 3.861090909090909, + "grad_norm": 2.4619574105599895e-05, + "learning_rate": 3.0059470304236247e-05, + "loss": 7.135008490877226e-06, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00001, + "step": 3980, + "tokens/total": 65218560, + "tokens/train_per_sec_per_gpu": 13.9, + "tokens/trainable": 20646930 + }, + { + "epoch": 3.870787878787879, + "grad_norm": 0.00030795851489529014, + "learning_rate": 2.9577159926531096e-05, + "loss": 3.880067015415989e-06, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0, + "step": 3990, + "tokens/total": 65382400, + "tokens/train_per_sec_per_gpu": 15.52, + "tokens/trainable": 20699092 + }, + { + "epoch": 3.8804848484848486, + "grad_norm": 0.00010195578215643764, + "learning_rate": 2.909807786201868e-05, + "loss": 3.185079185641371e-06, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0, + "step": 4000, + "tokens/total": 65546240, + "tokens/train_per_sec_per_gpu": 14.32, + "tokens/trainable": 20751248 + }, + { + "epoch": 3.8901818181818184, + "grad_norm": 0.0004780820745509118, + "learning_rate": 2.862224607270656e-05, + "loss": 1.814713868952822e-06, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.0, + "step": 4010, + "tokens/total": 65710080, + "tokens/train_per_sec_per_gpu": 14.15, + "tokens/trainable": 20803052 + }, + { + "epoch": 3.8998787878787877, + "grad_norm": 3.844877937808633e-05, + "learning_rate": 2.8149686371603767e-05, + "loss": 6.231228326214477e-06, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00001, + "step": 4020, + "tokens/total": 65873920, + "tokens/train_per_sec_per_gpu": 14.46, + "tokens/trainable": 20855036 + }, + { + "epoch": 3.9095757575757575, + "grad_norm": 1.545458144391887e-05, + "learning_rate": 2.7680420421720687e-05, + "loss": 1.2459891149774193e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00001, + "step": 4030, + "tokens/total": 66037760, + "tokens/train_per_sec_per_gpu": 13.75, + "tokens/trainable": 20906882 + }, + { + "epoch": 3.9192727272727272, + "grad_norm": 3.0351895475178026e-05, + "learning_rate": 2.7214469735076053e-05, + "loss": 9.68368913163431e-06, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00001, + "step": 4040, + "tokens/total": 66201600, + "tokens/train_per_sec_per_gpu": 15.26, + "tokens/trainable": 20958884 + }, + { + "epoch": 3.928969696969697, + "grad_norm": 2.0362367649795488e-05, + "learning_rate": 2.675185567171088e-05, + "loss": 2.1959710284136236e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00002, + "step": 4050, + "tokens/total": 66365440, + "tokens/train_per_sec_per_gpu": 15.39, + "tokens/trainable": 21010276 + }, + { + "epoch": 3.9386666666666668, + "grad_norm": 0.0004253160150256008, + "learning_rate": 2.6292599438709087e-05, + "loss": 1.3346082414500416e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00001, + "step": 4060, + "tokens/total": 66529280, + "tokens/train_per_sec_per_gpu": 14.47, + "tokens/trainable": 21062258 + }, + { + "epoch": 3.9483636363636365, + "grad_norm": 0.0019986326806247234, + "learning_rate": 2.5836722089225597e-05, + "loss": 6.809899787185714e-06, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00001, + "step": 4070, + "tokens/total": 66693120, + "tokens/train_per_sec_per_gpu": 14.14, + "tokens/trainable": 21114804 + }, + { + "epoch": 3.958060606060606, + "grad_norm": 0.008325994946062565, + "learning_rate": 2.5384244521520883e-05, + "loss": 7.637168164364993e-06, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00001, + "step": 4080, + "tokens/total": 66856960, + "tokens/train_per_sec_per_gpu": 15.0, + "tokens/trainable": 21166612 + }, + { + "epoch": 3.9677575757575756, + "grad_norm": 0.00023120295372791588, + "learning_rate": 2.4935187478003297e-05, + "loss": 6.109364767326042e-06, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00001, + "step": 4090, + "tokens/total": 67020800, + "tokens/train_per_sec_per_gpu": 14.93, + "tokens/trainable": 21218182 + }, + { + "epoch": 3.9774545454545454, + "grad_norm": 0.003937445115298033, + "learning_rate": 2.4489571544277945e-05, + "loss": 6.239359936444089e-06, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00001, + "step": 4100, + "tokens/total": 67184640, + "tokens/train_per_sec_per_gpu": 15.17, + "tokens/trainable": 21269560 + }, + { + "epoch": 3.987151515151515, + "grad_norm": 0.0002558927226345986, + "learning_rate": 2.40474171482031e-05, + "loss": 3.797525132540613e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00004, + "step": 4110, + "tokens/total": 67348480, + "tokens/train_per_sec_per_gpu": 13.23, + "tokens/trainable": 21321712 + }, + { + "epoch": 3.996848484848485, + "grad_norm": 0.0020596531685441732, + "learning_rate": 2.360874455895381e-05, + "loss": 1.2282792886253446e-05, + "memory/device_reserved (GiB)": 20.01, + "memory/max_active (GiB)": 16.23, + "memory/max_allocated (GiB)": 16.23, + "ppl": 1.00001, + "step": 4120, + "tokens/total": 67512320, + "tokens/train_per_sec_per_gpu": 13.91, + "tokens/trainable": 21373422 + } + ], + "logging_steps": 10, + "max_steps": 5155, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 1031, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.500282255807873e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}