{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 8.615384615384615, "eval_steps": 500, "global_step": 60, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15384615384615385, "grad_norm": 4.412626190006247, "learning_rate": 8.333333333333333e-07, "loss": 1.2032, "step": 1 }, { "epoch": 0.3076923076923077, "grad_norm": 4.441970068628214, "learning_rate": 1.6666666666666667e-06, "loss": 1.2132, "step": 2 }, { "epoch": 0.46153846153846156, "grad_norm": 4.581997128609873, "learning_rate": 2.5e-06, "loss": 1.2137, "step": 3 }, { "epoch": 0.6153846153846154, "grad_norm": 4.304844304249193, "learning_rate": 3.3333333333333333e-06, "loss": 1.1822, "step": 4 }, { "epoch": 0.7692307692307693, "grad_norm": 3.8791643255654202, "learning_rate": 4.166666666666667e-06, "loss": 1.2146, "step": 5 }, { "epoch": 0.9230769230769231, "grad_norm": 3.0048985534972137, "learning_rate": 5e-06, "loss": 1.1544, "step": 6 }, { "epoch": 1.0, "grad_norm": 3.0048985534972137, "learning_rate": 4.995770395678171e-06, "loss": 0.5766, "step": 7 }, { "epoch": 1.1538461538461537, "grad_norm": 2.7977743386578084, "learning_rate": 4.983095894354858e-06, "loss": 1.1065, "step": 8 }, { "epoch": 1.3076923076923077, "grad_norm": 2.952566520549763, "learning_rate": 4.962019382530521e-06, "loss": 1.0679, "step": 9 }, { "epoch": 1.4615384615384617, "grad_norm": 2.6181710127008944, "learning_rate": 4.93261217644956e-06, "loss": 1.0155, "step": 10 }, { "epoch": 1.6153846153846154, "grad_norm": 2.559248743534276, "learning_rate": 4.894973780788722e-06, "loss": 1.0063, "step": 11 }, { "epoch": 1.7692307692307692, "grad_norm": 2.022076331439615, "learning_rate": 4.849231551964771e-06, "loss": 0.9441, "step": 12 }, { "epoch": 1.9230769230769231, "grad_norm": 2.4659223307375835, "learning_rate": 4.7955402672006855e-06, "loss": 0.9214, "step": 13 }, { "epoch": 2.0, "grad_norm": 2.3269412931921942, "learning_rate": 4.734081600808531e-06, "loss": 0.4572, "step": 14 }, { "epoch": 2.1538461538461537, "grad_norm": 1.8941456630880233, "learning_rate": 4.665063509461098e-06, "loss": 0.8955, "step": 15 }, { "epoch": 2.3076923076923075, "grad_norm": 1.647028999939705, "learning_rate": 4.588719528532342e-06, "loss": 0.8693, "step": 16 }, { "epoch": 2.4615384615384617, "grad_norm": 1.3563259284626916, "learning_rate": 4.50530798188761e-06, "loss": 0.841, "step": 17 }, { "epoch": 2.6153846153846154, "grad_norm": 1.251613461595753, "learning_rate": 4.415111107797445e-06, "loss": 0.8152, "step": 18 }, { "epoch": 2.769230769230769, "grad_norm": 1.4767406480770795, "learning_rate": 4.318434103932622e-06, "loss": 0.7768, "step": 19 }, { "epoch": 2.9230769230769234, "grad_norm": 1.5508389570529124, "learning_rate": 4.215604094671835e-06, "loss": 0.7846, "step": 20 }, { "epoch": 3.0, "grad_norm": 1.5508389570529124, "learning_rate": 4.106969024216348e-06, "loss": 0.3778, "step": 21 }, { "epoch": 3.1538461538461537, "grad_norm": 1.491365044443786, "learning_rate": 3.992896479256966e-06, "loss": 0.7495, "step": 22 }, { "epoch": 3.3076923076923075, "grad_norm": 1.2365324578204933, "learning_rate": 3.8737724451770155e-06, "loss": 0.7175, "step": 23 }, { "epoch": 3.4615384615384617, "grad_norm": 1.1370658509132563, "learning_rate": 3.7500000000000005e-06, "loss": 0.7281, "step": 24 }, { "epoch": 3.6153846153846154, "grad_norm": 7.279215244130937, "learning_rate": 3.621997950501156e-06, "loss": 0.7187, "step": 25 }, { "epoch": 3.769230769230769, "grad_norm": 1.3074060847565636, "learning_rate": 3.4901994150978926e-06, "loss": 0.7123, "step": 26 }, { "epoch": 3.9230769230769234, "grad_norm": 1.3207588352162958, "learning_rate": 3.3550503583141726e-06, "loss": 0.7025, "step": 27 }, { "epoch": 4.0, "grad_norm": 1.1690670792295927, "learning_rate": 3.217008081777726e-06, "loss": 0.3284, "step": 28 }, { "epoch": 4.153846153846154, "grad_norm": 1.1528609376029573, "learning_rate": 3.0765396768561005e-06, "loss": 0.6758, "step": 29 }, { "epoch": 4.3076923076923075, "grad_norm": 1.4549662024813634, "learning_rate": 2.9341204441673267e-06, "loss": 0.6794, "step": 30 }, { "epoch": 4.461538461538462, "grad_norm": 0.9855903276876635, "learning_rate": 2.7902322853130758e-06, "loss": 0.6446, "step": 31 }, { "epoch": 4.615384615384615, "grad_norm": 0.9679470488380136, "learning_rate": 2.6453620722761897e-06, "loss": 0.67, "step": 32 }, { "epoch": 4.769230769230769, "grad_norm": 1.7352648364409022, "learning_rate": 2.5e-06, "loss": 0.636, "step": 33 }, { "epoch": 4.923076923076923, "grad_norm": 0.9877574746856996, "learning_rate": 2.3546379277238107e-06, "loss": 0.6559, "step": 34 }, { "epoch": 5.0, "grad_norm": 0.9877574746856996, "learning_rate": 2.2097677146869242e-06, "loss": 0.3249, "step": 35 }, { "epoch": 5.153846153846154, "grad_norm": 1.3800661026691763, "learning_rate": 2.0658795558326745e-06, "loss": 0.6344, "step": 36 }, { "epoch": 5.3076923076923075, "grad_norm": 1.383753298521261, "learning_rate": 1.9234603231439e-06, "loss": 0.6334, "step": 37 }, { "epoch": 5.461538461538462, "grad_norm": 1.043144309237299, "learning_rate": 1.7829919182222752e-06, "loss": 0.6159, "step": 38 }, { "epoch": 5.615384615384615, "grad_norm": 0.915191752421227, "learning_rate": 1.6449496416858285e-06, "loss": 0.6194, "step": 39 }, { "epoch": 5.769230769230769, "grad_norm": 0.9290867173246218, "learning_rate": 1.509800584902108e-06, "loss": 0.6215, "step": 40 }, { "epoch": 5.923076923076923, "grad_norm": 0.877279653470722, "learning_rate": 1.3780020494988447e-06, "loss": 0.6164, "step": 41 }, { "epoch": 6.0, "grad_norm": 0.9625404590011445, "learning_rate": 1.2500000000000007e-06, "loss": 0.306, "step": 42 }, { "epoch": 6.153846153846154, "grad_norm": 0.9447763225617137, "learning_rate": 1.1262275548229852e-06, "loss": 0.6098, "step": 43 }, { "epoch": 6.3076923076923075, "grad_norm": 0.9026707226926396, "learning_rate": 1.0071035207430352e-06, "loss": 0.5961, "step": 44 }, { "epoch": 6.461538461538462, "grad_norm": 0.8125670239271661, "learning_rate": 8.930309757836517e-07, "loss": 0.5896, "step": 45 }, { "epoch": 6.615384615384615, "grad_norm": 0.8184346090402826, "learning_rate": 7.843959053281663e-07, "loss": 0.6085, "step": 46 }, { "epoch": 6.769230769230769, "grad_norm": 0.8180073728960267, "learning_rate": 6.815658960673782e-07, "loss": 0.6024, "step": 47 }, { "epoch": 6.923076923076923, "grad_norm": 0.797015085470038, "learning_rate": 5.848888922025553e-07, "loss": 0.5954, "step": 48 }, { "epoch": 7.0, "grad_norm": 0.797015085470038, "learning_rate": 4.946920181123904e-07, "loss": 0.2997, "step": 49 }, { "epoch": 7.153846153846154, "grad_norm": 1.0365029227676532, "learning_rate": 4.1128047146765936e-07, "loss": 0.5838, "step": 50 }, { "epoch": 7.3076923076923075, "grad_norm": 0.8010913969163229, "learning_rate": 3.3493649053890325e-07, "loss": 0.5993, "step": 51 }, { "epoch": 7.461538461538462, "grad_norm": 0.8047853994156616, "learning_rate": 2.6591839919146963e-07, "loss": 0.5908, "step": 52 }, { "epoch": 7.615384615384615, "grad_norm": 0.7895795633351214, "learning_rate": 2.044597327993153e-07, "loss": 0.5756, "step": 53 }, { "epoch": 7.769230769230769, "grad_norm": 0.8891963315989966, "learning_rate": 1.507684480352292e-07, "loss": 0.6084, "step": 54 }, { "epoch": 7.923076923076923, "grad_norm": 0.8233725761566618, "learning_rate": 1.0502621921127776e-07, "loss": 0.5859, "step": 55 }, { "epoch": 8.0, "grad_norm": 0.9193007935270117, "learning_rate": 6.738782355044048e-08, "loss": 0.2892, "step": 56 }, { "epoch": 8.153846153846153, "grad_norm": 0.8092902005307483, "learning_rate": 3.798061746947995e-08, "loss": 0.586, "step": 57 }, { "epoch": 8.307692307692308, "grad_norm": 0.7679863614372926, "learning_rate": 1.6904105645142443e-08, "loss": 0.5827, "step": 58 }, { "epoch": 8.461538461538462, "grad_norm": 1.6098807513131712, "learning_rate": 4.229604321829561e-09, "loss": 0.5837, "step": 59 }, { "epoch": 8.615384615384615, "grad_norm": 0.8372544817321463, "learning_rate": 0.0, "loss": 0.5892, "step": 60 }, { "epoch": 8.615384615384615, "step": 60, "total_flos": 24955762507776.0, "train_loss": 0.0, "train_runtime": 0.003, "train_samples_per_second": 1345694.74, "train_steps_per_second": 19985.565 } ], "logging_steps": 1.0, "max_steps": 60, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 24955762507776.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }