diff --git "a/Pretrain_2B/training_log.txt" "b/Pretrain_2B/training_log.txt" new file mode 100644--- /dev/null +++ "b/Pretrain_2B/training_log.txt" @@ -0,0 +1,15873 @@ +WARNING:torch.distributed.run: +***************************************** +Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. +***************************************** +[2024-12-31 17:08:38,339] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-12-31 17:08:38,339] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-12-31 17:08:38,342] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-12-31 17:08:38,346] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-12-31 17:08:38,346] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-12-31 17:08:38,346] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-12-31 17:08:38,346] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2024-12-31 17:08:38,346] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect) +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it.petrel_client is not installed. If you read data locally instead of from ceph, ignore it. + +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. Using PIL to load images.petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. Using PIL to load images. + +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. Using PIL to load images. +[2024-12-31 17:09:06,656] [INFO] [comm.py:637:init_distributed] cdb=None +[2024-12-31 17:09:06,656] [INFO] [comm.py:637:init_distributed] cdb=None +[2024-12-31 17:09:06,656] [INFO] [comm.py:637:init_distributed] cdb=None +[2024-12-31 17:09:06,656] [INFO] [comm.py:637:init_distributed] cdb=None +[2024-12-31 17:09:06,656] [INFO] [comm.py:637:init_distributed] cdb=None +[2024-12-31 17:09:06,656] [INFO] [comm.py:637:init_distributed] cdb=None +[2024-12-31 17:09:06,656] [INFO] [comm.py:637:init_distributed] cdb=None +[2024-12-31 17:09:06,656] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[2024-12-31 17:09:06,656] [INFO] [comm.py:637:init_distributed] cdb=None +[INFO|training_args.py:1828] 2024-12-31 17:09:06,700 >> PyTorch: setting up devices +[INFO|training_args.py:1828] 2024-12-31 17:09:06,700 >> PyTorch: setting up devices +[INFO|training_args.py:1828] 2024-12-31 17:09:06,700 >> PyTorch: setting up devices +[INFO|training_args.py:1828] 2024-12-31 17:09:06,700 >> PyTorch: setting up devices +[INFO|training_args.py:1828] 2024-12-31 17:09:06,700 >> PyTorch: setting up devices +[INFO|training_args.py:1828] 2024-12-31 17:09:06,702 >> PyTorch: setting up devices +[INFO|training_args.py:1828] 2024-12-31 17:09:06,703 >> PyTorch: setting up devices +[INFO|training_args.py:1828] 2024-12-31 17:09:06,704 >> PyTorch: setting up devices +12/31/2024 17:09:06 - WARNING - __main__ - Process rank: 6, device: cuda:6, n_gpu: 1distributed training: True, 16-bits training: False +12/31/2024 17:09:06 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: False +12/31/2024 17:09:06 - INFO - __main__ - Training/evaluation parameters TrainingArguments( +_n_gpu=1, +adafactor=False, +adam_beta1=0.9, +adam_beta2=0.999, +adam_epsilon=1e-08, +auto_find_batch_size=False, +bf16=True, +bf16_full_eval=False, +data_seed=None, +dataloader_drop_last=False, +dataloader_num_workers=4, +dataloader_persistent_workers=False, +dataloader_pin_memory=True, +ddp_backend=None, +ddp_broadcast_buffers=None, +ddp_bucket_cap_mb=None, +ddp_find_unused_parameters=None, +ddp_timeout=1800, +debug=[], +deepspeed=zero_stage1_config.json, +disable_tqdm=False, +dispatch_batches=None, +do_eval=False, +do_predict=False, +do_train=True, +eval_accumulation_steps=None, +eval_delay=0, +eval_steps=None, +evaluation_strategy=no, +fp16=False, +fp16_backend=auto, +fp16_full_eval=False, +fp16_opt_level=O1, +fsdp=[], +fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False}, +fsdp_min_num_params=0, +fsdp_transformer_layer_cls_to_wrap=None, +full_determinism=False, +gradient_accumulation_steps=16, +gradient_checkpointing=False, +gradient_checkpointing_kwargs=None, +greater_is_better=None, +group_by_length=True, +half_precision_backend=auto, +hub_always_push=False, +hub_model_id=None, +hub_private_repo=False, +hub_strategy=every_save, +hub_token=, +ignore_data_skip=False, +include_inputs_for_metrics=False, +include_num_input_tokens_seen=False, +include_tokens_per_second=False, +jit_mode_eval=False, +label_names=None, +label_smoothing_factor=0.0, +learning_rate=2e-05, +length_column_name=length, +load_best_model_at_end=False, +local_rank=0, +log_level=passive, +log_level_replica=warning, +log_on_each_node=True, +logging_dir=work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/runs/Dec31_17-09-06_de-18913-zczhao-a100-8-0910173806-c56ccbb85-w9xns, +logging_first_step=False, +logging_nan_inf_filter=True, +logging_steps=1.0, +logging_strategy=steps, +lr_scheduler_kwargs={}, +lr_scheduler_type=cosine, +max_grad_norm=1.0, +max_steps=-1, +metric_for_best_model=None, +mp_parameters=, +neftune_noise_alpha=None, +no_cuda=False, +num_train_epochs=1.0, +optim=adamw_torch, +optim_args=None, +output_dir=work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight, +overwrite_output_dir=True, +past_index=-1, +per_device_eval_batch_size=8, +per_device_train_batch_size=2, +prediction_loss_only=False, +push_to_hub=False, +push_to_hub_model_id=None, +push_to_hub_organization=None, +push_to_hub_token=, +ray_scope=last, +remove_unused_columns=True, +report_to=['tensorboard'], +resume_from_checkpoint=None, +run_name=work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight, +save_on_each_node=False, +save_only_model=False, +save_safetensors=True, +save_steps=200, +save_strategy=steps, +save_total_limit=1, +seed=42, +skip_memory_metrics=True, +split_batches=False, +tf32=None, +torch_compile=False, +torch_compile_backend=None, +torch_compile_mode=None, +torchdynamo=None, +tpu_metrics_debug=False, +tpu_num_cores=None, +use_cpu=False, +use_ipex=False, +use_legacy_prediction_loop=False, +use_mps_device=False, +warmup_ratio=0.03, +warmup_steps=0, +weight_decay=0.01, +) +12/31/2024 17:09:06 - INFO - __main__ - Loading Tokenizer: ./weight/InternVL2-2B +12/31/2024 17:09:06 - WARNING - __main__ - Process rank: 4, device: cuda:4, n_gpu: 1distributed training: True, 16-bits training: False +12/31/2024 17:09:06 - WARNING - __main__ - Process rank: 1, device: cuda:1, n_gpu: 1distributed training: True, 16-bits training: False +12/31/2024 17:09:06 - WARNING - __main__ - Process rank: 3, device: cuda:3, n_gpu: 1distributed training: True, 16-bits training: False +12/31/2024 17:09:06 - WARNING - __main__ - Process rank: 5, device: cuda:5, n_gpu: 1distributed training: True, 16-bits training: False +12/31/2024 17:09:06 - WARNING - __main__ - Process rank: 7, device: cuda:7, n_gpu: 1distributed training: True, 16-bits training: False +12/31/2024 17:09:06 - WARNING - __main__ - Process rank: 2, device: cuda:2, n_gpu: 1distributed training: True, 16-bits training: False +[INFO|tokenization_utils_base.py:2025] 2024-12-31 17:09:06,936 >> loading file ./tokenizer.model +[INFO|tokenization_utils_base.py:2025] 2024-12-31 17:09:06,936 >> loading file added_tokens.json +[INFO|tokenization_utils_base.py:2025] 2024-12-31 17:09:06,936 >> loading file special_tokens_map.json +[INFO|tokenization_utils_base.py:2025] 2024-12-31 17:09:06,936 >> loading file tokenizer_config.json +[INFO|tokenization_utils_base.py:2025] 2024-12-31 17:09:06,936 >> loading file tokenizer.json +[WARNING|logging.py:314] 2024-12-31 17:09:07,123 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[WARNING|logging.py:314] 2024-12-31 17:09:07,123 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[WARNING|logging.py:314] 2024-12-31 17:09:07,124 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[WARNING|logging.py:314] 2024-12-31 17:09:07,124 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[WARNING|logging.py:314] 2024-12-31 17:09:07,125 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[WARNING|logging.py:314] 2024-12-31 17:09:07,126 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[WARNING|logging.py:314] 2024-12-31 17:09:07,127 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +[WARNING|logging.py:314] 2024-12-31 17:09:07,129 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. +12/31/2024 17:09:07 - INFO - __main__ - Loading InternVLChatModel... +[INFO|configuration_utils.py:727] 2024-12-31 17:09:07,251 >> loading configuration file ./weight/InternVL2-2B/config.json +[INFO|configuration_utils.py:792] 2024-12-31 17:09:07,252 >> Model config InternVLChatConfig { + "_commit_hash": null, + "architectures": [ + "InternVLChatModel" + ], + "auto_map": { + "AutoConfig": "configuration_internvl_chat.InternVLChatConfig", + "AutoModel": "modeling_internvl_chat.InternVLChatModel", + "AutoModelForCausalLM": "modeling_internvl_chat.InternVLChatModel" + }, + "downsample_ratio": 0.5, + "dynamic_image_size": true, + "force_image_size": 448, + "hidden_size": 2048, + "llm_config": { + "_name_or_path": "internlm/internlm2-chat-1_8b", + "add_cross_attention": false, + "architectures": [ + "InternLM2ForCausalLM" + ], + "attn_implementation": "flash_attention_2", + "auto_map": { + "AutoConfig": "configuration_internlm2.InternLM2Config", + "AutoModel": "modeling_internlm2.InternLM2ForCausalLM", + "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM" + }, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bias": false, + "bos_token_id": 1, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 2, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "silu", + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 32768, + "min_length": 0, + "model_type": "internlm2", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 24, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 2, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 2.0, + "type": "dynamic" + }, + "rope_theta": 1000000, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "transformers_version": "4.37.2", + "typical_p": 1.0, + "use_bfloat16": true, + "use_cache": true, + "vocab_size": 92553 + }, + "max_dynamic_patch": 12, + "min_dynamic_patch": 1, + "model_type": "internvl_chat", + "pad2square": false, + "ps_version": "v2", + "select_layer": -1, + "template": "internlm2-chat", + "tie_word_embeddings": false, + "torch_dtype": "bfloat16", + "transformers_version": null, + "use_backbone_lora": 0, + "use_llm_lora": 0, + "use_thumbnail": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": [ + "InternVisionModel" + ], + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "drop_path_rate": 0.0, + "dropout": 0.0, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu", + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 448, + "initializer_factor": 1.0, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "intern_vit_6b", + "no_repeat_ngram_size": 0, + "norm_type": "layer_norm", + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 24, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "qk_normalization": false, + "qkv_bias": true, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "transformers_version": "4.37.2", + "typical_p": 1.0, + "use_bfloat16": true, + "use_flash_attn": true + } +} + +12/31/2024 17:09:07 - INFO - __main__ - Using flash_attention_2 for InternLM +[INFO|modeling_utils.py:3473] 2024-12-31 17:09:07,256 >> loading weights file ./weight/InternVL2-2B/model.safetensors +[INFO|modeling_utils.py:1426] 2024-12-31 17:09:08,054 >> Instantiating InternVLChatModel model under default dtype torch.bfloat16. +[INFO|configuration_utils.py:826] 2024-12-31 17:09:08,068 >> Generate config GenerationConfig {} + +[INFO|configuration_utils.py:826] 2024-12-31 17:09:08,187 >> Generate config GenerationConfig { + "bos_token_id": 1, + "eos_token_id": 2, + "pad_token_id": 2 +} + +[INFO|modeling_utils.py:4350] 2024-12-31 17:09:22,512 >> All model checkpoint weights were used when initializing InternVLChatModel. + +[INFO|modeling_utils.py:4358] 2024-12-31 17:09:22,540 >> All the weights of InternVLChatModel were initialized from the model checkpoint at ./weight/InternVL2-2B. +If your task is similar to the task the model of the checkpoint was trained on, you can already use InternVLChatModel for predictions without further training. +[INFO|configuration_utils.py:779] 2024-12-31 17:09:22,554 >> loading configuration file ./weight/InternVL2-2B/generation_config.json +[INFO|configuration_utils.py:826] 2024-12-31 17:09:22,555 >> Generate config GenerationConfig { + "eos_token_id": [ + 92542, + 92543 + ] +} + +12/31/2024 17:09:22 - INFO - __main__ - Finished +12/31/2024 17:09:22 - INFO - __main__ - model.config.force_image_size: 448 +12/31/2024 17:09:22 - INFO - __main__ - data_args.force_image_size: 448 +12/31/2024 17:09:22 - INFO - __main__ - model.config.vision_config.image_size: 448 +[INFO|modeling_utils.py:1900] 2024-12-31 17:09:22,836 >> You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 92559. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] num_image_token: 256 +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] dynamic_image_size: False +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] use_thumbnail: True +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] min_dynamic_patch: 1, max_dynamic_patch: 6 +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - Formatting inputs...Skip in lazy mode +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] num_image_token: 256 +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] dynamic_image_size: False +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] use_thumbnail: True +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] min_dynamic_patch: 1, max_dynamic_patch: 6 +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - Formatting inputs...Skip in lazy mode +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] num_image_token: 256 +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] dynamic_image_size: False +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] use_thumbnail: True +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] min_dynamic_patch: 1, max_dynamic_patch: 6 +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - Formatting inputs...Skip in lazy mode +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] num_image_token: 256 +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] dynamic_image_size: False +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] use_thumbnail: True +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] min_dynamic_patch: 1, max_dynamic_patch: 6 +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - Formatting inputs...Skip in lazy mode +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] num_image_token: 256 +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] dynamic_image_size: False +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] use_thumbnail: True +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] min_dynamic_patch: 1, max_dynamic_patch: 6 +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - Formatting inputs...Skip in lazy mode +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] num_image_token: 256 +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] dynamic_image_size: False +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] use_thumbnail: True +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] min_dynamic_patch: 1, max_dynamic_patch: 6 +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - Formatting inputs...Skip in lazy mode +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] num_image_token: 256 +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] dynamic_image_size: False +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] use_thumbnail: True +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] min_dynamic_patch: 1, max_dynamic_patch: 6 +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - Formatting inputs...Skip in lazy mode +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.tok_embeddings.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.output.weight +12/31/2024 17:09:39 - INFO - __main__ - mlp1.0.weight +12/31/2024 17:09:39 - INFO - __main__ - mlp1.0.bias +12/31/2024 17:09:39 - INFO - __main__ - mlp1.1.weight +12/31/2024 17:09:39 - INFO - __main__ - mlp1.1.bias +12/31/2024 17:09:39 - INFO - __main__ - mlp1.3.weight +12/31/2024 17:09:39 - INFO - __main__ - mlp1.3.bias +12/31/2024 17:09:39 - INFO - __main__ - not train params: +12/31/2024 17:09:39 - INFO - __main__ - vision_model.embeddings.class_embedding +12/31/2024 17:09:39 - INFO - __main__ - vision_model.embeddings.position_embedding +12/31/2024 17:09:39 - INFO - __main__ - vision_model.embeddings.patch_embedding.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.embeddings.patch_embedding.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.0.ls1 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.0.ls2 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.0.attn.qkv.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.0.attn.qkv.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.0.attn.proj.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.0.attn.proj.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.0.mlp.fc1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.0.mlp.fc1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.0.mlp.fc2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.0.mlp.fc2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.0.norm1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.0.norm1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.0.norm2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.0.norm2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.1.ls1 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.1.ls2 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.1.attn.qkv.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.1.attn.qkv.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.1.attn.proj.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.1.attn.proj.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.1.mlp.fc1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.1.mlp.fc1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.1.mlp.fc2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.1.mlp.fc2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.1.norm1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.1.norm1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.1.norm2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.1.norm2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.2.ls1 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.2.ls2 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.2.attn.qkv.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.2.attn.qkv.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.2.attn.proj.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.2.attn.proj.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.2.mlp.fc1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.2.mlp.fc1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.2.mlp.fc2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.2.mlp.fc2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.2.norm1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.2.norm1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.2.norm2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.2.norm2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.3.ls1 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.3.ls2 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.3.attn.qkv.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.3.attn.qkv.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.3.attn.proj.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.3.attn.proj.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.3.mlp.fc1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.3.mlp.fc1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.3.mlp.fc2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.3.mlp.fc2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.3.norm1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.3.norm1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.3.norm2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.3.norm2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.4.ls1 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.4.ls2 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.4.attn.qkv.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.4.attn.qkv.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.4.attn.proj.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.4.attn.proj.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.4.mlp.fc1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.4.mlp.fc1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.4.mlp.fc2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.4.mlp.fc2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.4.norm1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.4.norm1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.4.norm2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.4.norm2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.5.ls1 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.5.ls2 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.5.attn.qkv.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.5.attn.qkv.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.5.attn.proj.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.5.attn.proj.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.5.mlp.fc1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.5.mlp.fc1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.5.mlp.fc2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.5.mlp.fc2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.5.norm1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.5.norm1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.5.norm2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.5.norm2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.6.ls1 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.6.ls2 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.6.attn.qkv.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.6.attn.qkv.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.6.attn.proj.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.6.attn.proj.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.6.mlp.fc1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.6.mlp.fc1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.6.mlp.fc2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.6.mlp.fc2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.6.norm1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.6.norm1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.6.norm2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.6.norm2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.7.ls1 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.7.ls2 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.7.attn.qkv.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.7.attn.qkv.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.7.attn.proj.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.7.attn.proj.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.7.mlp.fc1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.7.mlp.fc1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.7.mlp.fc2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.7.mlp.fc2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.7.norm1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.7.norm1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.7.norm2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.7.norm2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.8.ls1 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.8.ls2 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.8.attn.qkv.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.8.attn.qkv.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.8.attn.proj.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.8.attn.proj.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.8.mlp.fc1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.8.mlp.fc1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.8.mlp.fc2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.8.mlp.fc2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.8.norm1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.8.norm1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.8.norm2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.8.norm2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.9.ls1 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.9.ls2 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.9.attn.qkv.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.9.attn.qkv.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.9.attn.proj.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.9.attn.proj.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.9.mlp.fc1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.9.mlp.fc1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.9.mlp.fc2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.9.mlp.fc2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.9.norm1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.9.norm1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.9.norm2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.9.norm2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.10.ls1 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.10.ls2 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.10.attn.qkv.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.10.attn.qkv.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.10.attn.proj.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.10.attn.proj.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.10.mlp.fc1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.10.mlp.fc1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.10.mlp.fc2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.10.mlp.fc2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.10.norm1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.10.norm1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.10.norm2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.10.norm2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.11.ls1 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.11.ls2 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.11.attn.qkv.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.11.attn.qkv.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.11.attn.proj.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.11.attn.proj.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.11.mlp.fc1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.11.mlp.fc1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.11.mlp.fc2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.11.mlp.fc2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.11.norm1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.11.norm1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.11.norm2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.11.norm2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.12.ls1 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.12.ls2 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.12.attn.qkv.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.12.attn.qkv.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.12.attn.proj.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.12.attn.proj.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.12.mlp.fc1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.12.mlp.fc1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.12.mlp.fc2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.12.mlp.fc2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.12.norm1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.12.norm1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.12.norm2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.12.norm2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.13.ls1 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.13.ls2 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.13.attn.qkv.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.13.attn.qkv.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.13.attn.proj.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.13.attn.proj.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.13.mlp.fc1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.13.mlp.fc1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.13.mlp.fc2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.13.mlp.fc2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.13.norm1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.13.norm1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.13.norm2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.13.norm2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.14.ls1 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.14.ls2 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.14.attn.qkv.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.14.attn.qkv.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.14.attn.proj.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.14.attn.proj.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.14.mlp.fc1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.14.mlp.fc1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.14.mlp.fc2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.14.mlp.fc2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.14.norm1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.14.norm1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.14.norm2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.14.norm2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.15.ls1 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.15.ls2 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.15.attn.qkv.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.15.attn.qkv.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.15.attn.proj.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.15.attn.proj.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.15.mlp.fc1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.15.mlp.fc1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.15.mlp.fc2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.15.mlp.fc2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.15.norm1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.15.norm1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.15.norm2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.15.norm2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.16.ls1 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.16.ls2 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.16.attn.qkv.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.16.attn.qkv.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.16.attn.proj.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.16.attn.proj.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.16.mlp.fc1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.16.mlp.fc1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.16.mlp.fc2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.16.mlp.fc2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.16.norm1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.16.norm1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.16.norm2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.16.norm2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.17.ls1 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.17.ls2 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.17.attn.qkv.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.17.attn.qkv.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.17.attn.proj.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.17.attn.proj.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.17.mlp.fc1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.17.mlp.fc1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.17.mlp.fc2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.17.mlp.fc2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.17.norm1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.17.norm1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.17.norm2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.17.norm2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.18.ls1 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.18.ls2 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.18.attn.qkv.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.18.attn.qkv.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.18.attn.proj.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.18.attn.proj.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.18.mlp.fc1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.18.mlp.fc1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.18.mlp.fc2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.18.mlp.fc2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.18.norm1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.18.norm1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.18.norm2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.18.norm2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.19.ls1 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.19.ls2 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.19.attn.qkv.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.19.attn.qkv.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.19.attn.proj.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.19.attn.proj.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.19.mlp.fc1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.19.mlp.fc1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.19.mlp.fc2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.19.mlp.fc2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.19.norm1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.19.norm1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.19.norm2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.19.norm2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.20.ls1 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.20.ls2 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.20.attn.qkv.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.20.attn.qkv.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.20.attn.proj.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.20.attn.proj.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.20.mlp.fc1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.20.mlp.fc1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.20.mlp.fc2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.20.mlp.fc2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.20.norm1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.20.norm1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.20.norm2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.20.norm2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.21.ls1 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.21.ls2 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.21.attn.qkv.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.21.attn.qkv.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.21.attn.proj.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.21.attn.proj.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.21.mlp.fc1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.21.mlp.fc1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.21.mlp.fc2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.21.mlp.fc2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.21.norm1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.21.norm1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.21.norm2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.21.norm2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.22.ls1 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.22.ls2 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.22.attn.qkv.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.22.attn.qkv.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.22.attn.proj.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.22.attn.proj.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.22.mlp.fc1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.22.mlp.fc1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.22.mlp.fc2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.22.mlp.fc2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.22.norm1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.22.norm1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.22.norm2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.22.norm2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.23.ls1 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.23.ls2 +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.23.attn.qkv.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.23.attn.qkv.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.23.attn.proj.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.23.attn.proj.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.23.mlp.fc1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.23.mlp.fc1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.23.mlp.fc2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.23.mlp.fc2.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.23.norm1.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.23.norm1.bias +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.23.norm2.weight +12/31/2024 17:09:39 - INFO - __main__ - vision_model.encoder.layers.23.norm2.bias +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.0.attention.wqkv.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.0.attention.wo.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.0.feed_forward.w1.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.0.feed_forward.w3.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.0.feed_forward.w2.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.0.attention_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.0.ffn_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.1.attention.wqkv.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.1.attention.wo.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.1.feed_forward.w1.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.1.feed_forward.w3.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.1.feed_forward.w2.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.1.attention_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.1.ffn_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.2.attention.wqkv.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.2.attention.wo.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.2.feed_forward.w1.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.2.feed_forward.w3.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.2.feed_forward.w2.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.2.attention_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.2.ffn_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.3.attention.wqkv.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.3.attention.wo.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.3.feed_forward.w1.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.3.feed_forward.w3.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.3.feed_forward.w2.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.3.attention_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.3.ffn_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.4.attention.wqkv.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.4.attention.wo.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.4.feed_forward.w1.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.4.feed_forward.w3.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.4.feed_forward.w2.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.4.attention_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.4.ffn_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.5.attention.wqkv.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.5.attention.wo.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.5.feed_forward.w1.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.5.feed_forward.w3.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.5.feed_forward.w2.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.5.attention_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.5.ffn_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.6.attention.wqkv.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.6.attention.wo.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.6.feed_forward.w1.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.6.feed_forward.w3.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.6.feed_forward.w2.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.6.attention_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.6.ffn_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.7.attention.wqkv.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.7.attention.wo.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.7.feed_forward.w1.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.7.feed_forward.w3.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.7.feed_forward.w2.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.7.attention_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.7.ffn_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.8.attention.wqkv.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.8.attention.wo.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.8.feed_forward.w1.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.8.feed_forward.w3.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.8.feed_forward.w2.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.8.attention_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.8.ffn_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.9.attention.wqkv.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.9.attention.wo.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.9.feed_forward.w1.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.9.feed_forward.w3.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.9.feed_forward.w2.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.9.attention_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.9.ffn_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.10.attention.wqkv.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.10.attention.wo.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.10.feed_forward.w1.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.10.feed_forward.w3.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.10.feed_forward.w2.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.10.attention_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.10.ffn_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.11.attention.wqkv.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.11.attention.wo.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.11.feed_forward.w1.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.11.feed_forward.w3.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.11.feed_forward.w2.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.11.attention_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.11.ffn_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.12.attention.wqkv.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.12.attention.wo.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.12.feed_forward.w1.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.12.feed_forward.w3.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.12.feed_forward.w2.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.12.attention_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.12.ffn_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.13.attention.wqkv.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.13.attention.wo.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.13.feed_forward.w1.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.13.feed_forward.w3.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.13.feed_forward.w2.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.13.attention_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.13.ffn_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.14.attention.wqkv.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.14.attention.wo.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.14.feed_forward.w1.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.14.feed_forward.w3.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.14.feed_forward.w2.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.14.attention_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.14.ffn_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.15.attention.wqkv.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.15.attention.wo.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.15.feed_forward.w1.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.15.feed_forward.w3.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.15.feed_forward.w2.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.15.attention_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.15.ffn_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.16.attention.wqkv.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.16.attention.wo.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.16.feed_forward.w1.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.16.feed_forward.w3.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.16.feed_forward.w2.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.16.attention_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.16.ffn_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.17.attention.wqkv.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.17.attention.wo.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.17.feed_forward.w1.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.17.feed_forward.w3.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.17.feed_forward.w2.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.17.attention_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.17.ffn_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.18.attention.wqkv.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.18.attention.wo.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.18.feed_forward.w1.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.18.feed_forward.w3.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.18.feed_forward.w2.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.18.attention_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.18.ffn_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.19.attention.wqkv.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.19.attention.wo.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.19.feed_forward.w1.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.19.feed_forward.w3.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.19.feed_forward.w2.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.19.attention_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.19.ffn_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.20.attention.wqkv.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.20.attention.wo.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.20.feed_forward.w1.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.20.feed_forward.w3.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.20.feed_forward.w2.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.20.attention_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.20.ffn_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.21.attention.wqkv.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.21.attention.wo.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.21.feed_forward.w1.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.21.feed_forward.w3.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.21.feed_forward.w2.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.21.attention_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.21.ffn_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.22.attention.wqkv.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.22.attention.wo.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.22.feed_forward.w1.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.22.feed_forward.w3.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.22.feed_forward.w2.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.22.attention_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.22.ffn_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.23.attention.wqkv.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.23.attention.wo.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.23.feed_forward.w1.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.23.feed_forward.w3.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.23.feed_forward.w2.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.23.attention_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.layers.23.ffn_norm.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.norm.weight +12/31/2024 17:09:39 - INFO - __main__ - train params (no include lora) +12/31/2024 17:09:39 - INFO - __main__ - language_model.model.tok_embeddings.weight +12/31/2024 17:09:39 - INFO - __main__ - language_model.output.weight +12/31/2024 17:09:39 - INFO - __main__ - mlp1.0.weight +12/31/2024 17:09:39 - INFO - __main__ - mlp1.0.bias +12/31/2024 17:09:39 - INFO - __main__ - mlp1.1.weight +12/31/2024 17:09:39 - INFO - __main__ - mlp1.1.bias +12/31/2024 17:09:39 - INFO - __main__ - mlp1.3.weight +12/31/2024 17:09:39 - INFO - __main__ - mlp1.3.bias +12/31/2024 17:09:39 - INFO - __main__ - trainable params: 391716864 || all params: 2205778944 || trainable%: 17.7587 +12/31/2024 17:09:39 - INFO - __main__ - LLaMA params: 1889171456 || ViT params: 304012288 || Projector params: 12595200 +12/31/2024 17:09:39 - INFO - __main__ - LLM_lora params: 0 || Vision_lora params: 0 || others params: 0 +12/31/2024 17:09:39 - INFO - __main__ - bbox_projector_params: 0 +12/31/2024 17:09:39 - INFO - __main__ - Bev tower params: 0 || Bev train params: 0 +12/31/2024 17:09:39 - INFO - __main__ - det query params: 0 || det query train params: 0 +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] num_image_token: 256 +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] dynamic_image_size: False +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] use_thumbnail: True +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - [Dataset] min_dynamic_patch: 1, max_dynamic_patch: 6 +12/31/2024 17:09:39 - INFO - internvl.train.nuscene_dataset - Formatting inputs...Skip in lazy mode +12/31/2024 17:09:59 - INFO - internvl.train.nuscene_dataset - Add dataset: nuscene_cap_194k with length: 194308 +12/31/2024 17:09:59 - WARNING - accelerate.utils.other - Detected kernel version 5.4.54, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher. +12/31/2024 17:09:59 - INFO - internvl.train.nuscene_dataset - Add dataset: nuscene_cap_194k with length: 194308 +12/31/2024 17:09:59 - INFO - internvl.train.nuscene_dataset - Add dataset: nuscene_cap_194k with length: 194308 +12/31/2024 17:09:59 - INFO - internvl.train.nuscene_dataset - Add dataset: nuscene_cap_194k with length: 194308 +12/31/2024 17:09:59 - INFO - internvl.train.nuscene_dataset - Add dataset: nuscene_cap_194k with length: 194308 +12/31/2024 17:09:59 - INFO - internvl.train.nuscene_dataset - Add dataset: nuscene_cap_194k with length: 194308 +12/31/2024 17:09:59 - INFO - internvl.train.nuscene_dataset - Add dataset: nuscene_cap_194k with length: 194308 +[INFO|trainer.py:571] 2024-12-31 17:10:00,035 >> Using auto half precision backend +12/31/2024 17:10:00 - INFO - internvl.train.nuscene_dataset - Add dataset: nuscene_cap_194k with length: 194308 +[2024-12-31 17:10:00,345] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.13.5, git-hash=unknown, git-branch=unknown +[2024-12-31 17:10:03,571] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[2024-12-31 17:10:03,572] [INFO] [logging.py:96:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adamw as basic optimizer +[2024-12-31 17:10:03,572] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer +[2024-12-31 17:10:03,574] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = AdamW +[2024-12-31 17:10:03,574] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=AdamW type= +[2024-12-31 17:10:03,574] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 1 optimizer +[2024-12-31 17:10:03,574] [INFO] [stage_1_and_2.py:149:__init__] Reduce bucket size 1000000000 +[2024-12-31 17:10:03,574] [INFO] [stage_1_and_2.py:150:__init__] Allgather bucket size 1000000000 +[2024-12-31 17:10:03,574] [INFO] [stage_1_and_2.py:151:__init__] CPU Offload: False +[2024-12-31 17:10:03,574] [INFO] [stage_1_and_2.py:152:__init__] Round robin gradient partitioning: False +[2024-12-31 17:10:04,424] [INFO] [utils.py:800:see_memory_usage] Before initializing optimizer states +[2024-12-31 17:10:04,425] [INFO] [utils.py:801:see_memory_usage] MA 4.67 GB Max_MA 4.76 GB CA 4.94 GB Max_CA 5 GB +[2024-12-31 17:10:04,425] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 222.12 GB, percent = 22.0% +[2024-12-31 17:10:04,679] [INFO] [utils.py:800:see_memory_usage] After initializing optimizer states +[2024-12-31 17:10:04,680] [INFO] [utils.py:801:see_memory_usage] MA 4.67 GB Max_MA 4.85 GB CA 5.12 GB Max_CA 5 GB +[2024-12-31 17:10:04,680] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 222.11 GB, percent = 22.0% +[2024-12-31 17:10:04,680] [INFO] [stage_1_and_2.py:539:__init__] optimizer state initialized +[2024-12-31 17:10:04,932] [INFO] [utils.py:800:see_memory_usage] After initializing ZeRO optimizer +[2024-12-31 17:10:04,933] [INFO] [utils.py:801:see_memory_usage] MA 4.67 GB Max_MA 4.67 GB CA 5.12 GB Max_CA 5 GB +[2024-12-31 17:10:04,933] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory: used = 222.12 GB, percent = 22.0% +[2024-12-31 17:10:04,934] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = adamw +[2024-12-31 17:10:04,934] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client callable to create LR scheduler +[2024-12-31 17:10:04,934] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = +[2024-12-31 17:10:04,934] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0], mom=[[0.9, 0.999]] +[2024-12-31 17:10:04,935] [INFO] [config.py:996:print] DeepSpeedEngine configuration: +[2024-12-31 17:10:04,935] [INFO] [config.py:1000:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2024-12-31 17:10:04,935] [INFO] [config.py:1000:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +[2024-12-31 17:10:04,935] [INFO] [config.py:1000:print] amp_enabled .................. False +[2024-12-31 17:10:04,935] [INFO] [config.py:1000:print] amp_params ................... False +[2024-12-31 17:10:04,935] [INFO] [config.py:1000:print] autotuning_config ............ { + "enabled": false, + "start_step": null, + "end_step": null, + "metric_path": null, + "arg_mappings": null, + "metric": "throughput", + "model_info": null, + "results_dir": "autotuning_results", + "exps_dir": "autotuning_exps", + "overwrite": true, + "fast": true, + "start_profile_step": 3, + "end_profile_step": 5, + "tuner_type": "gridsearch", + "tuner_early_stopping": 5, + "tuner_num_trials": 50, + "model_info_path": null, + "mp_size": 1, + "max_train_batch_size": null, + "min_train_batch_size": 1, + "max_train_micro_batch_size_per_gpu": 1.024000e+03, + "min_train_micro_batch_size_per_gpu": 1, + "num_tuning_micro_batch_sizes": 3 +} +[2024-12-31 17:10:04,935] [INFO] [config.py:1000:print] bfloat16_enabled ............. True +[2024-12-31 17:10:04,935] [INFO] [config.py:1000:print] bfloat16_immediate_grad_update False +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] checkpoint_parallel_write_pipeline False +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] checkpoint_tag_validation_enabled True +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] checkpoint_tag_validation_fail False +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] comms_config ................. +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] communication_data_type ...... None +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] compile_config ............... enabled=False backend='inductor' kwargs={} +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] curriculum_enabled_legacy .... False +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] curriculum_params_legacy ..... False +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] data_efficiency_enabled ...... False +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] dataloader_drop_last ......... False +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] disable_allgather ............ False +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] dump_state ................... False +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] dynamic_loss_scale_args ...... None +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] eigenvalue_enabled ........... False +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] eigenvalue_gas_boundary_resolution 1 +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] eigenvalue_layer_name ........ bert.encoder.layer +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] eigenvalue_layer_num ......... 0 +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] eigenvalue_max_iter .......... 100 +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] eigenvalue_stability ......... 1e-06 +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] eigenvalue_tol ............... 0.01 +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] eigenvalue_verbose ........... False +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] elasticity_enabled ........... False +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] flops_profiler_config ........ { + "enabled": false, + "recompute_fwd_factor": 0.0, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] fp16_auto_cast ............... None +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] fp16_enabled ................. False +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] fp16_master_weights_and_gradients False +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] global_rank .................. 0 +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] grad_accum_dtype ............. None +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] gradient_accumulation_steps .. 16 +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] gradient_clipping ............ 1.0 +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] gradient_predivide_factor .... 1.0 +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] graph_harvesting ............. False +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] initial_dynamic_scale ........ 1 +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] load_universal_checkpoint .... False +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] loss_scale ................... 1.0 +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] memory_breakdown ............. False +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] mics_hierarchial_params_gather False +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] mics_shard_size .............. -1 +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] nebula_config ................ { + "enabled": false, + "persistent_storage_path": null, + "persistent_time_interval": 100, + "num_of_version_in_retention": 2, + "enable_nebula_load": true, + "load_path": null +} +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] optimizer_legacy_fusion ...... False +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] optimizer_name ............... adamw +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] optimizer_params ............. {'lr': 2e-05, 'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0.01} +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] pld_enabled .................. False +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] pld_params ................... False +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] prescale_gradients ........... False +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] scheduler_name ............... None +[2024-12-31 17:10:04,936] [INFO] [config.py:1000:print] scheduler_params ............. None +[2024-12-31 17:10:04,937] [INFO] [config.py:1000:print] seq_parallel_communication_data_type torch.float32 +[2024-12-31 17:10:04,937] [INFO] [config.py:1000:print] sparse_attention ............. None +[2024-12-31 17:10:04,937] [INFO] [config.py:1000:print] sparse_gradients_enabled ..... False +[2024-12-31 17:10:04,937] [INFO] [config.py:1000:print] steps_per_print .............. inf +[2024-12-31 17:10:04,937] [INFO] [config.py:1000:print] train_batch_size ............. 256 +[2024-12-31 17:10:04,937] [INFO] [config.py:1000:print] train_micro_batch_size_per_gpu 2 +[2024-12-31 17:10:04,937] [INFO] [config.py:1000:print] use_data_before_expert_parallel_ False +[2024-12-31 17:10:04,937] [INFO] [config.py:1000:print] use_node_local_storage ....... False +[2024-12-31 17:10:04,937] [INFO] [config.py:1000:print] wall_clock_breakdown ......... True +[2024-12-31 17:10:04,937] [INFO] [config.py:1000:print] weight_quantization_config ... None +[2024-12-31 17:10:04,937] [INFO] [config.py:1000:print] world_size ................... 8 +[2024-12-31 17:10:04,937] [INFO] [config.py:1000:print] zero_allow_untested_optimizer False +[2024-12-31 17:10:04,937] [INFO] [config.py:1000:print] zero_config .................. stage=1 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=1000000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=1000000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True +[2024-12-31 17:10:04,937] [INFO] [config.py:1000:print] zero_enabled ................. True +[2024-12-31 17:10:04,937] [INFO] [config.py:1000:print] zero_force_ds_cpu_optimizer .. True +[2024-12-31 17:10:04,937] [INFO] [config.py:1000:print] zero_optimization_stage ...... 1 +[2024-12-31 17:10:04,937] [INFO] [config.py:986:print_user_config] json = { + "zero_optimization": { + "stage": 1, + "allgather_partitions": true, + "allgather_bucket_size": 1.000000e+09, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 1.000000e+09, + "contiguous_gradients": true + }, + "fp16": { + "enabled": false, + "auto_cast": true, + "loss_scale": 0, + "initial_scale_power": 32, + "loss_scale_window": 1000, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": true + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": 2e-05, + "betas": [0.9, 0.999], + "eps": 1e-08, + "weight_decay": 0.01 + } + }, + "gradient_accumulation_steps": 16, + "gradient_clipping": 1.0, + "steps_per_print": inf, + "train_batch_size": 256, + "train_micro_batch_size_per_gpu": 2, + "wall_clock_breakdown": true +} +[INFO|trainer.py:1721] 2024-12-31 17:10:04,937 >> ***** Running training ***** +[INFO|trainer.py:1722] 2024-12-31 17:10:04,937 >> Num examples = 194,308 +[INFO|trainer.py:1723] 2024-12-31 17:10:04,937 >> Num Epochs = 1 +[INFO|trainer.py:1724] 2024-12-31 17:10:04,937 >> Instantaneous batch size per device = 2 +[INFO|trainer.py:1727] 2024-12-31 17:10:04,937 >> Total train batch size (w. parallel, distributed & accumulation) = 256 +[INFO|trainer.py:1728] 2024-12-31 17:10:04,937 >> Gradient Accumulation steps = 16 +[INFO|trainer.py:1729] 2024-12-31 17:10:04,937 >> Total optimization steps = 759 +[INFO|trainer.py:1730] 2024-12-31 17:10:04,939 >> Number of trainable parameters = 391,716,864 + 0%| | 0/759 [00:00> Saving model checkpoint to work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-200 +[INFO|configuration_utils.py:473] 2024-12-31 17:36:56,163 >> Configuration saved in work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-200/config.json +[INFO|configuration_utils.py:594] 2024-12-31 17:36:56,172 >> Configuration saved in work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-200/generation_config.json +[INFO|modeling_utils.py:2493] 2024-12-31 17:39:15,607 >> Model weights saved in work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-200/model.safetensors +[INFO|tokenization_utils_base.py:2433] 2024-12-31 17:39:15,788 >> tokenizer config file saved in work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-200/tokenizer_config.json +[INFO|tokenization_utils_base.py:2442] 2024-12-31 17:39:15,862 >> Special tokens file saved in work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-200/special_tokens_map.json +[INFO|tokenization_utils_base.py:2493] 2024-12-31 17:39:15,892 >> added tokens file saved in work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-200/added_tokens.json +[2024-12-31 17:39:29,658] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step200 is about to be saved! +[2024-12-31 17:39:29,736] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-200/global_step200/mp_rank_00_model_states.pt +[2024-12-31 17:39:29,736] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-200/global_step200/mp_rank_00_model_states.pt... +[2024-12-31 17:39:51,587] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-200/global_step200/mp_rank_00_model_states.pt. +[2024-12-31 17:39:51,621] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2024-12-31 17:40:01,985] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2024-12-31 17:40:02,212] [INFO] [engine.py:3488:_save_zero_checkpoint] zero checkpoint saved work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-200/global_step200/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2024-12-31 17:40:02,214] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step200 is ready now! +[2024-12-31 17:40:04,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 256.01 | bwd_microstep: 343.70 | bwd_inner_microstep: 343.32 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.27 +[2024-12-31 17:40:05,171] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.43 | bwd_microstep: 281.57 | bwd_inner_microstep: 281.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:40:05,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.32 | bwd_microstep: 265.46 | bwd_inner_microstep: 265.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:40:06,099] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.61 | bwd_microstep: 261.80 | bwd_inner_microstep: 261.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 17:40:06,534] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.37 | bwd_microstep: 247.10 | bwd_inner_microstep: 247.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 17:40:06,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.55 | bwd_microstep: 245.04 | bwd_inner_microstep: 245.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 17:40:07,395] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.73 | bwd_microstep: 244.72 | bwd_inner_microstep: 244.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.10 +[2024-12-31 17:40:07,828] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.43 | bwd_microstep: 243.63 | bwd_inner_microstep: 243.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:40:15,790] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 194.73 | bwd_microstep: 245.57 | bwd_inner_microstep: 245.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:40:16,222] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.55 | bwd_microstep: 244.47 | bwd_inner_microstep: 244.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:40:16,664] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.51 | bwd_microstep: 243.24 | bwd_inner_microstep: 243.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:40:17,095] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.34 | bwd_microstep: 242.90 | bwd_inner_microstep: 242.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:40:17,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.85 | bwd_microstep: 241.67 | bwd_inner_microstep: 241.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:40:18,383] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.13 | bwd_microstep: 240.78 | bwd_inner_microstep: 240.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:40:18,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.70 | bwd_microstep: 243.83 | bwd_inner_microstep: 243.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:40:20,271] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.32 | optimizer_gradients: 1.03 | optimizer_step: 3.47 +[2024-12-31 17:40:20,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.90 | bwd_microstep: 1257.36 | bwd_inner_microstep: 243.83 | bwd_allreduce_microstep: 1013.48 | step_microstep: 14.15 +[2024-12-31 17:40:20,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2872.47 | bwd: 5092.89 | bwd_inner: 4078.63 | bwd_allreduce: 1013.74 | step: 17.16 + 26%|██▋ | 201/759 [30:15<10:21:30, 66.83s/it] {'loss': 1.2496, 'learning_rate': 1.7249824909333445e-05, 'epoch': 0.26} + 26%|██▋ | 201/759 [30:15<10:21:30, 66.83s/it][2024-12-31 17:40:20,815] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 205.88 | bwd_microstep: 309.78 | bwd_inner_microstep: 309.40 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 17:40:21,316] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.70 | bwd_microstep: 292.20 | bwd_inner_microstep: 292.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:40:21,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.33 | bwd_microstep: 268.75 | bwd_inner_microstep: 268.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 17:40:22,244] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.00 | bwd_microstep: 263.58 | bwd_inner_microstep: 263.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:40:22,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.45 | bwd_microstep: 248.85 | bwd_inner_microstep: 248.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:40:23,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.98 | bwd_microstep: 261.15 | bwd_inner_microstep: 261.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:40:23,567] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.22 | bwd_microstep: 246.05 | bwd_inner_microstep: 246.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:40:24,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.61 | bwd_microstep: 246.29 | bwd_inner_microstep: 246.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:40:24,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.88 | bwd_microstep: 245.54 | bwd_inner_microstep: 245.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:40:25,036] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.38 | bwd_microstep: 243.13 | bwd_inner_microstep: 243.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:40:25,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.00 | bwd_microstep: 247.93 | bwd_inner_microstep: 247.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:40:25,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.38 | bwd_microstep: 262.98 | bwd_inner_microstep: 262.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:40:26,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.09 | bwd_microstep: 246.06 | bwd_inner_microstep: 246.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:40:26,790] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.52 | bwd_microstep: 240.54 | bwd_inner_microstep: 240.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:40:27,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.62 | bwd_microstep: 242.46 | bwd_inner_microstep: 242.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:40:31,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.14 | optimizer_gradients: 0.88 | optimizer_step: 3.31 +[2024-12-31 17:40:31,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.07 | bwd_microstep: 2762.29 | bwd_inner_microstep: 243.76 | bwd_allreduce_microstep: 2518.47 | step_microstep: 13.55 +[2024-12-31 17:40:31,072] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2806.50 | bwd: 6627.71 | bwd_inner: 4108.30 | bwd_allreduce: 2518.73 | step: 16.80 + 27%|██▋ | 202/759 [30:26<7:44:21, 50.02s/it] {'loss': 1.2728, 'learning_rate': 1.722035913991048e-05, 'epoch': 0.27} + 27%|██▋ | 202/759 [30:26<7:44:21, 50.02s/it][2024-12-31 17:40:31,675] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 218.83 | bwd_microstep: 346.53 | bwd_inner_microstep: 346.00 | bwd_allreduce_microstep: 0.21 | step_microstep: 0.28 +[2024-12-31 17:40:32,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.88 | bwd_microstep: 279.89 | bwd_inner_microstep: 279.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:40:32,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.44 | bwd_microstep: 256.41 | bwd_inner_microstep: 256.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:40:33,069] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.12 | bwd_microstep: 254.12 | bwd_inner_microstep: 254.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 17:40:33,510] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.32 | bwd_microstep: 248.91 | bwd_inner_microstep: 248.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.28 +[2024-12-31 17:40:33,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.95 | bwd_microstep: 246.25 | bwd_inner_microstep: 246.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 17:40:34,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.42 | bwd_microstep: 245.52 | bwd_inner_microstep: 245.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:40:34,814] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.59 | bwd_microstep: 244.50 | bwd_inner_microstep: 244.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:40:35,250] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.70 | bwd_microstep: 247.40 | bwd_inner_microstep: 247.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:40:35,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 243.67 | bwd_inner_microstep: 243.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:40:36,109] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.03 | bwd_microstep: 241.16 | bwd_inner_microstep: 241.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:40:36,538] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.67 | bwd_microstep: 243.15 | bwd_inner_microstep: 243.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:40:36,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 242.61 | bwd_inner_microstep: 242.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:40:37,395] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.49 | bwd_microstep: 240.79 | bwd_inner_microstep: 240.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:40:38,325] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 166.83 | bwd_microstep: 240.80 | bwd_inner_microstep: 240.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:40:39,090] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.43 | optimizer_gradients: 0.64 | optimizer_step: 3.10 +[2024-12-31 17:40:39,091] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.98 | bwd_microstep: 569.30 | bwd_inner_microstep: 241.81 | bwd_allreduce_microstep: 327.43 | step_microstep: 11.86 +[2024-12-31 17:40:39,092] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2794.70 | bwd: 4391.13 | bwd_inner: 4062.66 | bwd_allreduce: 327.76 | step: 15.19 + 27%|██▋ | 203/759 [30:34<5:46:45, 37.42s/it] {'loss': 1.2566, 'learning_rate': 1.719076181703291e-05, 'epoch': 0.27} + 27%|██▋ | 203/759 [30:34<5:46:45, 37.42s/it][2024-12-31 17:40:39,601] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 194.92 | bwd_microstep: 297.18 | bwd_inner_microstep: 296.83 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.22 +[2024-12-31 17:40:40,088] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 186.84 | bwd_microstep: 280.78 | bwd_inner_microstep: 280.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:40:41,129] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.25 | bwd_microstep: 282.80 | bwd_inner_microstep: 282.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:40:41,604] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.18 | bwd_microstep: 280.94 | bwd_inner_microstep: 280.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:40:42,054] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.87 | bwd_microstep: 256.16 | bwd_inner_microstep: 256.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:40:42,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.15 | bwd_microstep: 249.29 | bwd_inner_microstep: 249.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:40:43,033] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.58 | bwd_microstep: 245.70 | bwd_inner_microstep: 245.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:40:44,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.04 | bwd_microstep: 245.52 | bwd_inner_microstep: 245.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:40:44,950] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.27 | bwd_microstep: 246.01 | bwd_inner_microstep: 245.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:40:45,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.07 | bwd_microstep: 244.09 | bwd_inner_microstep: 244.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:40:45,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.86 | bwd_microstep: 243.51 | bwd_inner_microstep: 243.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:40:46,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.14 | bwd_microstep: 242.77 | bwd_inner_microstep: 242.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:40:47,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.92 | bwd_microstep: 247.87 | bwd_inner_microstep: 247.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:40:49,326] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.13 | bwd_microstep: 1134.64 | bwd_inner_microstep: 1134.31 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.26 +[2024-12-31 17:40:49,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.88 | bwd_microstep: 243.54 | bwd_inner_microstep: 243.25 | bwd_allreduce_microstep: 0.19 | step_microstep: 0.23 +[2024-12-31 17:40:50,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.00 | optimizer_gradients: 0.75 | optimizer_step: 3.54 +[2024-12-31 17:40:50,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.41 | bwd_microstep: 256.83 | bwd_inner_microstep: 243.06 | bwd_allreduce_microstep: 13.65 | step_microstep: 13.70 +[2024-12-31 17:40:50,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2788.84 | bwd: 4997.84 | bwd_inner: 4982.54 | bwd_allreduce: 14.34 | step: 16.84 + 27%|██▋ | 204/759 [30:45<4:33:09, 29.53s/it] {'loss': 1.2786, 'learning_rate': 1.716103347995785e-05, 'epoch': 0.27} + 27%|██▋ | 204/759 [30:45<4:33:09, 29.53s/it][2024-12-31 17:40:50,744] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.26 | bwd_microstep: 313.53 | bwd_inner_microstep: 313.15 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.25 +[2024-12-31 17:40:51,243] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.97 | bwd_microstep: 289.01 | bwd_inner_microstep: 288.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:40:51,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.68 | bwd_microstep: 263.54 | bwd_inner_microstep: 263.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:40:52,157] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.25 | bwd_microstep: 254.05 | bwd_inner_microstep: 254.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:40:52,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.72 | bwd_microstep: 248.33 | bwd_inner_microstep: 248.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:40:53,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.93 | bwd_microstep: 248.47 | bwd_inner_microstep: 248.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:40:53,463] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.39 | bwd_microstep: 245.27 | bwd_inner_microstep: 245.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:40:53,895] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.92 | bwd_microstep: 244.76 | bwd_inner_microstep: 244.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:40:54,326] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.04 | bwd_microstep: 244.30 | bwd_inner_microstep: 244.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:40:55,630] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.32 | bwd_microstep: 244.37 | bwd_inner_microstep: 244.24 | bwd_allreduce_microstep: 0.03 | step_microstep: 0.21 +[2024-12-31 17:40:56,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.19 | bwd_microstep: 244.34 | bwd_inner_microstep: 244.13 | bwd_allreduce_microstep: 0.10 | step_microstep: 0.31 +[2024-12-31 17:40:56,493] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.49 | bwd_microstep: 243.22 | bwd_inner_microstep: 243.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:40:56,924] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.89 | bwd_microstep: 243.57 | bwd_inner_microstep: 243.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:40:58,178] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.03 | bwd_microstep: 243.15 | bwd_inner_microstep: 243.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:40:58,604] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 166.65 | bwd_microstep: 242.67 | bwd_inner_microstep: 242.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:40:59,844] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.89 | optimizer_gradients: 0.83 | optimizer_step: 3.27 +[2024-12-31 17:40:59,844] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.75 | bwd_microstep: 1043.04 | bwd_inner_microstep: 244.25 | bwd_allreduce_microstep: 798.74 | step_microstep: 12.31 +[2024-12-31 17:40:59,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2790.90 | bwd: 4855.82 | bwd_inner: 4055.77 | bwd_allreduce: 799.23 | step: 15.59 + 27%|██▋ | 205/759 [30:54<3:37:32, 23.56s/it] {'loss': 1.2873, 'learning_rate': 1.713117467032948e-05, 'epoch': 0.27} + 27%|██▋ | 205/759 [30:54<3:37:32, 23.56s/it][2024-12-31 17:41:00,430] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 217.93 | bwd_microstep: 346.64 | bwd_inner_microstep: 346.22 | bwd_allreduce_microstep: 0.19 | step_microstep: 0.20 +[2024-12-31 17:41:00,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 194.76 | bwd_microstep: 282.12 | bwd_inner_microstep: 282.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:41:01,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.56 | bwd_microstep: 267.92 | bwd_inner_microstep: 267.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:41:01,854] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.47 | bwd_microstep: 263.11 | bwd_inner_microstep: 263.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:02,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.45 | bwd_microstep: 249.89 | bwd_inner_microstep: 249.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 17:41:02,739] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.87 | bwd_microstep: 255.10 | bwd_inner_microstep: 255.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:41:03,182] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.86 | bwd_microstep: 248.34 | bwd_inner_microstep: 248.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:03,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.27 | bwd_microstep: 244.10 | bwd_inner_microstep: 244.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:41:04,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 194.90 | bwd_microstep: 244.47 | bwd_inner_microstep: 244.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:41:04,583] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.72 | bwd_microstep: 323.28 | bwd_inner_microstep: 322.89 | bwd_allreduce_microstep: 0.25 | step_microstep: 0.29 +[2024-12-31 17:41:05,018] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.88 | bwd_microstep: 245.72 | bwd_inner_microstep: 245.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:41:05,453] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.76 | bwd_microstep: 244.95 | bwd_inner_microstep: 244.54 | bwd_allreduce_microstep: 0.22 | step_microstep: 0.29 +[2024-12-31 17:41:05,888] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.11 | bwd_microstep: 247.08 | bwd_inner_microstep: 247.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:41:07,217] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.46 | bwd_microstep: 244.84 | bwd_inner_microstep: 244.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:07,647] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.54 | bwd_microstep: 242.69 | bwd_inner_microstep: 242.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:41:08,301] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.90 | optimizer_gradients: 0.63 | optimizer_step: 3.14 +[2024-12-31 17:41:08,301] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.88 | bwd_microstep: 456.60 | bwd_inner_microstep: 245.05 | bwd_allreduce_microstep: 211.50 | step_microstep: 11.98 +[2024-12-31 17:41:08,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2849.85 | bwd: 4407.05 | bwd_inner: 4193.81 | bwd_allreduce: 212.26 | step: 14.96 + 27%|██▋ | 206/759 [31:03<2:55:23, 19.03s/it] {'loss': 1.2745, 'learning_rate': 1.7101185932169147e-05, 'epoch': 0.27} + 27%|██▋ | 206/759 [31:03<2:55:23, 19.03s/it][2024-12-31 17:41:08,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.08 | bwd_microstep: 315.20 | bwd_inner_microstep: 314.82 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 17:41:09,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.75 | bwd_microstep: 292.33 | bwd_inner_microstep: 292.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:41:10,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.88 | bwd_microstep: 265.56 | bwd_inner_microstep: 265.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:10,754] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.44 | bwd_microstep: 266.58 | bwd_inner_microstep: 266.40 | bwd_allreduce_microstep: 0.07 | step_microstep: 0.23 +[2024-12-31 17:41:11,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.47 | bwd_microstep: 255.72 | bwd_inner_microstep: 255.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:41:11,663] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.80 | bwd_microstep: 246.72 | bwd_inner_microstep: 246.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:41:12,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.14 | bwd_microstep: 247.99 | bwd_inner_microstep: 247.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:12,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.96 | bwd_microstep: 247.34 | bwd_inner_microstep: 247.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:12,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.79 | bwd_microstep: 245.40 | bwd_inner_microstep: 245.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:41:14,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.22 | bwd_microstep: 243.48 | bwd_inner_microstep: 243.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:14,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.36 | bwd_microstep: 242.10 | bwd_inner_microstep: 242.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:15,031] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.79 | bwd_microstep: 243.18 | bwd_inner_microstep: 243.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:41:15,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.33 | bwd_microstep: 242.18 | bwd_inner_microstep: 242.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:16,122] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.14 | bwd_microstep: 243.24 | bwd_inner_microstep: 243.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 17:41:16,551] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.22 | bwd_microstep: 241.19 | bwd_inner_microstep: 241.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:41:18,718] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.86 | optimizer_gradients: 0.85 | optimizer_step: 3.25 +[2024-12-31 17:41:18,718] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.98 | bwd_microstep: 1969.93 | bwd_inner_microstep: 242.52 | bwd_allreduce_microstep: 1727.36 | step_microstep: 12.96 +[2024-12-31 17:41:18,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2800.69 | bwd: 5808.25 | bwd_inner: 4079.90 | bwd_allreduce: 1727.72 | step: 15.94 + 27%|██▋ | 207/759 [31:13<2:31:18, 16.45s/it] {'loss': 1.249, 'learning_rate': 1.7071067811865477e-05, 'epoch': 0.27} + 27%|██▋ | 207/759 [31:13<2:31:18, 16.45s/it][2024-12-31 17:41:19,322] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 225.47 | bwd_microstep: 362.21 | bwd_inner_microstep: 361.85 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 17:41:19,837] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.16 | bwd_microstep: 299.61 | bwd_inner_microstep: 299.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:41:20,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.01 | bwd_microstep: 301.31 | bwd_inner_microstep: 301.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:41:20,830] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.45 | bwd_microstep: 284.70 | bwd_inner_microstep: 284.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:41:21,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.44 | bwd_microstep: 262.89 | bwd_inner_microstep: 262.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 17:41:21,739] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.69 | bwd_microstep: 256.69 | bwd_inner_microstep: 256.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:41:22,178] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.10 | bwd_microstep: 249.46 | bwd_inner_microstep: 249.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:22,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.94 | bwd_microstep: 247.95 | bwd_inner_microstep: 247.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:23,053] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.35 | bwd_microstep: 248.35 | bwd_inner_microstep: 248.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:41:23,489] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.19 | bwd_microstep: 246.80 | bwd_inner_microstep: 246.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:23,921] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.61 | bwd_microstep: 244.06 | bwd_inner_microstep: 244.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 17:41:24,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.97 | bwd_microstep: 242.98 | bwd_inner_microstep: 242.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:24,782] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.52 | bwd_microstep: 243.54 | bwd_inner_microstep: 243.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:41:25,218] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.06 | bwd_microstep: 245.11 | bwd_inner_microstep: 245.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:41:25,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.44 | bwd_microstep: 243.00 | bwd_inner_microstep: 242.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:26,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.55 | optimizer_gradients: 0.95 | optimizer_step: 3.38 +[2024-12-31 17:41:26,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.41 | bwd_microstep: 257.39 | bwd_inner_microstep: 243.67 | bwd_allreduce_microstep: 13.60 | step_microstep: 12.28 +[2024-12-31 17:41:26,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2854.19 | bwd: 4236.23 | bwd_inner: 4221.57 | bwd_allreduce: 13.89 | step: 15.29 + 27%|██▋ | 208/759 [31:21<2:06:04, 13.73s/it] {'loss': 1.2347, 'learning_rate': 1.7040820858164413e-05, 'epoch': 0.27} + 27%|██▋ | 208/759 [31:21<2:06:04, 13.73s/it][2024-12-31 17:41:26,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 237.31 | bwd_microstep: 392.40 | bwd_inner_microstep: 391.98 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 17:41:27,257] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.00 | bwd_microstep: 297.43 | bwd_inner_microstep: 297.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:41:27,723] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.56 | bwd_microstep: 267.85 | bwd_inner_microstep: 267.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:41:28,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.38 | bwd_microstep: 262.65 | bwd_inner_microstep: 262.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:41:28,634] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.37 | bwd_microstep: 257.21 | bwd_inner_microstep: 257.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:41:29,083] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.58 | bwd_microstep: 256.20 | bwd_inner_microstep: 256.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 17:41:29,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.37 | bwd_microstep: 248.71 | bwd_inner_microstep: 248.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:41:29,956] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.51 | bwd_microstep: 247.45 | bwd_inner_microstep: 247.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:30,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 244.64 | bwd_inner_microstep: 244.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:30,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.79 | bwd_microstep: 244.83 | bwd_inner_microstep: 244.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:41:31,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.42 | bwd_microstep: 243.53 | bwd_inner_microstep: 243.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:31,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 244.68 | bwd_inner_microstep: 244.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:41:32,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.31 | bwd_microstep: 245.80 | bwd_inner_microstep: 245.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:41:32,541] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 245.24 | bwd_inner_microstep: 244.94 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.35 +[2024-12-31 17:41:32,979] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.43 | bwd_microstep: 249.44 | bwd_inner_microstep: 249.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:41:33,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.40 | optimizer_gradients: 0.61 | optimizer_step: 3.09 +[2024-12-31 17:41:33,473] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.00 | bwd_microstep: 302.47 | bwd_inner_microstep: 242.99 | bwd_allreduce_microstep: 59.43 | step_microstep: 11.30 +[2024-12-31 17:41:33,474] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2840.15 | bwd: 4250.70 | bwd_inner: 4190.06 | bwd_allreduce: 59.89 | step: 14.26 + 28%|██▊ | 209/759 [31:28<1:48:20, 11.82s/it] {'loss': 1.2614, 'learning_rate': 1.7010445622159214e-05, 'epoch': 0.28} + 28%|██▊ | 209/759 [31:28<1:48:20, 11.82s/it][2024-12-31 17:41:33,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 199.82 | bwd_microstep: 298.78 | bwd_inner_microstep: 298.41 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.23 +[2024-12-31 17:41:34,588] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 223.33 | bwd_microstep: 357.91 | bwd_inner_microstep: 357.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:41:35,048] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.81 | bwd_microstep: 263.24 | bwd_inner_microstep: 263.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:41:35,502] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.75 | bwd_microstep: 257.47 | bwd_inner_microstep: 257.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 17:41:35,956] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.53 | bwd_microstep: 257.50 | bwd_inner_microstep: 257.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:41:36,395] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.90 | bwd_microstep: 249.27 | bwd_inner_microstep: 249.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:41:36,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.47 | bwd_microstep: 299.74 | bwd_inner_microstep: 299.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:41:37,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.58 | bwd_microstep: 246.89 | bwd_inner_microstep: 246.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:41:37,746] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.67 | bwd_microstep: 246.43 | bwd_inner_microstep: 246.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:41:38,178] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.14 | bwd_microstep: 244.61 | bwd_inner_microstep: 244.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:38,610] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 244.12 | bwd_inner_microstep: 244.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:39,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 243.29 | bwd_inner_microstep: 243.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:41:39,480] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.56 | bwd_microstep: 250.88 | bwd_inner_microstep: 250.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:41:39,917] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.66 | bwd_microstep: 250.87 | bwd_inner_microstep: 250.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:40,351] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.23 | bwd_microstep: 242.48 | bwd_inner_microstep: 242.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:41:40,986] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.76 | optimizer_gradients: 0.75 | optimizer_step: 3.17 +[2024-12-31 17:41:40,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.19 | bwd_microstep: 439.76 | bwd_inner_microstep: 241.84 | bwd_allreduce_microstep: 197.87 | step_microstep: 11.79 +[2024-12-31 17:41:40,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2834.00 | bwd: 4393.39 | bwd_inner: 4194.59 | bwd_allreduce: 198.14 | step: 14.72 + 28%|██▊ | 210/759 [31:35<1:36:19, 10.53s/it] {'loss': 1.2607, 'learning_rate': 1.6979942657280414e-05, 'epoch': 0.28} + 28%|██▊ | 210/759 [31:35<1:36:19, 10.53s/it][2024-12-31 17:41:41,553] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.57 | bwd_microstep: 337.82 | bwd_inner_microstep: 337.45 | bwd_allreduce_microstep: 0.16 | step_microstep: 0.20 +[2024-12-31 17:41:42,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.52 | bwd_microstep: 285.85 | bwd_inner_microstep: 285.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:42,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.96 | bwd_microstep: 268.95 | bwd_inner_microstep: 268.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:41:42,976] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.51 | bwd_microstep: 262.46 | bwd_inner_microstep: 262.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:43,430] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.17 | bwd_microstep: 258.35 | bwd_inner_microstep: 258.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:41:43,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.46 | bwd_microstep: 262.15 | bwd_inner_microstep: 262.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:41:44,325] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.41 | bwd_microstep: 250.24 | bwd_inner_microstep: 250.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 17:41:44,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 246.01 | bwd_inner_microstep: 245.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:45,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.13 | bwd_microstep: 247.21 | bwd_inner_microstep: 247.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:45,630] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.63 | bwd_microstep: 246.16 | bwd_inner_microstep: 246.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:41:46,067] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.40 | bwd_microstep: 244.12 | bwd_inner_microstep: 244.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:46,503] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.18 | bwd_microstep: 246.96 | bwd_inner_microstep: 246.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:46,935] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.76 | bwd_microstep: 243.64 | bwd_inner_microstep: 243.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 17:41:47,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.99 | bwd_microstep: 244.17 | bwd_inner_microstep: 244.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:47,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.15 | bwd_microstep: 244.49 | bwd_inner_microstep: 244.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:48,485] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.61 | optimizer_gradients: 0.61 | optimizer_step: 3.10 +[2024-12-31 17:41:48,486] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.25 | bwd_microstep: 485.74 | bwd_inner_microstep: 244.56 | bwd_allreduce_microstep: 241.13 | step_microstep: 11.23 +[2024-12-31 17:41:48,487] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2827.02 | bwd: 4374.37 | bwd_inner: 4132.45 | bwd_allreduce: 241.40 | step: 14.23 + 28%|██▊ | 211/759 [31:43<1:27:51, 9.62s/it] {'loss': 1.2215, 'learning_rate': 1.694931251928575e-05, 'epoch': 0.28} + 28%|██▊ | 211/759 [31:43<1:27:51, 9.62s/it][2024-12-31 17:41:49,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 227.36 | bwd_microstep: 369.89 | bwd_inner_microstep: 369.53 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.21 +[2024-12-31 17:41:49,617] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 196.80 | bwd_microstep: 297.54 | bwd_inner_microstep: 297.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:50,110] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.91 | bwd_microstep: 286.93 | bwd_inner_microstep: 286.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:41:50,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.02 | bwd_microstep: 289.22 | bwd_inner_microstep: 289.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:41:51,068] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.97 | bwd_microstep: 263.09 | bwd_inner_microstep: 263.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:51,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 236.93 | bwd_microstep: 261.25 | bwd_inner_microstep: 261.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:41:52,024] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.16 | bwd_microstep: 250.54 | bwd_inner_microstep: 250.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:41:52,467] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.47 | bwd_microstep: 251.77 | bwd_inner_microstep: 251.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:41:52,909] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.03 | bwd_microstep: 248.85 | bwd_inner_microstep: 248.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:53,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.06 | bwd_microstep: 245.36 | bwd_inner_microstep: 245.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:41:53,780] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.49 | bwd_microstep: 244.16 | bwd_inner_microstep: 244.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:41:54,216] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.25 | bwd_microstep: 245.15 | bwd_inner_microstep: 245.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:41:54,650] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.93 | bwd_microstep: 245.02 | bwd_inner_microstep: 244.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 17:41:55,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.54 | bwd_microstep: 244.66 | bwd_inner_microstep: 244.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:55,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.33 | bwd_microstep: 244.51 | bwd_inner_microstep: 244.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:41:56,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.23 | optimizer_gradients: 0.58 | optimizer_step: 3.21 +[2024-12-31 17:41:56,298] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.84 | bwd_microstep: 577.87 | bwd_inner_microstep: 247.02 | bwd_allreduce_microstep: 330.80 | step_microstep: 11.45 +[2024-12-31 17:41:56,299] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2941.42 | bwd: 4566.00 | bwd_inner: 4234.24 | bwd_allreduce: 331.06 | step: 14.64 + 28%|██▊ | 212/759 [31:51<1:22:45, 9.08s/it] {'loss': 1.2279, 'learning_rate': 1.691855576625001e-05, 'epoch': 0.28} + 28%|██▊ | 212/759 [31:51<1:22:45, 9.08s/it][2024-12-31 17:41:56,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.69 | bwd_microstep: 337.08 | bwd_inner_microstep: 336.69 | bwd_allreduce_microstep: 0.17 | step_microstep: 0.22 +[2024-12-31 17:41:57,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.07 | bwd_microstep: 283.21 | bwd_inner_microstep: 283.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:41:57,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.85 | bwd_microstep: 287.98 | bwd_inner_microstep: 287.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:41:58,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.72 | bwd_microstep: 287.58 | bwd_inner_microstep: 287.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:41:58,894] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.12 | bwd_microstep: 286.73 | bwd_inner_microstep: 286.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:41:59,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.67 | bwd_microstep: 257.40 | bwd_inner_microstep: 257.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:41:59,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.70 | bwd_microstep: 249.51 | bwd_inner_microstep: 249.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:42:00,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.17 | bwd_microstep: 247.84 | bwd_inner_microstep: 247.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:42:00,657] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 245.37 | bwd_inner_microstep: 245.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:01,090] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.84 | bwd_microstep: 244.74 | bwd_inner_microstep: 244.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:42:01,528] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 246.73 | bwd_inner_microstep: 246.51 | bwd_allreduce_microstep: 0.10 | step_microstep: 0.22 +[2024-12-31 17:42:01,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 244.16 | bwd_inner_microstep: 244.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:42:02,400] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 247.20 | bwd_inner_microstep: 247.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:42:02,836] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.32 | bwd_microstep: 246.51 | bwd_inner_microstep: 246.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:03,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.71 | bwd_microstep: 246.57 | bwd_inner_microstep: 246.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:42:03,716] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.77 | optimizer_step: 3.39 +[2024-12-31 17:42:03,717] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.47 | bwd_microstep: 257.99 | bwd_inner_microstep: 244.34 | bwd_allreduce_microstep: 13.55 | step_microstep: 11.33 +[2024-12-31 17:42:03,719] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2848.51 | bwd: 4216.74 | bwd_inner: 4202.05 | bwd_allreduce: 13.99 | step: 14.37 + 28%|██▊ | 213/759 [31:58<1:18:04, 8.58s/it] {'loss': 1.2378, 'learning_rate': 1.68876729585549e-05, 'epoch': 0.28} + 28%|██▊ | 213/759 [31:58<1:18:04, 8.58s/it][2024-12-31 17:42:04,394] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 238.14 | bwd_microstep: 388.21 | bwd_inner_microstep: 387.85 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.21 +[2024-12-31 17:42:04,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.15 | bwd_microstep: 306.20 | bwd_inner_microstep: 306.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:42:05,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.86 | bwd_microstep: 291.13 | bwd_inner_microstep: 291.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:42:05,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.79 | bwd_microstep: 281.14 | bwd_inner_microstep: 281.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:42:06,339] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.42 | bwd_microstep: 250.63 | bwd_inner_microstep: 250.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:42:06,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.40 | bwd_microstep: 254.30 | bwd_inner_microstep: 254.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:07,225] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 245.36 | bwd_inner_microstep: 245.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:07,657] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.71 | bwd_microstep: 246.78 | bwd_inner_microstep: 246.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:42:08,092] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 246.50 | bwd_inner_microstep: 246.31 | bwd_allreduce_microstep: 0.07 | step_microstep: 0.24 +[2024-12-31 17:42:08,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 244.29 | bwd_inner_microstep: 244.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:42:08,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.10 | bwd_microstep: 242.95 | bwd_inner_microstep: 242.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:42:09,432] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.65 | bwd_microstep: 243.66 | bwd_inner_microstep: 243.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:09,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.44 | bwd_microstep: 244.57 | bwd_inner_microstep: 244.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:10,291] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.50 | bwd_microstep: 241.48 | bwd_inner_microstep: 241.37 | bwd_allreduce_microstep: 0.04 | step_microstep: 0.21 +[2024-12-31 17:42:10,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.96 | bwd_microstep: 242.09 | bwd_inner_microstep: 242.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:11,178] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 1.00 | optimizer_step: 3.81 +[2024-12-31 17:42:11,180] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.56 | bwd_microstep: 258.10 | bwd_inner_microstep: 244.08 | bwd_allreduce_microstep: 13.93 | step_microstep: 13.22 +[2024-12-31 17:42:11,187] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2911.32 | bwd: 4227.59 | bwd_inner: 4212.42 | bwd_allreduce: 14.38 | step: 16.42 + 28%|██▊ | 214/759 [32:06<1:14:55, 8.25s/it] {'loss': 1.2437, 'learning_rate': 1.6856664658878797e-05, 'epoch': 0.28} + 28%|██▊ | 214/759 [32:06<1:14:55, 8.25s/it][2024-12-31 17:42:11,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 228.96 | bwd_microstep: 359.75 | bwd_inner_microstep: 359.39 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.21 +[2024-12-31 17:42:12,300] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.34 | bwd_microstep: 290.60 | bwd_inner_microstep: 290.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:42:12,767] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.97 | bwd_microstep: 266.99 | bwd_inner_microstep: 266.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:13,218] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.52 | bwd_microstep: 256.78 | bwd_inner_microstep: 256.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:13,657] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.35 | bwd_microstep: 249.34 | bwd_inner_microstep: 249.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:42:14,096] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.17 | bwd_microstep: 248.82 | bwd_inner_microstep: 248.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:42:14,539] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.67 | bwd_microstep: 245.45 | bwd_inner_microstep: 245.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:42:14,972] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 245.00 | bwd_inner_microstep: 244.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:42:15,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 246.45 | bwd_inner_microstep: 246.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:42:15,840] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 243.41 | bwd_inner_microstep: 243.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 17:42:16,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.10 | bwd_microstep: 245.60 | bwd_inner_microstep: 245.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:42:16,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.04 | bwd_microstep: 245.45 | bwd_inner_microstep: 245.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:17,137] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 243.41 | bwd_inner_microstep: 243.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:42:17,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 245.07 | bwd_inner_microstep: 245.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:42:18,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.48 | bwd_microstep: 243.84 | bwd_inner_microstep: 243.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:42:18,459] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.56 | optimizer_gradients: 0.73 | optimizer_step: 3.34 +[2024-12-31 17:42:18,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.45 | bwd_microstep: 258.08 | bwd_inner_microstep: 244.45 | bwd_allreduce_microstep: 13.53 | step_microstep: 10.85 +[2024-12-31 17:42:18,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2832.36 | bwd: 4134.27 | bwd_inner: 4119.61 | bwd_allreduce: 13.82 | step: 14.07 + 28%|██▊ | 215/759 [32:13<1:12:07, 7.95s/it] {'loss': 1.2671, 'learning_rate': 1.6825531432186545e-05, 'epoch': 0.28} + 28%|██▊ | 215/759 [32:13<1:12:07, 7.95s/it][2024-12-31 17:42:19,036] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.87 | bwd_microstep: 343.96 | bwd_inner_microstep: 343.59 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.19 +[2024-12-31 17:42:19,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.10 | bwd_microstep: 311.89 | bwd_inner_microstep: 311.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:42:20,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.73 | bwd_microstep: 280.90 | bwd_inner_microstep: 280.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:42:20,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 186.35 | bwd_microstep: 266.57 | bwd_inner_microstep: 266.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:20,973] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.53 | bwd_microstep: 259.70 | bwd_inner_microstep: 259.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:21,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.45 | bwd_microstep: 247.33 | bwd_inner_microstep: 247.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:42:21,849] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.93 | bwd_microstep: 251.93 | bwd_inner_microstep: 251.57 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.23 +[2024-12-31 17:42:22,283] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 245.74 | bwd_inner_microstep: 245.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:42:22,717] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.14 | bwd_microstep: 245.10 | bwd_inner_microstep: 245.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:23,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.04 | bwd_microstep: 244.26 | bwd_inner_microstep: 244.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:23,583] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.72 | bwd_microstep: 243.54 | bwd_inner_microstep: 243.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:42:24,010] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 243.44 | bwd_inner_microstep: 243.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:24,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.55 | bwd_microstep: 243.46 | bwd_inner_microstep: 243.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:42:24,880] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.55 | bwd_microstep: 250.06 | bwd_inner_microstep: 250.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:25,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.06 | bwd_microstep: 243.03 | bwd_inner_microstep: 243.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:42:26,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.77 | optimizer_gradients: 1.18 | optimizer_step: 3.10 +[2024-12-31 17:42:26,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.83 | bwd_microstep: 537.72 | bwd_inner_microstep: 242.21 | bwd_allreduce_microstep: 295.46 | step_microstep: 11.35 +[2024-12-31 17:42:26,039] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2844.73 | bwd: 4458.73 | bwd_inner: 4162.13 | bwd_allreduce: 295.87 | step: 13.90 + 28%|██▊ | 216/759 [32:20<1:10:57, 7.84s/it] {'loss': 1.251, 'learning_rate': 1.6794273845719096e-05, 'epoch': 0.28} + 28%|██▊ | 216/759 [32:21<1:10:57, 7.84s/it][2024-12-31 17:42:26,567] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.93 | bwd_microstep: 311.47 | bwd_inner_microstep: 310.97 | bwd_allreduce_microstep: 0.18 | step_microstep: 0.20 +[2024-12-31 17:42:27,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 250.52 | bwd_microstep: 418.45 | bwd_inner_microstep: 418.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 17:42:27,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.53 | bwd_microstep: 266.46 | bwd_inner_microstep: 266.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:42:28,159] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.17 | bwd_microstep: 248.93 | bwd_inner_microstep: 248.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:42:28,610] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.30 | bwd_microstep: 257.15 | bwd_inner_microstep: 257.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:42:29,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.30 | bwd_microstep: 262.17 | bwd_inner_microstep: 261.73 | bwd_allreduce_microstep: 0.24 | step_microstep: 0.27 +[2024-12-31 17:42:29,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.75 | bwd_microstep: 252.06 | bwd_inner_microstep: 252.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:42:29,935] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.94 | bwd_microstep: 243.07 | bwd_inner_microstep: 243.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:30,374] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.71 | bwd_microstep: 248.95 | bwd_inner_microstep: 248.78 | bwd_allreduce_microstep: 0.06 | step_microstep: 0.21 +[2024-12-31 17:42:30,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 246.21 | bwd_inner_microstep: 246.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:31,240] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.28 | bwd_microstep: 244.18 | bwd_inner_microstep: 244.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:31,670] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.56 | bwd_microstep: 243.24 | bwd_inner_microstep: 243.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:42:32,114] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.00 | bwd_microstep: 246.15 | bwd_inner_microstep: 245.89 | bwd_allreduce_microstep: 0.10 | step_microstep: 0.31 +[2024-12-31 17:42:32,547] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.71 | bwd_microstep: 244.63 | bwd_inner_microstep: 244.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:42:32,972] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.59 | bwd_microstep: 241.03 | bwd_inner_microstep: 241.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:42:33,425] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.80 | optimizer_step: 3.62 +[2024-12-31 17:42:33,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.96 | bwd_microstep: 255.61 | bwd_inner_microstep: 241.88 | bwd_allreduce_microstep: 13.61 | step_microstep: 11.57 +[2024-12-31 17:42:33,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2851.60 | bwd: 4230.09 | bwd_inner: 4214.39 | bwd_allreduce: 14.47 | step: 14.86 + 29%|██▊ | 217/759 [32:28<1:09:36, 7.71s/it] {'loss': 1.2588, 'learning_rate': 1.6762892468983237e-05, 'epoch': 0.29} + 29%|██▊ | 217/759 [32:28<1:09:36, 7.71s/it][2024-12-31 17:42:34,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 238.91 | bwd_microstep: 393.89 | bwd_inner_microstep: 393.50 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.22 +[2024-12-31 17:42:34,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 225.28 | bwd_microstep: 366.78 | bwd_inner_microstep: 366.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:35,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.74 | bwd_microstep: 286.68 | bwd_inner_microstep: 286.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:42:35,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.52 | bwd_microstep: 263.36 | bwd_inner_microstep: 263.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:42:36,099] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.16 | bwd_microstep: 261.40 | bwd_inner_microstep: 261.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:36,540] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.84 | bwd_microstep: 251.16 | bwd_inner_microstep: 251.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:36,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.96 | bwd_microstep: 247.28 | bwd_inner_microstep: 247.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:42:37,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.88 | bwd_microstep: 245.25 | bwd_inner_microstep: 245.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:37,843] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.72 | bwd_microstep: 246.54 | bwd_inner_microstep: 246.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:38,276] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.28 | bwd_microstep: 244.85 | bwd_inner_microstep: 244.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:38,711] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.03 | bwd_microstep: 244.49 | bwd_inner_microstep: 244.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:39,143] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 244.16 | bwd_inner_microstep: 244.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:39,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.91 | bwd_microstep: 242.49 | bwd_inner_microstep: 242.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:40,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.22 | bwd_microstep: 242.70 | bwd_inner_microstep: 242.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:40,433] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.37 | bwd_microstep: 242.97 | bwd_inner_microstep: 242.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:40,884] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.76 | optimizer_step: 3.43 +[2024-12-31 17:42:40,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.63 | bwd_microstep: 254.86 | bwd_inner_microstep: 241.19 | bwd_allreduce_microstep: 13.56 | step_microstep: 11.45 +[2024-12-31 17:42:40,887] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2886.51 | bwd: 4279.01 | bwd_inner: 4264.41 | bwd_allreduce: 13.84 | step: 14.52 + 29%|██▊ | 218/759 [32:35<1:08:48, 7.63s/it] {'loss': 1.2078, 'learning_rate': 1.673138787374119e-05, 'epoch': 0.29} + 29%|██▊ | 218/759 [32:35<1:08:48, 7.63s/it][2024-12-31 17:42:41,500] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 228.41 | bwd_microstep: 369.58 | bwd_inner_microstep: 369.21 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.19 +[2024-12-31 17:42:42,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.12 | bwd_microstep: 293.92 | bwd_inner_microstep: 293.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:42:42,467] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.65 | bwd_microstep: 266.34 | bwd_inner_microstep: 266.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:42:42,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.89 | bwd_microstep: 262.31 | bwd_inner_microstep: 262.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:42:43,362] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.40 | bwd_microstep: 249.39 | bwd_inner_microstep: 249.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:42:43,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.03 | bwd_microstep: 247.99 | bwd_inner_microstep: 247.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:42:44,257] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.14 | bwd_microstep: 265.20 | bwd_inner_microstep: 265.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:42:44,691] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.29 | bwd_microstep: 245.71 | bwd_inner_microstep: 245.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:42:45,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.09 | bwd_microstep: 245.31 | bwd_inner_microstep: 245.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:42:45,586] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 196.13 | bwd_microstep: 245.69 | bwd_inner_microstep: 245.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:46,017] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.86 | bwd_microstep: 243.56 | bwd_inner_microstep: 243.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.10 +[2024-12-31 17:42:46,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.44 | bwd_microstep: 246.67 | bwd_inner_microstep: 246.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:42:46,892] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 247.63 | bwd_inner_microstep: 247.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:42:47,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 244.18 | bwd_inner_microstep: 243.80 | bwd_allreduce_microstep: 0.24 | step_microstep: 0.26 +[2024-12-31 17:42:47,748] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.38 | bwd_microstep: 242.60 | bwd_inner_microstep: 242.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 17:42:48,204] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.42 | optimizer_gradients: 0.81 | optimizer_step: 3.42 +[2024-12-31 17:42:48,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.22 | bwd_microstep: 254.96 | bwd_inner_microstep: 241.17 | bwd_allreduce_microstep: 13.67 | step_microstep: 15.50 +[2024-12-31 17:42:48,207] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2849.12 | bwd: 4171.27 | bwd_inner: 4156.15 | bwd_allreduce: 14.22 | step: 18.45 + 29%|██▉ | 219/759 [32:43<1:07:50, 7.54s/it] {'loss': 1.2534, 'learning_rate': 1.6699760634000166e-05, 'epoch': 0.29} + 29%|██▉ | 219/759 [32:43<1:07:50, 7.54s/it][2024-12-31 17:42:48,726] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 205.26 | bwd_microstep: 297.70 | bwd_inner_microstep: 297.25 | bwd_allreduce_microstep: 0.18 | step_microstep: 0.24 +[2024-12-31 17:42:49,219] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.70 | bwd_microstep: 288.00 | bwd_inner_microstep: 287.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:42:49,678] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.42 | bwd_microstep: 263.21 | bwd_inner_microstep: 263.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 17:42:50,129] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.03 | bwd_microstep: 256.52 | bwd_inner_microstep: 256.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:50,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 270.27 | bwd_microstep: 459.40 | bwd_inner_microstep: 459.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:42:51,312] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.41 | bwd_microstep: 247.55 | bwd_inner_microstep: 247.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:51,747] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.17 | bwd_microstep: 246.17 | bwd_inner_microstep: 246.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:42:52,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.85 | bwd_microstep: 247.72 | bwd_inner_microstep: 247.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:42:52,645] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 195.27 | bwd_microstep: 244.57 | bwd_inner_microstep: 244.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:42:53,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 244.86 | bwd_inner_microstep: 244.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:42:53,513] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.87 | bwd_microstep: 244.96 | bwd_inner_microstep: 244.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:42:53,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.06 | bwd_microstep: 243.83 | bwd_inner_microstep: 243.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:42:54,389] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.96 | bwd_microstep: 254.93 | bwd_inner_microstep: 254.65 | bwd_allreduce_microstep: 0.10 | step_microstep: 0.15 +[2024-12-31 17:42:54,809] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.47 | bwd_microstep: 240.79 | bwd_inner_microstep: 240.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 17:42:55,238] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.80 | bwd_microstep: 241.42 | bwd_inner_microstep: 241.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:55,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.31 | optimizer_gradients: 0.70 | optimizer_step: 3.26 +[2024-12-31 17:42:55,688] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.79 | bwd_microstep: 255.09 | bwd_inner_microstep: 241.37 | bwd_allreduce_microstep: 13.60 | step_microstep: 11.92 +[2024-12-31 17:42:55,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2918.43 | bwd: 4276.87 | bwd_inner: 4261.97 | bwd_allreduce: 14.04 | step: 14.93 + 29%|██▉ | 220/759 [32:50<1:07:34, 7.52s/it] {'loss': 1.2941, 'learning_rate': 1.6668011326001962e-05, 'epoch': 0.29} + 29%|██▉ | 220/759 [32:50<1:07:34, 7.52s/it][2024-12-31 17:42:56,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 218.81 | bwd_microstep: 347.89 | bwd_inner_microstep: 347.52 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 17:42:56,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 223.10 | bwd_microstep: 358.06 | bwd_inner_microstep: 358.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:42:57,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.34 | bwd_microstep: 268.32 | bwd_inner_microstep: 268.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:42:57,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.97 | bwd_microstep: 257.76 | bwd_inner_microstep: 257.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:42:58,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.88 | bwd_microstep: 250.73 | bwd_inner_microstep: 250.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:42:58,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.59 | bwd_microstep: 259.52 | bwd_inner_microstep: 259.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 17:42:59,117] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.29 | bwd_microstep: 246.16 | bwd_inner_microstep: 246.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:42:59,552] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.62 | bwd_microstep: 247.50 | bwd_inner_microstep: 247.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:42:59,992] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 185.04 | bwd_microstep: 246.17 | bwd_inner_microstep: 246.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:43:00,421] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.75 | bwd_microstep: 244.47 | bwd_inner_microstep: 244.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:43:00,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.05 | bwd_microstep: 246.85 | bwd_inner_microstep: 246.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:01,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.45 | bwd_microstep: 255.52 | bwd_inner_microstep: 255.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 17:43:01,726] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.53 | bwd_microstep: 243.47 | bwd_inner_microstep: 243.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:43:02,159] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.17 | bwd_microstep: 243.66 | bwd_inner_microstep: 243.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:43:02,600] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.36 | bwd_microstep: 250.89 | bwd_inner_microstep: 250.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:03,054] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.59 | optimizer_gradients: 0.78 | optimizer_step: 3.42 +[2024-12-31 17:43:03,056] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.76 | bwd_microstep: 258.10 | bwd_inner_microstep: 244.45 | bwd_allreduce_microstep: 13.54 | step_microstep: 12.03 +[2024-12-31 17:43:03,058] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2869.13 | bwd: 4225.22 | bwd_inner: 4210.72 | bwd_allreduce: 13.82 | step: 14.67 + 29%|██▉ | 221/759 [32:58<1:07:01, 7.48s/it] {'loss': 1.2867, 'learning_rate': 1.6636140528212427e-05, 'epoch': 0.29} + 29%|██▉ | 221/759 [32:58<1:07:01, 7.48s/it][2024-12-31 17:43:03,627] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.40 | bwd_microstep: 340.63 | bwd_inner_microstep: 340.25 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.25 +[2024-12-31 17:43:04,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 200.88 | bwd_microstep: 310.00 | bwd_inner_microstep: 309.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 17:43:04,618] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.32 | bwd_microstep: 266.97 | bwd_inner_microstep: 266.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:05,059] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.63 | bwd_microstep: 250.57 | bwd_inner_microstep: 250.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:43:05,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.76 | bwd_microstep: 255.23 | bwd_inner_microstep: 255.04 | bwd_allreduce_microstep: 0.09 | step_microstep: 0.21 +[2024-12-31 17:43:05,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.80 | bwd_microstep: 262.45 | bwd_inner_microstep: 262.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:43:06,400] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.40 | bwd_microstep: 248.67 | bwd_inner_microstep: 248.35 | bwd_allreduce_microstep: 0.23 | step_microstep: 0.29 +[2024-12-31 17:43:06,838] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.15 | bwd_microstep: 247.58 | bwd_inner_microstep: 247.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:43:07,277] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.70 | bwd_microstep: 246.40 | bwd_inner_microstep: 246.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:07,715] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.66 | bwd_microstep: 244.94 | bwd_inner_microstep: 244.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:08,153] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.63 | bwd_microstep: 246.48 | bwd_inner_microstep: 246.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:08,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.62 | bwd_microstep: 243.47 | bwd_inner_microstep: 243.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:09,017] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.59 | bwd_microstep: 242.98 | bwd_inner_microstep: 242.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:09,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.18 | bwd_microstep: 241.08 | bwd_inner_microstep: 241.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:09,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.64 | bwd_microstep: 243.34 | bwd_inner_microstep: 243.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:43:10,334] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.63 | optimizer_gradients: 0.73 | optimizer_step: 3.39 +[2024-12-31 17:43:10,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.04 | bwd_microstep: 261.42 | bwd_inner_microstep: 241.71 | bwd_allreduce_microstep: 19.60 | step_microstep: 11.23 +[2024-12-31 17:43:10,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2826.78 | bwd: 4152.44 | bwd_inner: 4131.26 | bwd_allreduce: 20.26 | step: 14.42 + 29%|██▉ | 222/759 [33:05<1:06:22, 7.42s/it] {'loss': 1.2647, 'learning_rate': 1.6604148821310912e-05, 'epoch': 0.29} + 29%|██▉ | 222/759 [33:05<1:06:22, 7.42s/it][2024-12-31 17:43:10,951] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 229.56 | bwd_microstep: 370.59 | bwd_inner_microstep: 370.22 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 17:43:11,562] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 241.66 | bwd_microstep: 352.81 | bwd_inner_microstep: 352.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:12,027] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.02 | bwd_microstep: 266.33 | bwd_inner_microstep: 266.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:43:12,480] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.45 | bwd_microstep: 256.12 | bwd_inner_microstep: 256.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:12,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.90 | bwd_microstep: 254.96 | bwd_inner_microstep: 254.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:43:13,366] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.13 | bwd_microstep: 246.90 | bwd_inner_microstep: 246.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:13,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.66 | bwd_microstep: 249.96 | bwd_inner_microstep: 249.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:14,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.97 | bwd_microstep: 247.66 | bwd_inner_microstep: 247.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:43:14,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.22 | bwd_microstep: 246.54 | bwd_inner_microstep: 246.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:15,120] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.14 | bwd_microstep: 248.48 | bwd_inner_microstep: 248.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:43:15,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 246.32 | bwd_inner_microstep: 246.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:15,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.70 | bwd_microstep: 244.58 | bwd_inner_microstep: 244.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:16,416] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 243.59 | bwd_inner_microstep: 243.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:16,847] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.97 | bwd_microstep: 243.63 | bwd_inner_microstep: 243.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:17,280] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.37 | bwd_microstep: 244.56 | bwd_inner_microstep: 244.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:43:17,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.03 | optimizer_gradients: 0.72 | optimizer_step: 3.28 +[2024-12-31 17:43:17,763] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.86 | bwd_microstep: 256.93 | bwd_inner_microstep: 243.27 | bwd_allreduce_microstep: 13.52 | step_microstep: 24.87 +[2024-12-31 17:43:17,766] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2903.57 | bwd: 4220.05 | bwd_inner: 4205.54 | bwd_allreduce: 13.80 | step: 28.01 + 29%|██▉ | 223/759 [33:12<1:06:17, 7.42s/it] {'loss': 1.2414, 'learning_rate': 1.6572036788179728e-05, 'epoch': 0.29} + 29%|██▉ | 223/759 [33:12<1:06:17, 7.42s/it][2024-12-31 17:43:18,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 249.65 | bwd_microstep: 410.66 | bwd_inner_microstep: 410.31 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.22 +[2024-12-31 17:43:19,023] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 216.58 | bwd_microstep: 346.94 | bwd_inner_microstep: 346.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:19,489] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.84 | bwd_microstep: 267.76 | bwd_inner_microstep: 267.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:43:19,950] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.31 | bwd_microstep: 262.75 | bwd_inner_microstep: 262.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 17:43:20,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.11 | bwd_microstep: 256.60 | bwd_inner_microstep: 256.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:20,855] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.56 | bwd_microstep: 248.78 | bwd_inner_microstep: 248.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:21,289] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.91 | bwd_microstep: 245.45 | bwd_inner_microstep: 245.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:21,723] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 245.88 | bwd_inner_microstep: 245.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:22,160] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.93 | bwd_microstep: 246.67 | bwd_inner_microstep: 246.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 17:43:22,604] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.97 | bwd_microstep: 256.64 | bwd_inner_microstep: 256.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 17:43:23,033] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.86 | bwd_microstep: 243.91 | bwd_inner_microstep: 243.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:23,465] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.25 | bwd_microstep: 243.87 | bwd_inner_microstep: 243.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:23,916] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.82 | bwd_microstep: 264.41 | bwd_inner_microstep: 264.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:43:24,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.84 | bwd_microstep: 243.24 | bwd_inner_microstep: 243.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:24,773] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.14 | bwd_microstep: 241.60 | bwd_inner_microstep: 241.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:25,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.80 | optimizer_gradients: 0.82 | optimizer_step: 3.45 +[2024-12-31 17:43:25,229] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.45 | bwd_microstep: 257.39 | bwd_inner_microstep: 243.64 | bwd_allreduce_microstep: 13.63 | step_microstep: 11.88 +[2024-12-31 17:43:25,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2891.18 | bwd: 4282.63 | bwd_inner: 4268.12 | bwd_allreduce: 13.92 | step: 14.90 + 30%|██▉ | 224/759 [33:20<1:06:17, 7.43s/it] {'loss': 1.2572, 'learning_rate': 1.6539805013893493e-05, 'epoch': 0.3} + 30%|██▉ | 224/759 [33:20<1:06:17, 7.43s/it][2024-12-31 17:43:25,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 237.34 | bwd_microstep: 387.29 | bwd_inner_microstep: 386.94 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 17:43:26,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 205.24 | bwd_microstep: 321.50 | bwd_inner_microstep: 321.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:43:26,919] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.32 | bwd_microstep: 293.10 | bwd_inner_microstep: 293.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.29 +[2024-12-31 17:43:27,419] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.06 | bwd_microstep: 289.22 | bwd_inner_microstep: 289.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:27,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 186.07 | bwd_microstep: 271.35 | bwd_inner_microstep: 271.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:28,373] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.98 | bwd_microstep: 268.23 | bwd_inner_microstep: 268.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 17:43:28,822] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.66 | bwd_microstep: 253.07 | bwd_inner_microstep: 253.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 4.73 +[2024-12-31 17:43:29,257] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.98 | bwd_microstep: 246.82 | bwd_inner_microstep: 246.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:43:29,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.17 | bwd_microstep: 247.19 | bwd_inner_microstep: 247.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:30,127] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 245.70 | bwd_inner_microstep: 245.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:43:30,562] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.29 | bwd_microstep: 251.41 | bwd_inner_microstep: 251.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:30,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.02 | bwd_microstep: 244.80 | bwd_inner_microstep: 244.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:43:31,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.47 | bwd_microstep: 243.29 | bwd_inner_microstep: 243.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:31,855] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.56 | bwd_microstep: 243.12 | bwd_inner_microstep: 243.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:32,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.40 | bwd_microstep: 261.45 | bwd_inner_microstep: 261.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 17:43:32,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.25 | optimizer_gradients: 0.63 | optimizer_step: 3.10 +[2024-12-31 17:43:32,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.29 | bwd_microstep: 303.02 | bwd_inner_microstep: 242.95 | bwd_allreduce_microstep: 60.03 | step_microstep: 11.20 +[2024-12-31 17:43:32,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2907.90 | bwd: 4370.66 | bwd_inner: 4309.76 | bwd_allreduce: 60.28 | step: 18.87 + 30%|██▉ | 225/759 [33:27<1:06:31, 7.47s/it] {'loss': 1.195, 'learning_rate': 1.650745408570849e-05, 'epoch': 0.3} + 30%|██▉ | 225/759 [33:27<1:06:31, 7.47s/it][2024-12-31 17:43:33,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 237.67 | bwd_microstep: 387.15 | bwd_inner_microstep: 386.78 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 17:43:33,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.49 | bwd_microstep: 288.30 | bwd_inner_microstep: 287.98 | bwd_allreduce_microstep: 0.21 | step_microstep: 0.28 +[2024-12-31 17:43:34,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.74 | bwd_microstep: 267.57 | bwd_inner_microstep: 267.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:34,880] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.50 | bwd_microstep: 268.34 | bwd_inner_microstep: 268.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:35,330] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.74 | bwd_microstep: 257.16 | bwd_inner_microstep: 257.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:43:35,771] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.68 | bwd_microstep: 250.44 | bwd_inner_microstep: 250.06 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.21 +[2024-12-31 17:43:36,209] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.83 | bwd_microstep: 247.81 | bwd_inner_microstep: 247.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 17:43:36,661] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 186.91 | bwd_microstep: 246.66 | bwd_inner_microstep: 246.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:43:37,095] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.06 | bwd_microstep: 245.90 | bwd_inner_microstep: 245.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:37,528] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.47 | bwd_microstep: 245.48 | bwd_inner_microstep: 245.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:37,961] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.34 | bwd_microstep: 245.26 | bwd_inner_microstep: 245.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:38,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.19 | bwd_microstep: 244.85 | bwd_inner_microstep: 244.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:38,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.49 | bwd_microstep: 244.05 | bwd_inner_microstep: 244.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:43:39,252] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.14 | bwd_microstep: 244.39 | bwd_inner_microstep: 244.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:39,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.53 | bwd_microstep: 244.32 | bwd_inner_microstep: 244.19 | bwd_allreduce_microstep: 0.02 | step_microstep: 0.22 +[2024-12-31 17:43:40,222] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 16.56 | optimizer_gradients: 0.84 | optimizer_step: 4.76 +[2024-12-31 17:43:40,222] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.15 | bwd_microstep: 327.09 | bwd_inner_microstep: 242.45 | bwd_allreduce_microstep: 84.59 | step_microstep: 25.24 +[2024-12-31 17:43:40,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2855.36 | bwd: 4254.94 | bwd_inner: 4168.76 | bwd_allreduce: 85.30 | step: 28.28 + 30%|██▉ | 226/759 [33:35<1:06:15, 7.46s/it] {'loss': 1.2229, 'learning_rate': 1.6474984593051965e-05, 'epoch': 0.3} + 30%|██▉ | 226/759 [33:35<1:06:15, 7.46s/it][2024-12-31 17:43:40,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 218.66 | bwd_microstep: 344.57 | bwd_inner_microstep: 344.21 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 17:43:41,298] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.82 | bwd_microstep: 287.94 | bwd_inner_microstep: 287.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:41,773] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.61 | bwd_microstep: 263.46 | bwd_inner_microstep: 263.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:42,237] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.19 | bwd_microstep: 265.48 | bwd_inner_microstep: 265.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:42,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.07 | bwd_microstep: 256.85 | bwd_inner_microstep: 256.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:43,134] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.36 | bwd_microstep: 249.42 | bwd_inner_microstep: 249.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:43,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.28 | bwd_microstep: 250.36 | bwd_inner_microstep: 250.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:44,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.17 | bwd_microstep: 246.11 | bwd_inner_microstep: 246.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:44,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.26 | bwd_microstep: 253.96 | bwd_inner_microstep: 253.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:43:44,890] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.53 | bwd_microstep: 244.47 | bwd_inner_microstep: 244.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:45,319] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.95 | bwd_microstep: 243.73 | bwd_inner_microstep: 243.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:43:45,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.45 | bwd_microstep: 244.53 | bwd_inner_microstep: 244.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:46,175] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.39 | bwd_microstep: 245.01 | bwd_inner_microstep: 244.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:46,606] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 243.69 | bwd_inner_microstep: 243.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:47,036] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.61 | bwd_microstep: 244.13 | bwd_inner_microstep: 244.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:47,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.55 | optimizer_gradients: 0.60 | optimizer_step: 3.09 +[2024-12-31 17:43:47,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.73 | bwd_microstep: 295.35 | bwd_inner_microstep: 248.26 | bwd_allreduce_microstep: 47.05 | step_microstep: 12.92 +[2024-12-31 17:43:47,534] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2841.28 | bwd: 4179.17 | bwd_inner: 4131.29 | bwd_allreduce: 47.30 | step: 15.68 + 30%|██▉ | 227/759 [33:42<1:05:44, 7.41s/it] {'loss': 1.2675, 'learning_rate': 1.6442397127511366e-05, 'epoch': 0.3} + 30%|██▉ | 227/759 [33:42<1:05:44, 7.41s/it][2024-12-31 17:43:48,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 264.87 | bwd_microstep: 451.44 | bwd_inner_microstep: 451.09 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.21 +[2024-12-31 17:43:48,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 248.88 | bwd_microstep: 395.41 | bwd_inner_microstep: 395.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:43:49,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.09 | bwd_microstep: 285.50 | bwd_inner_microstep: 285.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:49,883] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.33 | bwd_microstep: 266.42 | bwd_inner_microstep: 266.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:50,334] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.82 | bwd_microstep: 255.47 | bwd_inner_microstep: 255.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:50,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.32 | bwd_microstep: 250.93 | bwd_inner_microstep: 250.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:51,210] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.25 | bwd_microstep: 246.42 | bwd_inner_microstep: 246.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:51,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.01 | bwd_microstep: 245.87 | bwd_inner_microstep: 245.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:52,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.19 | bwd_microstep: 246.43 | bwd_inner_microstep: 246.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:52,519] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.07 | bwd_microstep: 244.49 | bwd_inner_microstep: 244.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:52,955] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.98 | bwd_microstep: 248.04 | bwd_inner_microstep: 248.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:53,387] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.29 | bwd_microstep: 243.98 | bwd_inner_microstep: 243.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:43:53,820] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.95 | bwd_microstep: 243.97 | bwd_inner_microstep: 243.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:43:54,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.64 | bwd_microstep: 242.58 | bwd_inner_microstep: 242.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 17:43:54,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.51 | bwd_microstep: 241.47 | bwd_inner_microstep: 241.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 17:43:55,119] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.55 | optimizer_gradients: 1.14 | optimizer_step: 3.27 +[2024-12-31 17:43:55,119] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.56 | bwd_microstep: 254.94 | bwd_inner_microstep: 241.29 | bwd_allreduce_microstep: 13.56 | step_microstep: 10.95 +[2024-12-31 17:43:55,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2951.22 | bwd: 4363.48 | bwd_inner: 4349.04 | bwd_allreduce: 13.83 | step: 13.74 + 30%|███ | 228/759 [33:50<1:06:04, 7.47s/it] {'loss': 1.2349, 'learning_rate': 1.6409692282823604e-05, 'epoch': 0.3} + 30%|███ | 228/759 [33:50<1:06:04, 7.47s/it][2024-12-31 17:43:55,739] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 230.58 | bwd_microstep: 372.35 | bwd_inner_microstep: 372.05 | bwd_allreduce_microstep: 0.11 | step_microstep: 0.20 +[2024-12-31 17:43:56,326] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 221.17 | bwd_microstep: 351.42 | bwd_inner_microstep: 351.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:43:56,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.65 | bwd_microstep: 283.88 | bwd_inner_microstep: 283.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:43:57,277] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.06 | bwd_microstep: 263.45 | bwd_inner_microstep: 263.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.32 +[2024-12-31 17:43:57,779] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.11 | bwd_microstep: 299.50 | bwd_inner_microstep: 299.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:43:58,218] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.97 | bwd_microstep: 248.58 | bwd_inner_microstep: 248.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:43:58,658] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.04 | bwd_microstep: 250.10 | bwd_inner_microstep: 250.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:43:59,096] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.41 | bwd_microstep: 249.40 | bwd_inner_microstep: 249.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 17:43:59,528] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.20 | bwd_microstep: 245.53 | bwd_inner_microstep: 245.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:43:59,959] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.41 | bwd_microstep: 246.42 | bwd_inner_microstep: 246.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:44:00,391] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 244.61 | bwd_inner_microstep: 244.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:00,825] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.72 | bwd_microstep: 245.46 | bwd_inner_microstep: 245.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:44:01,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 243.71 | bwd_inner_microstep: 243.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:44:01,722] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.55 | bwd_microstep: 283.20 | bwd_inner_microstep: 283.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:02,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 244.62 | bwd_inner_microstep: 244.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:44:02,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.92 | optimizer_step: 3.12 +[2024-12-31 17:44:02,643] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.17 | bwd_microstep: 291.92 | bwd_inner_microstep: 242.72 | bwd_allreduce_microstep: 49.16 | step_microstep: 11.36 +[2024-12-31 17:44:02,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2882.55 | bwd: 4364.26 | bwd_inner: 4314.22 | bwd_allreduce: 49.42 | step: 14.37 + 30%|███ | 229/759 [33:57<1:06:05, 7.48s/it] {'loss': 1.2357, 'learning_rate': 1.63768706548642e-05, 'epoch': 0.3} + 30%|███ | 229/759 [33:57<1:06:05, 7.48s/it][2024-12-31 17:44:03,276] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 234.65 | bwd_microstep: 382.97 | bwd_inner_microstep: 382.47 | bwd_allreduce_microstep: 0.24 | step_microstep: 0.29 +[2024-12-31 17:44:03,780] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.09 | bwd_microstep: 291.60 | bwd_inner_microstep: 291.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:04,241] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.61 | bwd_microstep: 263.96 | bwd_inner_microstep: 263.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:44:04,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.84 | bwd_microstep: 268.93 | bwd_inner_microstep: 268.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:44:05,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.62 | bwd_microstep: 255.60 | bwd_inner_microstep: 255.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:44:05,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.57 | bwd_microstep: 246.35 | bwd_inner_microstep: 246.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:44:06,022] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.87 | bwd_microstep: 245.28 | bwd_inner_microstep: 245.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:06,475] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.30 | bwd_microstep: 244.53 | bwd_inner_microstep: 244.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:44:06,911] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.58 | bwd_microstep: 245.33 | bwd_inner_microstep: 245.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:44:07,344] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.00 | bwd_microstep: 244.80 | bwd_inner_microstep: 244.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:44:07,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 207.69 | bwd_microstep: 245.60 | bwd_inner_microstep: 245.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:44:08,246] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.17 | bwd_microstep: 243.68 | bwd_inner_microstep: 243.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:08,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.61 | bwd_microstep: 243.25 | bwd_inner_microstep: 243.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:44:09,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.91 | bwd_microstep: 241.49 | bwd_inner_microstep: 241.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:09,541] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.80 | bwd_microstep: 245.84 | bwd_inner_microstep: 245.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:09,991] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.74 | optimizer_gradients: 0.77 | optimizer_step: 3.40 +[2024-12-31 17:44:09,991] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.20 | bwd_microstep: 255.88 | bwd_inner_microstep: 242.05 | bwd_allreduce_microstep: 13.71 | step_microstep: 11.45 +[2024-12-31 17:44:09,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2889.96 | bwd: 4165.24 | bwd_inner: 4150.40 | bwd_allreduce: 14.12 | step: 14.61 + 30%|███ | 230/759 [34:04<1:05:37, 7.44s/it] {'loss': 1.2307, 'learning_rate': 1.6343932841636455e-05, 'epoch': 0.3} + 30%|███ | 230/759 [34:04<1:05:37, 7.44s/it][2024-12-31 17:44:10,563] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 218.99 | bwd_microstep: 336.25 | bwd_inner_microstep: 335.88 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.26 +[2024-12-31 17:44:11,064] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.29 | bwd_microstep: 292.10 | bwd_inner_microstep: 292.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:44:11,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.53 | bwd_microstep: 269.10 | bwd_inner_microstep: 269.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:44:11,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.00 | bwd_microstep: 264.08 | bwd_inner_microstep: 264.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:44:12,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.73 | bwd_microstep: 254.81 | bwd_inner_microstep: 254.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 17:44:12,880] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.83 | bwd_microstep: 248.67 | bwd_inner_microstep: 248.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:44:13,325] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.84 | bwd_microstep: 255.13 | bwd_inner_microstep: 254.93 | bwd_allreduce_microstep: 0.05 | step_microstep: 0.26 +[2024-12-31 17:44:13,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.20 | bwd_microstep: 245.49 | bwd_inner_microstep: 245.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:14,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.93 | bwd_microstep: 245.20 | bwd_inner_microstep: 245.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:14,623] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.58 | bwd_microstep: 246.13 | bwd_inner_microstep: 246.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:44:15,059] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.21 | bwd_microstep: 248.01 | bwd_inner_microstep: 247.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 17:44:15,489] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.72 | bwd_microstep: 244.38 | bwd_inner_microstep: 244.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:44:15,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.54 | bwd_microstep: 246.05 | bwd_inner_microstep: 246.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:44:16,348] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.91 | bwd_microstep: 243.61 | bwd_inner_microstep: 243.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:44:16,778] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.25 | bwd_microstep: 243.24 | bwd_inner_microstep: 243.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:17,229] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.83 | optimizer_step: 3.49 +[2024-12-31 17:44:17,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.83 | bwd_microstep: 256.65 | bwd_inner_microstep: 242.89 | bwd_allreduce_microstep: 13.65 | step_microstep: 11.42 +[2024-12-31 17:44:17,233] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2829.82 | bwd: 4139.12 | bwd_inner: 4124.24 | bwd_allreduce: 14.05 | step: 14.20 + 30%|███ | 231/759 [34:12<1:04:57, 7.38s/it] {'loss': 1.2892, 'learning_rate': 1.631087944326053e-05, 'epoch': 0.3} + 30%|███ | 231/759 [34:12<1:04:57, 7.38s/it][2024-12-31 17:44:17,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 222.06 | bwd_microstep: 345.19 | bwd_inner_microstep: 344.83 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 17:44:18,320] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.13 | bwd_microstep: 296.53 | bwd_inner_microstep: 296.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:44:18,813] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.48 | bwd_microstep: 286.44 | bwd_inner_microstep: 286.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:19,281] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.54 | bwd_microstep: 269.57 | bwd_inner_microstep: 269.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:44:19,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.80 | bwd_microstep: 264.60 | bwd_inner_microstep: 264.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:44:20,180] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.67 | bwd_microstep: 247.88 | bwd_inner_microstep: 247.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:44:20,618] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.16 | bwd_microstep: 248.64 | bwd_inner_microstep: 248.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:21,055] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.68 | bwd_microstep: 248.55 | bwd_inner_microstep: 248.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:21,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.41 | bwd_microstep: 263.97 | bwd_inner_microstep: 263.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:44:21,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.87 | bwd_microstep: 244.59 | bwd_inner_microstep: 244.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:44:22,378] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.52 | bwd_microstep: 246.25 | bwd_inner_microstep: 246.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:44:22,809] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.55 | bwd_microstep: 244.32 | bwd_inner_microstep: 244.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:23,240] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 243.30 | bwd_inner_microstep: 243.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:44:23,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.69 | bwd_microstep: 244.02 | bwd_inner_microstep: 244.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:44:24,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.50 | bwd_microstep: 243.84 | bwd_inner_microstep: 243.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:44:24,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.91 | optimizer_gradients: 0.74 | optimizer_step: 3.14 +[2024-12-31 17:44:24,656] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.49 | bwd_microstep: 350.87 | bwd_inner_microstep: 244.00 | bwd_allreduce_microstep: 106.81 | step_microstep: 11.90 +[2024-12-31 17:44:24,658] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2852.10 | bwd: 4288.67 | bwd_inner: 4180.95 | bwd_allreduce: 107.10 | step: 14.90 + 31%|███ | 232/759 [34:19<1:04:56, 7.39s/it] {'loss': 1.2696, 'learning_rate': 1.6277711061962525e-05, 'epoch': 0.31} + 31%|███ | 232/759 [34:19<1:04:56, 7.39s/it][2024-12-31 17:44:25,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 199.70 | bwd_microstep: 309.79 | bwd_inner_microstep: 309.44 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 17:44:25,641] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.74 | bwd_microstep: 264.28 | bwd_inner_microstep: 264.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:44:26,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.80 | bwd_microstep: 261.88 | bwd_inner_microstep: 261.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.28 +[2024-12-31 17:44:26,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.96 | bwd_microstep: 263.76 | bwd_inner_microstep: 263.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:44:26,996] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.09 | bwd_microstep: 246.69 | bwd_inner_microstep: 246.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:27,432] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.94 | bwd_microstep: 247.73 | bwd_inner_microstep: 247.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:44:27,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.94 | bwd_microstep: 262.31 | bwd_inner_microstep: 262.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:28,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.77 | bwd_microstep: 252.89 | bwd_inner_microstep: 252.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:44:28,752] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.03 | bwd_microstep: 243.49 | bwd_inner_microstep: 243.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:29,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.39 | bwd_microstep: 247.53 | bwd_inner_microstep: 247.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:44:29,623] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.90 | bwd_microstep: 243.06 | bwd_inner_microstep: 243.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:44:30,057] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.00 | bwd_microstep: 246.63 | bwd_inner_microstep: 246.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:44:30,480] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.92 | bwd_microstep: 243.71 | bwd_inner_microstep: 243.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:30,911] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.20 | bwd_microstep: 243.43 | bwd_inner_microstep: 243.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:31,342] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.49 | bwd_microstep: 246.78 | bwd_inner_microstep: 246.55 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.22 +[2024-12-31 17:44:32,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.67 | optimizer_gradients: 0.78 | optimizer_step: 3.13 +[2024-12-31 17:44:32,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.20 | bwd_microstep: 1154.76 | bwd_inner_microstep: 241.86 | bwd_allreduce_microstep: 912.85 | step_microstep: 10.47 +[2024-12-31 17:44:32,694] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2794.55 | bwd: 4978.95 | bwd_inner: 4064.73 | bwd_allreduce: 913.32 | step: 12.98 + 31%|███ | 233/759 [34:27<1:06:30, 7.59s/it] {'loss': 1.2858, 'learning_rate': 1.6244428302063506e-05, 'epoch': 0.31} + 31%|███ | 233/759 [34:27<1:06:30, 7.59s/it][2024-12-31 17:44:33,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 224.76 | bwd_microstep: 358.66 | bwd_inner_microstep: 358.28 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.19 +[2024-12-31 17:44:33,790] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.85 | bwd_microstep: 289.01 | bwd_inner_microstep: 288.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:34,276] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.28 | bwd_microstep: 281.70 | bwd_inner_microstep: 281.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:44:34,723] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.94 | bwd_microstep: 254.70 | bwd_inner_microstep: 254.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:44:35,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 246.03 | bwd_inner_microstep: 246.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:44:35,587] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.65 | bwd_microstep: 246.03 | bwd_inner_microstep: 246.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:44:36,024] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.75 | bwd_microstep: 244.87 | bwd_inner_microstep: 244.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:44:36,463] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.27 | bwd_microstep: 249.52 | bwd_inner_microstep: 249.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:44:36,898] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.33 | bwd_microstep: 246.18 | bwd_inner_microstep: 246.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:37,329] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 243.67 | bwd_inner_microstep: 243.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:37,763] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.76 | bwd_microstep: 245.58 | bwd_inner_microstep: 245.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:38,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.93 | bwd_microstep: 253.97 | bwd_inner_microstep: 253.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:44:38,635] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 244.02 | bwd_inner_microstep: 243.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:39,066] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.26 | bwd_microstep: 244.26 | bwd_inner_microstep: 244.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:44:39,503] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.85 | bwd_microstep: 249.22 | bwd_inner_microstep: 249.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:44:40,242] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.58 | optimizer_gradients: 0.84 | optimizer_step: 4.38 +[2024-12-31 17:44:40,243] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.87 | bwd_microstep: 542.91 | bwd_inner_microstep: 242.01 | bwd_allreduce_microstep: 300.85 | step_microstep: 12.19 +[2024-12-31 17:44:40,244] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2823.91 | bwd: 4440.49 | bwd_inner: 4138.71 | bwd_allreduce: 301.12 | step: 15.11 + 31%|███ | 234/759 [34:35<1:06:17, 7.58s/it] {'loss': 1.2654, 'learning_rate': 1.6211031769968503e-05, 'epoch': 0.31} + 31%|███ | 234/759 [34:35<1:06:17, 7.58s/it][2024-12-31 17:44:40,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 212.86 | bwd_microstep: 345.73 | bwd_inner_microstep: 345.38 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 17:44:41,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.16 | bwd_microstep: 286.05 | bwd_inner_microstep: 286.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:44:41,795] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.89 | bwd_microstep: 268.22 | bwd_inner_microstep: 268.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:44:42,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.78 | bwd_microstep: 266.88 | bwd_inner_microstep: 266.36 | bwd_allreduce_microstep: 0.23 | step_microstep: 0.27 +[2024-12-31 17:44:42,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.12 | bwd_microstep: 261.93 | bwd_inner_microstep: 261.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:44:43,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.23 | bwd_microstep: 248.19 | bwd_inner_microstep: 248.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:44:43,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.76 | bwd_microstep: 248.50 | bwd_inner_microstep: 248.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:44:44,047] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.10 | bwd_microstep: 248.38 | bwd_inner_microstep: 248.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.11 +[2024-12-31 17:44:44,478] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.28 | bwd_microstep: 243.60 | bwd_inner_microstep: 243.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:44,911] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.81 | bwd_microstep: 244.35 | bwd_inner_microstep: 244.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:45,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.85 | bwd_microstep: 256.14 | bwd_inner_microstep: 256.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:45,785] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.98 | bwd_microstep: 243.73 | bwd_inner_microstep: 243.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:44:46,219] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.81 | bwd_microstep: 244.87 | bwd_inner_microstep: 244.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:46,648] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.12 | bwd_microstep: 241.58 | bwd_inner_microstep: 241.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:44:47,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.95 | bwd_microstep: 240.60 | bwd_inner_microstep: 240.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:44:47,548] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.82 | optimizer_gradients: 0.67 | optimizer_step: 3.45 +[2024-12-31 17:44:47,548] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.34 | bwd_microstep: 270.64 | bwd_inner_microstep: 241.93 | bwd_allreduce_microstep: 28.63 | step_microstep: 12.23 +[2024-12-31 17:44:47,550] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2833.33 | bwd: 4159.54 | bwd_inner: 4129.63 | bwd_allreduce: 29.12 | step: 15.33 + 31%|███ | 235/759 [34:42<1:05:27, 7.49s/it] {'loss': 1.274, 'learning_rate': 1.6177522074155436e-05, 'epoch': 0.31} + 31%|███ | 235/759 [34:42<1:05:27, 7.49s/it][2024-12-31 17:44:48,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.61 | bwd_microstep: 311.84 | bwd_inner_microstep: 311.47 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 17:44:48,581] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.62 | bwd_microstep: 287.59 | bwd_inner_microstep: 287.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 17:44:49,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.72 | bwd_microstep: 265.52 | bwd_inner_microstep: 265.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:49,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.29 | bwd_microstep: 277.71 | bwd_inner_microstep: 277.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:44:49,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.89 | bwd_microstep: 255.70 | bwd_inner_microstep: 255.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:44:50,421] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.12 | bwd_microstep: 247.54 | bwd_inner_microstep: 247.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:44:50,860] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.42 | bwd_microstep: 250.03 | bwd_inner_microstep: 250.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:51,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 245.75 | bwd_inner_microstep: 245.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:51,740] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.49 | bwd_microstep: 244.67 | bwd_inner_microstep: 244.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:44:52,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.39 | bwd_microstep: 244.33 | bwd_inner_microstep: 244.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:44:52,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.78 | bwd_microstep: 245.76 | bwd_inner_microstep: 245.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:44:53,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.61 | bwd_microstep: 247.99 | bwd_inner_microstep: 247.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 17:44:53,469] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.78 | bwd_microstep: 242.72 | bwd_inner_microstep: 242.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:44:53,910] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 253.50 | bwd_inner_microstep: 253.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:44:54,336] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.57 | bwd_microstep: 243.24 | bwd_inner_microstep: 243.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:44:54,794] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.89 | optimizer_gradients: 0.82 | optimizer_step: 3.45 +[2024-12-31 17:44:54,795] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.59 | bwd_microstep: 259.08 | bwd_inner_microstep: 245.38 | bwd_allreduce_microstep: 13.58 | step_microstep: 12.41 +[2024-12-31 17:44:54,798] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2817.20 | bwd: 4123.09 | bwd_inner: 4108.49 | bwd_allreduce: 13.93 | step: 15.25 + 31%|███ | 236/759 [34:49<1:04:41, 7.42s/it] {'loss': 1.255, 'learning_rate': 1.6143899825164058e-05, 'epoch': 0.31} + 31%|███ | 236/759 [34:49<1:04:41, 7.42s/it][2024-12-31 17:44:55,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 239.87 | bwd_microstep: 393.81 | bwd_inner_microstep: 393.44 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 17:44:56,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 233.52 | bwd_microstep: 380.36 | bwd_inner_microstep: 380.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 17:44:56,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.73 | bwd_microstep: 291.34 | bwd_inner_microstep: 291.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:57,029] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.82 | bwd_microstep: 255.89 | bwd_inner_microstep: 255.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 17:44:57,488] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.29 | bwd_microstep: 262.34 | bwd_inner_microstep: 262.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:44:57,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.54 | bwd_microstep: 249.92 | bwd_inner_microstep: 249.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:44:58,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.80 | bwd_microstep: 246.18 | bwd_inner_microstep: 246.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:58,799] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.13 | bwd_microstep: 247.63 | bwd_inner_microstep: 247.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:44:59,239] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.88 | bwd_microstep: 251.25 | bwd_inner_microstep: 251.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:44:59,674] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.32 | bwd_microstep: 246.11 | bwd_inner_microstep: 246.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:00,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.68 | bwd_microstep: 243.83 | bwd_inner_microstep: 243.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:00,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.63 | bwd_microstep: 242.87 | bwd_inner_microstep: 242.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:00,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 243.74 | bwd_inner_microstep: 243.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:01,398] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.76 | bwd_microstep: 244.15 | bwd_inner_microstep: 244.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:01,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.61 | bwd_microstep: 241.06 | bwd_inner_microstep: 241.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:02,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.79 | optimizer_gradients: 0.71 | optimizer_step: 3.31 +[2024-12-31 17:45:02,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.89 | bwd_microstep: 254.70 | bwd_inner_microstep: 241.00 | bwd_allreduce_microstep: 13.57 | step_microstep: 11.17 +[2024-12-31 17:45:02,276] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2892.78 | bwd: 4295.32 | bwd_inner: 4280.76 | bwd_allreduce: 13.88 | step: 14.23 + 31%|███ | 237/759 [34:57<1:04:42, 7.44s/it] {'loss': 1.2317, 'learning_rate': 1.6110165635584807e-05, 'epoch': 0.31} + 31%|███ | 237/759 [34:57<1:04:42, 7.44s/it][2024-12-31 17:45:02,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 228.96 | bwd_microstep: 343.49 | bwd_inner_microstep: 343.15 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 17:45:03,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.91 | bwd_microstep: 289.28 | bwd_inner_microstep: 289.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:45:03,830] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.79 | bwd_microstep: 266.34 | bwd_inner_microstep: 266.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:45:04,279] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.94 | bwd_microstep: 255.77 | bwd_inner_microstep: 255.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:45:04,728] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.00 | bwd_microstep: 255.26 | bwd_inner_microstep: 255.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:45:05,176] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.06 | bwd_microstep: 255.83 | bwd_inner_microstep: 255.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:05,624] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.29 | bwd_microstep: 254.86 | bwd_inner_microstep: 254.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:06,060] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.60 | bwd_microstep: 247.63 | bwd_inner_microstep: 247.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:06,500] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.46 | bwd_microstep: 252.07 | bwd_inner_microstep: 252.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:06,933] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 245.86 | bwd_inner_microstep: 245.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:07,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.78 | bwd_microstep: 244.06 | bwd_inner_microstep: 244.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 17:45:07,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.82 | bwd_microstep: 243.42 | bwd_inner_microstep: 243.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:08,227] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.03 | bwd_microstep: 244.59 | bwd_inner_microstep: 244.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:08,657] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.40 | bwd_microstep: 242.98 | bwd_inner_microstep: 242.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:09,095] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.09 | bwd_microstep: 243.64 | bwd_inner_microstep: 243.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:45:09,564] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 11.71 | optimizer_gradients: 0.67 | optimizer_step: 4.87 +[2024-12-31 17:45:09,565] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.77 | bwd_microstep: 255.22 | bwd_inner_microstep: 241.59 | bwd_allreduce_microstep: 13.53 | step_microstep: 31.70 +[2024-12-31 17:45:09,566] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2840.84 | bwd: 4140.37 | bwd_inner: 4126.00 | bwd_allreduce: 13.80 | step: 34.55 + 31%|███▏ | 238/759 [35:04<1:04:12, 7.39s/it] {'loss': 1.2737, 'learning_rate': 1.6076320120047667e-05, 'epoch': 0.31} + 31%|███▏ | 238/759 [35:04<1:04:12, 7.39s/it][2024-12-31 17:45:10,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.37 | bwd_microstep: 339.82 | bwd_inner_microstep: 339.49 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.22 +[2024-12-31 17:45:10,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.91 | bwd_microstep: 290.34 | bwd_inner_microstep: 290.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:11,106] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.13 | bwd_microstep: 266.82 | bwd_inner_microstep: 266.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:11,565] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.30 | bwd_microstep: 262.52 | bwd_inner_microstep: 262.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:12,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.56 | bwd_microstep: 255.21 | bwd_inner_microstep: 255.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:12,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.75 | bwd_microstep: 245.61 | bwd_inner_microstep: 245.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:12,890] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.36 | bwd_microstep: 245.38 | bwd_inner_microstep: 245.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:13,325] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.44 | bwd_microstep: 245.40 | bwd_inner_microstep: 245.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:45:13,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.97 | bwd_microstep: 245.09 | bwd_inner_microstep: 245.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:45:14,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.52 | bwd_microstep: 242.91 | bwd_inner_microstep: 242.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:14,648] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 245.69 | bwd_inner_microstep: 245.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:15,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.91 | bwd_microstep: 243.60 | bwd_inner_microstep: 243.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:15,512] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.08 | bwd_microstep: 245.10 | bwd_inner_microstep: 245.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:15,940] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.17 | bwd_microstep: 241.30 | bwd_inner_microstep: 241.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:16,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.13 | bwd_microstep: 241.08 | bwd_inner_microstep: 241.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:45:16,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.10 | optimizer_gradients: 0.75 | optimizer_step: 3.26 +[2024-12-31 17:45:16,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.59 | bwd_microstep: 253.88 | bwd_inner_microstep: 240.28 | bwd_allreduce_microstep: 13.51 | step_microstep: 11.73 +[2024-12-31 17:45:16,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2839.80 | bwd: 4109.91 | bwd_inner: 4095.49 | bwd_allreduce: 13.78 | step: 14.84 + 31%|███▏ | 239/759 [35:11<1:03:42, 7.35s/it] {'loss': 1.2599, 'learning_rate': 1.6042363895210948e-05, 'epoch': 0.31} + 31%|███▏ | 239/759 [35:11<1:03:42, 7.35s/it][2024-12-31 17:45:17,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 227.72 | bwd_microstep: 353.00 | bwd_inner_microstep: 352.65 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 17:45:18,092] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 247.14 | bwd_microstep: 414.76 | bwd_inner_microstep: 414.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.11 +[2024-12-31 17:45:18,583] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.85 | bwd_microstep: 283.58 | bwd_inner_microstep: 283.25 | bwd_allreduce_microstep: 0.23 | step_microstep: 0.26 +[2024-12-31 17:45:19,064] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.93 | bwd_microstep: 280.42 | bwd_inner_microstep: 280.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:45:19,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.56 | bwd_microstep: 250.13 | bwd_inner_microstep: 250.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:45:19,954] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.88 | bwd_microstep: 254.53 | bwd_inner_microstep: 254.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:20,397] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.53 | bwd_microstep: 253.97 | bwd_inner_microstep: 253.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:20,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.42 | bwd_microstep: 245.77 | bwd_inner_microstep: 245.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:21,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 245.73 | bwd_inner_microstep: 245.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:45:21,699] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.95 | bwd_microstep: 246.28 | bwd_inner_microstep: 246.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:22,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.48 | bwd_microstep: 262.29 | bwd_inner_microstep: 262.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:22,582] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 244.86 | bwd_inner_microstep: 244.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:23,012] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.62 | bwd_microstep: 243.35 | bwd_inner_microstep: 243.17 | bwd_allreduce_microstep: 0.02 | step_microstep: 0.27 +[2024-12-31 17:45:23,443] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.08 | bwd_microstep: 244.21 | bwd_inner_microstep: 244.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:45:23,873] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.58 | bwd_microstep: 243.00 | bwd_inner_microstep: 242.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:45:24,321] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.55 | optimizer_gradients: 0.76 | optimizer_step: 3.37 +[2024-12-31 17:45:24,322] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.92 | bwd_microstep: 255.13 | bwd_inner_microstep: 241.41 | bwd_allreduce_microstep: 13.61 | step_microstep: 11.23 +[2024-12-31 17:45:24,325] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2897.71 | bwd: 4321.23 | bwd_inner: 4306.12 | bwd_allreduce: 14.21 | step: 14.44 + 32%|███▏ | 240/759 [35:19<1:03:59, 7.40s/it] {'loss': 1.2414, 'learning_rate': 1.6008297579750063e-05, 'epoch': 0.32} + 32%|███▏ | 240/759 [35:19<1:03:59, 7.40s/it][2024-12-31 17:45:24,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 227.09 | bwd_microstep: 365.78 | bwd_inner_microstep: 365.41 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 17:45:25,567] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 229.54 | bwd_microstep: 387.48 | bwd_inner_microstep: 387.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:26,031] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.46 | bwd_microstep: 266.06 | bwd_inner_microstep: 266.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:26,482] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.70 | bwd_microstep: 257.10 | bwd_inner_microstep: 257.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:26,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.54 | bwd_microstep: 250.69 | bwd_inner_microstep: 250.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:27,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.06 | bwd_microstep: 263.09 | bwd_inner_microstep: 263.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:45:27,813] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.90 | bwd_microstep: 246.10 | bwd_inner_microstep: 246.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:28,250] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.14 | bwd_microstep: 246.09 | bwd_inner_microstep: 246.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:28,688] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.19 | bwd_microstep: 245.45 | bwd_inner_microstep: 245.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:29,126] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.92 | bwd_microstep: 249.01 | bwd_inner_microstep: 248.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:29,565] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.54 | bwd_microstep: 244.55 | bwd_inner_microstep: 244.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:29,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 243.92 | bwd_inner_microstep: 243.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:30,432] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.89 | bwd_microstep: 247.89 | bwd_inner_microstep: 247.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:45:30,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 244.99 | bwd_inner_microstep: 244.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:45:31,298] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.51 | bwd_microstep: 245.07 | bwd_inner_microstep: 245.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.29 +[2024-12-31 17:45:31,782] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 8.09 | optimizer_gradients: 6.62 | optimizer_step: 8.50 +[2024-12-31 17:45:31,783] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.25 | bwd_microstep: 260.62 | bwd_inner_microstep: 244.55 | bwd_allreduce_microstep: 15.94 | step_microstep: 25.33 +[2024-12-31 17:45:31,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2877.32 | bwd: 4264.08 | bwd_inner: 4247.11 | bwd_allreduce: 16.24 | step: 28.46 + 32%|███▏ | 241/759 [35:26<1:04:01, 7.42s/it] {'loss': 1.2389, 'learning_rate': 1.597412179434626e-05, 'epoch': 0.32} + 32%|███▏ | 241/759 [35:26<1:04:01, 7.42s/it][2024-12-31 17:45:32,384] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 225.98 | bwd_microstep: 360.87 | bwd_inner_microstep: 360.51 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 17:45:33,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 228.79 | bwd_microstep: 372.64 | bwd_inner_microstep: 372.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:33,463] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.57 | bwd_microstep: 262.95 | bwd_inner_microstep: 262.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 17:45:33,910] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.69 | bwd_microstep: 255.23 | bwd_inner_microstep: 255.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:34,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.36 | bwd_microstep: 256.61 | bwd_inner_microstep: 256.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:34,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.27 | bwd_microstep: 245.91 | bwd_inner_microstep: 245.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:35,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.67 | bwd_microstep: 248.21 | bwd_inner_microstep: 248.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:35,669] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.81 | bwd_microstep: 252.71 | bwd_inner_microstep: 252.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:36,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.68 | bwd_microstep: 246.32 | bwd_inner_microstep: 246.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:36,543] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.28 | bwd_microstep: 243.82 | bwd_inner_microstep: 243.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:36,976] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.05 | bwd_microstep: 244.92 | bwd_inner_microstep: 244.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:37,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.79 | bwd_microstep: 243.34 | bwd_inner_microstep: 243.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:37,849] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.81 | bwd_microstep: 252.86 | bwd_inner_microstep: 252.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:38,278] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.22 | bwd_microstep: 241.99 | bwd_inner_microstep: 241.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:38,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.94 | bwd_microstep: 241.39 | bwd_inner_microstep: 241.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:39,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.60 | optimizer_gradients: 0.76 | optimizer_step: 3.36 +[2024-12-31 17:45:39,156] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.38 | bwd_microstep: 255.66 | bwd_inner_microstep: 241.93 | bwd_allreduce_microstep: 13.62 | step_microstep: 11.46 +[2024-12-31 17:45:39,159] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2853.72 | bwd: 4225.63 | bwd_inner: 4210.96 | bwd_allreduce: 13.94 | step: 14.48 + 32%|███▏ | 242/759 [35:34<1:03:47, 7.40s/it] {'loss': 1.2738, 'learning_rate': 1.5939837161675297e-05, 'epoch': 0.32} + 32%|███▏ | 242/759 [35:34<1:03:47, 7.40s/it][2024-12-31 17:45:39,894] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 268.92 | bwd_microstep: 451.11 | bwd_inner_microstep: 450.57 | bwd_allreduce_microstep: 0.21 | step_microstep: 0.30 +[2024-12-31 17:45:40,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 196.67 | bwd_microstep: 298.83 | bwd_inner_microstep: 298.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:40,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.69 | bwd_microstep: 270.59 | bwd_inner_microstep: 270.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:45:41,339] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.11 | bwd_microstep: 267.57 | bwd_inner_microstep: 267.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:41,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.00 | bwd_microstep: 269.04 | bwd_inner_microstep: 269.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:42,255] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.46 | bwd_microstep: 255.88 | bwd_inner_microstep: 255.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:42,735] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.29 | bwd_microstep: 289.38 | bwd_inner_microstep: 289.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:45:43,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.04 | bwd_microstep: 247.41 | bwd_inner_microstep: 247.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:43,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.85 | bwd_microstep: 246.63 | bwd_inner_microstep: 246.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:45:44,048] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 246.38 | bwd_inner_microstep: 246.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:44,482] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.28 | bwd_microstep: 243.98 | bwd_inner_microstep: 243.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.28 +[2024-12-31 17:45:44,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.99 | bwd_microstep: 244.10 | bwd_inner_microstep: 244.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:45:45,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.00 | bwd_microstep: 243.74 | bwd_inner_microstep: 243.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:45,792] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.92 | bwd_microstep: 245.93 | bwd_inner_microstep: 245.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:45:46,223] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.43 | bwd_microstep: 243.47 | bwd_inner_microstep: 243.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:46,675] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.59 | optimizer_gradients: 0.98 | optimizer_step: 3.54 +[2024-12-31 17:45:46,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.88 | bwd_microstep: 255.63 | bwd_inner_microstep: 241.59 | bwd_allreduce_microstep: 13.92 | step_microstep: 12.52 +[2024-12-31 17:45:46,678] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2904.93 | bwd: 4319.83 | bwd_inner: 4304.79 | bwd_allreduce: 14.28 | step: 15.59 + 32%|███▏ | 243/759 [35:41<1:03:58, 7.44s/it] {'loss': 1.2611, 'learning_rate': 1.590544430639611e-05, 'epoch': 0.32} + 32%|███▏ | 243/759 [35:41<1:03:58, 7.44s/it][2024-12-31 17:45:47,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.22 | bwd_microstep: 316.14 | bwd_inner_microstep: 315.81 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 17:45:47,716] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.30 | bwd_microstep: 292.54 | bwd_inner_microstep: 292.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:48,208] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.91 | bwd_microstep: 283.32 | bwd_inner_microstep: 283.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:45:48,675] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.91 | bwd_microstep: 268.46 | bwd_inner_microstep: 268.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.81 +[2024-12-31 17:45:49,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.56 | bwd_microstep: 249.17 | bwd_inner_microstep: 249.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:45:49,552] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.21 | bwd_microstep: 248.88 | bwd_inner_microstep: 248.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:49,990] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.98 | bwd_microstep: 248.34 | bwd_inner_microstep: 248.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:45:50,424] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.40 | bwd_microstep: 244.37 | bwd_inner_microstep: 244.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:50,856] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.57 | bwd_microstep: 245.05 | bwd_inner_microstep: 245.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:45:51,287] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.73 | bwd_microstep: 243.89 | bwd_inner_microstep: 243.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:51,718] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 243.65 | bwd_inner_microstep: 243.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:52,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 253.92 | bwd_inner_microstep: 253.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:45:52,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.78 | bwd_microstep: 242.82 | bwd_inner_microstep: 242.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:53,023] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.25 | bwd_microstep: 243.50 | bwd_inner_microstep: 243.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:53,451] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.48 | bwd_microstep: 241.14 | bwd_inner_microstep: 241.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:45:54,051] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.25 | optimizer_gradients: 0.63 | optimizer_step: 3.10 +[2024-12-31 17:45:54,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.62 | bwd_microstep: 404.26 | bwd_inner_microstep: 244.89 | bwd_allreduce_microstep: 159.33 | step_microstep: 11.62 +[2024-12-31 17:45:54,053] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2813.55 | bwd: 4269.51 | bwd_inner: 4109.41 | bwd_allreduce: 159.57 | step: 15.23 + 32%|███▏ | 244/759 [35:49<1:03:40, 7.42s/it] {'loss': 1.2908, 'learning_rate': 1.5870943855139437e-05, 'epoch': 0.32} + 32%|███▏ | 244/759 [35:49<1:03:40, 7.42s/it][2024-12-31 17:45:54,843] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 289.27 | bwd_microstep: 486.17 | bwd_inner_microstep: 485.80 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.25 +[2024-12-31 17:45:55,361] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.63 | bwd_microstep: 297.39 | bwd_inner_microstep: 297.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:45:55,856] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.02 | bwd_microstep: 286.30 | bwd_inner_microstep: 286.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:56,321] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.35 | bwd_microstep: 266.03 | bwd_inner_microstep: 266.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:56,783] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.19 | bwd_microstep: 264.53 | bwd_inner_microstep: 264.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:57,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.79 | bwd_microstep: 258.01 | bwd_inner_microstep: 257.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:57,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.88 | bwd_microstep: 256.64 | bwd_inner_microstep: 256.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:58,122] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.26 | bwd_microstep: 249.43 | bwd_inner_microstep: 249.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:45:58,547] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.06 | bwd_microstep: 244.47 | bwd_inner_microstep: 244.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:58,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.45 | bwd_microstep: 245.15 | bwd_inner_microstep: 245.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:45:59,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.61 | bwd_microstep: 243.97 | bwd_inner_microstep: 243.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:45:59,843] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.40 | bwd_microstep: 244.21 | bwd_inner_microstep: 244.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:46:00,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.67 | bwd_microstep: 242.86 | bwd_inner_microstep: 242.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:46:00,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.39 | bwd_microstep: 244.71 | bwd_inner_microstep: 244.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:01,134] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.28 | bwd_microstep: 241.57 | bwd_inner_microstep: 241.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:01,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.77 | optimizer_gradients: 0.72 | optimizer_step: 3.27 +[2024-12-31 17:46:01,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.66 | bwd_microstep: 255.57 | bwd_inner_microstep: 241.93 | bwd_allreduce_microstep: 13.54 | step_microstep: 11.21 +[2024-12-31 17:46:01,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2925.32 | bwd: 4327.07 | bwd_inner: 4312.68 | bwd_allreduce: 13.83 | step: 14.12 + 32%|███▏ | 245/759 [35:56<1:03:52, 7.46s/it] {'loss': 1.2377, 'learning_rate': 1.5836336436496377e-05, 'epoch': 0.32} + 32%|███▏ | 245/759 [35:56<1:03:52, 7.46s/it][2024-12-31 17:46:02,115] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 199.41 | bwd_microstep: 306.20 | bwd_inner_microstep: 305.81 | bwd_allreduce_microstep: 0.16 | step_microstep: 0.19 +[2024-12-31 17:46:02,618] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.73 | bwd_microstep: 292.75 | bwd_inner_microstep: 292.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:46:03,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.26 | bwd_microstep: 283.11 | bwd_inner_microstep: 283.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:46:03,581] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.85 | bwd_microstep: 268.77 | bwd_inner_microstep: 268.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:46:04,021] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.92 | bwd_microstep: 250.03 | bwd_inner_microstep: 250.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:46:04,455] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.69 | bwd_microstep: 246.05 | bwd_inner_microstep: 246.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:04,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.75 | bwd_microstep: 244.94 | bwd_inner_microstep: 244.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:46:05,320] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.25 | bwd_microstep: 244.10 | bwd_inner_microstep: 244.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:46:05,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.72 | bwd_microstep: 254.17 | bwd_inner_microstep: 254.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:06,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.28 | bwd_microstep: 245.02 | bwd_inner_microstep: 244.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:06,625] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.35 | bwd_microstep: 243.18 | bwd_inner_microstep: 243.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:07,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.96 | bwd_microstep: 240.50 | bwd_inner_microstep: 240.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:07,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.48 | bwd_microstep: 240.92 | bwd_inner_microstep: 240.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:46:07,904] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.10 | bwd_microstep: 240.56 | bwd_inner_microstep: 240.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:46:08,329] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.30 | bwd_microstep: 240.49 | bwd_inner_microstep: 240.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:09,493] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.83 | optimizer_gradients: 0.60 | optimizer_step: 3.10 +[2024-12-31 17:46:09,493] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.36 | bwd_microstep: 966.19 | bwd_inner_microstep: 243.76 | bwd_allreduce_microstep: 722.38 | step_microstep: 11.74 +[2024-12-31 17:46:09,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2804.80 | bwd: 4807.06 | bwd_inner: 4083.85 | bwd_allreduce: 722.64 | step: 14.71 + 32%|███▏ | 246/759 [36:04<1:04:52, 7.59s/it] {'loss': 1.2655, 'learning_rate': 1.5801622681006966e-05, 'epoch': 0.32} + 32%|███▏ | 246/759 [36:04<1:04:52, 7.59s/it][2024-12-31 17:46:10,106] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 237.28 | bwd_microstep: 359.94 | bwd_inner_microstep: 359.59 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.22 +[2024-12-31 17:46:10,604] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.84 | bwd_microstep: 288.49 | bwd_inner_microstep: 288.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:11,071] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.28 | bwd_microstep: 268.61 | bwd_inner_microstep: 268.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:11,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.90 | bwd_microstep: 261.97 | bwd_inner_microstep: 261.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:11,972] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.97 | bwd_microstep: 249.88 | bwd_inner_microstep: 249.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:46:12,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.37 | bwd_microstep: 247.83 | bwd_inner_microstep: 247.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:46:12,840] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.97 | bwd_microstep: 245.65 | bwd_inner_microstep: 245.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:46:13,276] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.66 | bwd_microstep: 247.15 | bwd_inner_microstep: 247.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:46:13,712] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.12 | bwd_microstep: 246.49 | bwd_inner_microstep: 246.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:46:14,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.94 | bwd_microstep: 245.28 | bwd_inner_microstep: 245.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:46:14,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.50 | bwd_microstep: 251.60 | bwd_inner_microstep: 251.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:15,009] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.54 | bwd_microstep: 243.64 | bwd_inner_microstep: 243.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 17:46:15,440] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.37 | bwd_microstep: 244.57 | bwd_inner_microstep: 244.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:46:15,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.46 | bwd_microstep: 244.89 | bwd_inner_microstep: 244.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:46:16,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 242.72 | bwd_inner_microstep: 242.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:46:16,974] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 7.04 | optimizer_gradients: 0.81 | optimizer_step: 11.26 +[2024-12-31 17:46:16,974] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.79 | bwd_microstep: 444.76 | bwd_inner_microstep: 244.10 | bwd_allreduce_microstep: 200.60 | step_microstep: 22.11 +[2024-12-31 17:46:16,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2845.66 | bwd: 4333.56 | bwd_inner: 4132.14 | bwd_allreduce: 200.86 | step: 25.02 + 33%|███▎ | 247/759 [36:11<1:04:32, 7.56s/it] {'loss': 1.2419, 'learning_rate': 1.5766803221148676e-05, 'epoch': 0.33} + 33%|███▎ | 247/759 [36:11<1:04:32, 7.56s/it][2024-12-31 17:46:17,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 212.90 | bwd_microstep: 334.12 | bwd_inner_microstep: 333.76 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 17:46:18,058] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.73 | bwd_microstep: 266.81 | bwd_inner_microstep: 266.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:46:18,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.12 | bwd_microstep: 263.46 | bwd_inner_microstep: 263.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:46:18,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.02 | bwd_microstep: 256.53 | bwd_inner_microstep: 256.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:46:19,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.20 | bwd_microstep: 253.13 | bwd_inner_microstep: 252.73 | bwd_allreduce_microstep: 0.24 | step_microstep: 0.28 +[2024-12-31 17:46:19,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.50 | bwd_microstep: 247.18 | bwd_inner_microstep: 247.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:46:20,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 246.03 | bwd_inner_microstep: 246.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:46:20,714] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 244.86 | bwd_inner_microstep: 244.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:46:21,149] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.85 | bwd_microstep: 246.46 | bwd_inner_microstep: 246.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:46:21,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.60 | bwd_microstep: 258.74 | bwd_inner_microstep: 258.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:46:22,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 244.59 | bwd_inner_microstep: 244.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:46:22,485] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.96 | bwd_microstep: 266.14 | bwd_inner_microstep: 266.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:22,917] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.25 | bwd_microstep: 243.88 | bwd_inner_microstep: 243.70 | bwd_allreduce_microstep: 0.07 | step_microstep: 0.22 +[2024-12-31 17:46:23,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.32 | bwd_microstep: 245.02 | bwd_inner_microstep: 245.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:46:23,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.64 | bwd_microstep: 244.74 | bwd_inner_microstep: 244.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 17:46:24,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.75 | optimizer_gradients: 0.63 | optimizer_step: 3.11 +[2024-12-31 17:46:24,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.87 | bwd_microstep: 312.95 | bwd_inner_microstep: 246.75 | bwd_allreduce_microstep: 66.15 | step_microstep: 12.03 +[2024-12-31 17:46:24,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2806.44 | bwd: 4174.85 | bwd_inner: 4107.21 | bwd_allreduce: 66.76 | step: 15.11 + 33%|███▎ | 248/759 [36:19<1:03:45, 7.49s/it] {'loss': 1.2521, 'learning_rate': 1.5731878691324874e-05, 'epoch': 0.33} + 33%|███▎ | 248/759 [36:19<1:03:45, 7.49s/it][2024-12-31 17:46:24,908] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 225.81 | bwd_microstep: 362.42 | bwd_inner_microstep: 362.07 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 17:46:25,489] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 216.67 | bwd_microstep: 346.70 | bwd_inner_microstep: 346.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:46:25,950] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.78 | bwd_microstep: 262.74 | bwd_inner_microstep: 262.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:26,398] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.98 | bwd_microstep: 255.38 | bwd_inner_microstep: 255.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:46:26,840] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.20 | bwd_microstep: 251.93 | bwd_inner_microstep: 251.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:46:27,277] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.25 | bwd_microstep: 247.67 | bwd_inner_microstep: 247.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:46:27,716] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.32 | bwd_microstep: 247.51 | bwd_inner_microstep: 247.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:46:28,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 244.25 | bwd_inner_microstep: 244.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:28,586] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.97 | bwd_microstep: 250.40 | bwd_inner_microstep: 250.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:46:29,020] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.80 | bwd_microstep: 245.03 | bwd_inner_microstep: 245.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:29,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.67 | bwd_microstep: 254.20 | bwd_inner_microstep: 254.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:29,894] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.02 | bwd_microstep: 244.00 | bwd_inner_microstep: 243.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:46:30,326] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 243.53 | bwd_inner_microstep: 243.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:46:30,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.88 | bwd_microstep: 243.73 | bwd_inner_microstep: 243.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:31,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.21 | bwd_microstep: 244.07 | bwd_inner_microstep: 244.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:46:31,677] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.21 | optimizer_gradients: 0.62 | optimizer_step: 3.13 +[2024-12-31 17:46:31,677] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 290.14 | bwd_inner_microstep: 244.32 | bwd_allreduce_microstep: 45.76 | step_microstep: 10.98 +[2024-12-31 17:46:31,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2849.08 | bwd: 4233.82 | bwd_inner: 4187.13 | bwd_allreduce: 46.04 | step: 13.89 + 33%|███▎ | 249/759 [36:26<1:03:20, 7.45s/it] {'loss': 1.2574, 'learning_rate': 1.5696849727853297e-05, 'epoch': 0.33} + 33%|███▎ | 249/759 [36:26<1:03:20, 7.45s/it][2024-12-31 17:46:32,445] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 290.06 | bwd_microstep: 461.96 | bwd_inner_microstep: 461.63 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.26 +[2024-12-31 17:46:32,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.64 | bwd_microstep: 281.43 | bwd_inner_microstep: 281.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:46:33,383] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.77 | bwd_microstep: 256.57 | bwd_inner_microstep: 256.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:33,822] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.42 | bwd_microstep: 249.04 | bwd_inner_microstep: 249.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 17:46:34,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.35 | bwd_microstep: 247.95 | bwd_inner_microstep: 247.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 17:46:34,695] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.35 | bwd_microstep: 247.63 | bwd_inner_microstep: 247.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:46:35,127] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.59 | bwd_microstep: 247.73 | bwd_inner_microstep: 247.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:46:35,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.71 | bwd_microstep: 246.04 | bwd_inner_microstep: 246.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:36,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.64 | bwd_microstep: 245.20 | bwd_inner_microstep: 245.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 17:46:36,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.12 | bwd_microstep: 249.60 | bwd_inner_microstep: 249.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:46:36,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.79 | bwd_microstep: 244.47 | bwd_inner_microstep: 244.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:46:37,301] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.37 | bwd_microstep: 243.83 | bwd_inner_microstep: 243.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:37,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.10 | bwd_microstep: 243.51 | bwd_inner_microstep: 243.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:38,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.17 | bwd_microstep: 245.21 | bwd_inner_microstep: 245.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:46:38,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.55 | bwd_microstep: 245.07 | bwd_inner_microstep: 245.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:39,128] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.87 | optimizer_gradients: 0.61 | optimizer_step: 3.08 +[2024-12-31 17:46:39,129] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.48 | bwd_microstep: 337.50 | bwd_inner_microstep: 242.72 | bwd_allreduce_microstep: 94.73 | step_microstep: 10.46 +[2024-12-31 17:46:39,130] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2882.50 | bwd: 4292.82 | bwd_inner: 4197.30 | bwd_allreduce: 94.98 | step: 13.20 + 33%|███▎ | 250/759 [36:34<1:03:13, 7.45s/it] {'loss': 1.2392, 'learning_rate': 1.5661716968954436e-05, 'epoch': 0.33} + 33%|███▎ | 250/759 [36:34<1:03:13, 7.45s/it][2024-12-31 17:46:39,750] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 229.35 | bwd_microstep: 375.14 | bwd_inner_microstep: 374.77 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 17:46:40,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.06 | bwd_microstep: 291.07 | bwd_inner_microstep: 291.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:46:41,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 283.50 | bwd_microstep: 487.93 | bwd_inner_microstep: 487.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.30 +[2024-12-31 17:46:41,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.48 | bwd_microstep: 263.36 | bwd_inner_microstep: 263.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:46:41,972] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.42 | bwd_microstep: 257.76 | bwd_inner_microstep: 257.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:46:42,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.91 | bwd_microstep: 250.77 | bwd_inner_microstep: 250.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:46:42,851] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.95 | bwd_microstep: 247.52 | bwd_inner_microstep: 247.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:43,287] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.00 | bwd_microstep: 248.11 | bwd_inner_microstep: 248.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:46:43,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.61 | bwd_microstep: 245.55 | bwd_inner_microstep: 245.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:46:44,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 218.03 | bwd_microstep: 271.86 | bwd_inner_microstep: 271.45 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 17:46:44,666] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 248.95 | bwd_inner_microstep: 248.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:46:45,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.04 | bwd_microstep: 248.41 | bwd_inner_microstep: 248.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:45,534] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.98 | bwd_microstep: 243.28 | bwd_inner_microstep: 243.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:46:45,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.05 | bwd_microstep: 244.11 | bwd_inner_microstep: 244.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:46,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 244.63 | bwd_inner_microstep: 244.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:46:46,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.94 | optimizer_gradients: 0.73 | optimizer_step: 3.28 +[2024-12-31 17:46:46,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.69 | bwd_microstep: 257.60 | bwd_inner_microstep: 243.88 | bwd_allreduce_microstep: 13.61 | step_microstep: 11.76 +[2024-12-31 17:46:46,856] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2997.90 | bwd: 4426.29 | bwd_inner: 4411.17 | bwd_allreduce: 14.12 | step: 14.92 + 33%|███▎ | 251/759 [36:41<1:03:47, 7.53s/it] {'loss': 1.2319, 'learning_rate': 1.5626481054739916e-05, 'epoch': 0.33} + 33%|███▎ | 251/759 [36:41<1:03:47, 7.53s/it][2024-12-31 17:46:47,441] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 216.68 | bwd_microstep: 346.56 | bwd_inner_microstep: 346.20 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.24 +[2024-12-31 17:46:47,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.63 | bwd_microstep: 289.12 | bwd_inner_microstep: 289.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:46:48,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.27 | bwd_microstep: 273.17 | bwd_inner_microstep: 272.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.28 +[2024-12-31 17:46:48,894] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 185.73 | bwd_microstep: 271.81 | bwd_inner_microstep: 271.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:49,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.86 | bwd_microstep: 257.71 | bwd_inner_microstep: 257.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:49,785] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.86 | bwd_microstep: 249.16 | bwd_inner_microstep: 249.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:46:50,223] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.84 | bwd_microstep: 247.44 | bwd_inner_microstep: 247.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:50,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.87 | bwd_microstep: 260.40 | bwd_inner_microstep: 260.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:46:51,106] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.32 | bwd_microstep: 245.31 | bwd_inner_microstep: 245.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:46:51,539] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.49 | bwd_microstep: 245.17 | bwd_inner_microstep: 245.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:51,970] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.63 | bwd_microstep: 243.30 | bwd_inner_microstep: 243.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:46:52,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.94 | bwd_microstep: 243.79 | bwd_inner_microstep: 243.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:46:52,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 243.70 | bwd_inner_microstep: 243.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 17:46:53,269] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.50 | bwd_microstep: 245.56 | bwd_inner_microstep: 245.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:46:53,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.22 | bwd_microstep: 249.86 | bwd_inner_microstep: 249.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:46:54,395] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.33 | optimizer_gradients: 0.62 | optimizer_step: 3.09 +[2024-12-31 17:46:54,395] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.56 | bwd_microstep: 493.95 | bwd_inner_microstep: 243.60 | bwd_allreduce_microstep: 250.30 | step_microstep: 11.34 +[2024-12-31 17:46:54,397] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2834.89 | bwd: 4406.19 | bwd_inner: 4154.81 | bwd_allreduce: 250.64 | step: 14.28 + 33%|███▎ | 252/759 [36:49<1:03:40, 7.54s/it] {'loss': 1.2687, 'learning_rate': 1.5591142627200825e-05, 'epoch': 0.33} + 33%|███▎ | 252/759 [36:49<1:03:40, 7.54s/it][2024-12-31 17:46:55,013] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 231.86 | bwd_microstep: 371.02 | bwd_inner_microstep: 370.65 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.22 +[2024-12-31 17:46:55,529] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 198.85 | bwd_microstep: 299.19 | bwd_inner_microstep: 299.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:56,023] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.68 | bwd_microstep: 285.74 | bwd_inner_microstep: 285.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:46:56,469] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.18 | bwd_microstep: 257.58 | bwd_inner_microstep: 257.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.10 +[2024-12-31 17:46:56,910] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.88 | bwd_microstep: 250.12 | bwd_inner_microstep: 250.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:46:57,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 330.06 | bwd_microstep: 257.54 | bwd_inner_microstep: 257.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:46:57,955] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.67 | bwd_microstep: 248.07 | bwd_inner_microstep: 248.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:46:58,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.61 | bwd_microstep: 247.76 | bwd_inner_microstep: 247.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:58,825] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.46 | bwd_microstep: 244.66 | bwd_inner_microstep: 244.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:59,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.74 | bwd_microstep: 245.35 | bwd_inner_microstep: 245.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:46:59,691] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.79 | bwd_microstep: 244.26 | bwd_inner_microstep: 244.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:47:00,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.14 | bwd_microstep: 244.45 | bwd_inner_microstep: 244.28 | bwd_allreduce_microstep: 0.07 | step_microstep: 0.20 +[2024-12-31 17:47:00,556] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.42 | bwd_microstep: 243.75 | bwd_inner_microstep: 243.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:47:00,990] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 244.25 | bwd_inner_microstep: 244.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:47:01,420] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.52 | bwd_microstep: 242.78 | bwd_inner_microstep: 242.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:01,924] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.76 | optimizer_gradients: 8.54 | optimizer_step: 12.24 +[2024-12-31 17:47:01,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.50 | bwd_microstep: 291.58 | bwd_inner_microstep: 277.65 | bwd_allreduce_microstep: 13.80 | step_microstep: 28.51 +[2024-12-31 17:47:01,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3003.93 | bwd: 4218.31 | bwd_inner: 4203.29 | bwd_allreduce: 14.24 | step: 31.38 + 33%|███▎ | 253/759 [36:56<1:03:32, 7.53s/it] {'loss': 1.2643, 'learning_rate': 1.5555702330196024e-05, 'epoch': 0.33} + 33%|███▎ | 253/759 [36:56<1:03:32, 7.53s/it][2024-12-31 17:47:02,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.50 | bwd_microstep: 338.23 | bwd_inner_microstep: 337.87 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 17:47:03,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 196.44 | bwd_microstep: 297.85 | bwd_inner_microstep: 297.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:03,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.30 | bwd_microstep: 291.60 | bwd_inner_microstep: 291.26 | bwd_allreduce_microstep: 0.23 | step_microstep: 0.27 +[2024-12-31 17:47:03,948] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.30 | bwd_microstep: 250.72 | bwd_inner_microstep: 250.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:04,398] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.51 | bwd_microstep: 255.38 | bwd_inner_microstep: 255.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:47:04,838] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.29 | bwd_microstep: 249.08 | bwd_inner_microstep: 249.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:47:05,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.48 | bwd_microstep: 247.66 | bwd_inner_microstep: 247.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:47:05,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.44 | bwd_microstep: 245.04 | bwd_inner_microstep: 245.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:47:06,143] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.70 | bwd_microstep: 245.86 | bwd_inner_microstep: 245.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:06,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.22 | bwd_microstep: 260.29 | bwd_inner_microstep: 260.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:47:07,023] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.90 | bwd_microstep: 243.61 | bwd_inner_microstep: 243.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:47:07,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.33 | bwd_microstep: 245.12 | bwd_inner_microstep: 245.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:47:07,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.28 | bwd_microstep: 245.48 | bwd_inner_microstep: 245.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:47:08,326] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.75 | bwd_microstep: 246.87 | bwd_inner_microstep: 246.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:47:08,753] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.37 | bwd_microstep: 241.56 | bwd_inner_microstep: 241.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:09,661] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.41 | optimizer_gradients: 0.80 | optimizer_step: 3.22 +[2024-12-31 17:47:09,661] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.43 | bwd_microstep: 698.63 | bwd_inner_microstep: 242.08 | bwd_allreduce_microstep: 456.51 | step_microstep: 25.27 +[2024-12-31 17:47:09,663] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2833.69 | bwd: 4603.12 | bwd_inner: 4145.37 | bwd_allreduce: 457.05 | step: 28.13 + 33%|███▎ | 254/759 [37:04<1:03:55, 7.60s/it] {'loss': 1.2178, 'learning_rate': 1.552016080944042e-05, 'epoch': 0.33} + 33%|███▎ | 254/759 [37:04<1:03:55, 7.60s/it][2024-12-31 17:47:10,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.25 | bwd_microstep: 336.51 | bwd_inner_microstep: 336.17 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 17:47:10,749] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.45 | bwd_microstep: 285.52 | bwd_inner_microstep: 285.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:47:11,209] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.30 | bwd_microstep: 263.27 | bwd_inner_microstep: 263.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:47:11,650] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.16 | bwd_microstep: 250.29 | bwd_inner_microstep: 250.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:12,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.38 | bwd_microstep: 248.15 | bwd_inner_microstep: 248.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:47:12,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.42 | bwd_microstep: 248.95 | bwd_inner_microstep: 248.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:47:12,959] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.60 | bwd_microstep: 245.50 | bwd_inner_microstep: 245.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:47:13,404] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 255.11 | bwd_inner_microstep: 254.92 | bwd_allreduce_microstep: 0.03 | step_microstep: 0.27 +[2024-12-31 17:47:13,840] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.65 | bwd_microstep: 246.15 | bwd_inner_microstep: 246.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:47:14,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.42 | bwd_microstep: 244.76 | bwd_inner_microstep: 244.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:47:14,718] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.43 | bwd_microstep: 254.13 | bwd_inner_microstep: 254.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:15,151] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.41 | bwd_microstep: 243.69 | bwd_inner_microstep: 243.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:15,582] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.55 | bwd_microstep: 243.65 | bwd_inner_microstep: 243.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:47:16,027] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.79 | bwd_microstep: 256.62 | bwd_inner_microstep: 256.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 17:47:16,455] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.04 | bwd_microstep: 241.86 | bwd_inner_microstep: 241.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:16,909] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.56 | optimizer_gradients: 0.74 | optimizer_step: 3.37 +[2024-12-31 17:47:16,911] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.10 | bwd_microstep: 259.11 | bwd_inner_microstep: 242.69 | bwd_allreduce_microstep: 16.33 | step_microstep: 12.04 +[2024-12-31 17:47:16,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2798.97 | bwd: 4123.45 | bwd_inner: 4105.95 | bwd_allreduce: 16.73 | step: 15.22 + 34%|███▎ | 255/759 [37:11<1:02:55, 7.49s/it] {'loss': 1.2774, 'learning_rate': 1.5484518712493188e-05, 'epoch': 0.34} + 34%|███▎ | 255/759 [37:11<1:02:55, 7.49s/it][2024-12-31 17:47:17,443] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.58 | bwd_microstep: 313.02 | bwd_inner_microstep: 312.64 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.26 +[2024-12-31 17:47:17,933] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.30 | bwd_microstep: 282.45 | bwd_inner_microstep: 282.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:18,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 198.25 | bwd_microstep: 266.20 | bwd_inner_microstep: 266.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:47:18,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.06 | bwd_microstep: 261.51 | bwd_inner_microstep: 261.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:47:19,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.67 | bwd_microstep: 261.93 | bwd_inner_microstep: 261.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:47:19,783] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.79 | bwd_microstep: 255.15 | bwd_inner_microstep: 255.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:47:20,238] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 184.64 | bwd_microstep: 253.53 | bwd_inner_microstep: 253.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:47:20,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.79 | bwd_microstep: 245.90 | bwd_inner_microstep: 245.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:47:21,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 244.70 | bwd_inner_microstep: 244.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:21,538] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 245.29 | bwd_inner_microstep: 245.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:21,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.47 | bwd_microstep: 244.85 | bwd_inner_microstep: 244.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:22,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.00 | bwd_microstep: 243.77 | bwd_inner_microstep: 243.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:47:22,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.33 | bwd_microstep: 243.43 | bwd_inner_microstep: 243.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 17:47:23,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.44 | bwd_microstep: 244.52 | bwd_inner_microstep: 244.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:23,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.54 | bwd_microstep: 252.93 | bwd_inner_microstep: 252.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:24,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.66 | optimizer_gradients: 0.82 | optimizer_step: 3.18 +[2024-12-31 17:47:24,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.75 | bwd_microstep: 284.02 | bwd_inner_microstep: 246.49 | bwd_allreduce_microstep: 37.48 | step_microstep: 11.00 +[2024-12-31 17:47:24,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2848.39 | bwd: 4143.26 | bwd_inner: 4104.97 | bwd_allreduce: 37.75 | step: 14.13 + 34%|███▎ | 256/759 [37:19<1:02:16, 7.43s/it] {'loss': 1.2382, 'learning_rate': 1.544877668874599e-05, 'epoch': 0.34} + 34%|███▎ | 256/759 [37:19<1:02:16, 7.43s/it][2024-12-31 17:47:24,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.27 | bwd_microstep: 335.11 | bwd_inner_microstep: 334.79 | bwd_allreduce_microstep: 0.12 | step_microstep: 0.18 +[2024-12-31 17:47:25,243] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.68 | bwd_microstep: 280.75 | bwd_inner_microstep: 280.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:47:25,722] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.33 | bwd_microstep: 274.36 | bwd_inner_microstep: 274.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:47:26,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.95 | bwd_microstep: 258.19 | bwd_inner_microstep: 258.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:47:26,626] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.24 | bwd_microstep: 250.33 | bwd_inner_microstep: 250.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:47:27,060] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.67 | bwd_microstep: 246.45 | bwd_inner_microstep: 246.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 17:47:27,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.15 | bwd_microstep: 247.52 | bwd_inner_microstep: 247.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:27,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.31 | bwd_microstep: 262.02 | bwd_inner_microstep: 262.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:47:28,390] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.84 | bwd_microstep: 245.61 | bwd_inner_microstep: 245.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:47:28,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.60 | bwd_microstep: 245.57 | bwd_inner_microstep: 245.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:47:29,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.91 | bwd_microstep: 244.74 | bwd_inner_microstep: 244.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:29,691] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.24 | bwd_microstep: 244.83 | bwd_inner_microstep: 244.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:47:30,117] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.63 | bwd_microstep: 244.50 | bwd_inner_microstep: 244.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:30,565] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.48 | bwd_microstep: 243.52 | bwd_inner_microstep: 243.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:47:30,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.05 | bwd_microstep: 242.84 | bwd_inner_microstep: 242.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:47:31,486] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.62 | optimizer_gradients: 0.61 | optimizer_step: 21.04 +[2024-12-31 17:47:31,486] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.91 | bwd_microstep: 276.39 | bwd_inner_microstep: 244.91 | bwd_allreduce_microstep: 31.44 | step_microstep: 28.98 +[2024-12-31 17:47:31,488] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2842.68 | bwd: 4142.82 | bwd_inner: 4110.55 | bwd_allreduce: 31.68 | step: 31.72 + 34%|███▍ | 257/759 [37:26<1:01:48, 7.39s/it] {'loss': 1.2719, 'learning_rate': 1.5412935389411124e-05, 'epoch': 0.34} + 34%|███▍ | 257/759 [37:26<1:01:48, 7.39s/it][2024-12-31 17:47:32,031] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.61 | bwd_microstep: 311.90 | bwd_inner_microstep: 311.52 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 17:47:32,634] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 223.89 | bwd_microstep: 360.68 | bwd_inner_microstep: 360.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:33,094] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.66 | bwd_microstep: 263.65 | bwd_inner_microstep: 263.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:47:33,553] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.90 | bwd_microstep: 262.82 | bwd_inner_microstep: 262.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:47:34,009] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.91 | bwd_microstep: 261.95 | bwd_inner_microstep: 261.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:34,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.08 | bwd_microstep: 248.17 | bwd_inner_microstep: 248.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:47:34,884] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.65 | bwd_microstep: 247.74 | bwd_inner_microstep: 247.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:35,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.32 | bwd_microstep: 245.07 | bwd_inner_microstep: 245.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:35,768] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.51 | bwd_microstep: 263.28 | bwd_inner_microstep: 263.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:47:36,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.65 | bwd_microstep: 244.47 | bwd_inner_microstep: 244.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:36,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.06 | bwd_microstep: 244.43 | bwd_inner_microstep: 244.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:47:37,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.92 | bwd_microstep: 243.00 | bwd_inner_microstep: 242.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:37,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.24 | bwd_microstep: 249.15 | bwd_inner_microstep: 249.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:47:37,935] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.13 | bwd_microstep: 244.47 | bwd_inner_microstep: 244.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:47:38,366] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.38 | bwd_microstep: 243.77 | bwd_inner_microstep: 243.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:47:38,897] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.59 | optimizer_gradients: 0.64 | optimizer_step: 17.91 +[2024-12-31 17:47:38,897] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.99 | bwd_microstep: 318.10 | bwd_inner_microstep: 244.12 | bwd_allreduce_microstep: 73.94 | step_microstep: 25.41 +[2024-12-31 17:47:38,898] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2840.29 | bwd: 4252.69 | bwd_inner: 4177.96 | bwd_allreduce: 74.19 | step: 28.55 + 34%|███▍ | 258/759 [37:33<1:01:44, 7.39s/it] {'loss': 1.2459, 'learning_rate': 1.5376995467509673e-05, 'epoch': 0.34} + 34%|███▍ | 258/759 [37:33<1:01:44, 7.39s/it][2024-12-31 17:47:39,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.98 | bwd_microstep: 340.98 | bwd_inner_microstep: 340.60 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 17:47:39,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 199.04 | bwd_microstep: 304.06 | bwd_inner_microstep: 304.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:47:40,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.08 | bwd_microstep: 265.41 | bwd_inner_microstep: 265.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 17:47:40,911] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.88 | bwd_microstep: 257.67 | bwd_inner_microstep: 257.31 | bwd_allreduce_microstep: 0.16 | step_microstep: 0.34 +[2024-12-31 17:47:41,364] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.87 | bwd_microstep: 256.08 | bwd_inner_microstep: 256.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:47:41,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.19 | bwd_microstep: 257.21 | bwd_inner_microstep: 257.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:47:42,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.97 | bwd_microstep: 248.09 | bwd_inner_microstep: 248.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:47:42,675] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 253.20 | bwd_inner_microstep: 253.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:47:43,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.74 | bwd_microstep: 247.02 | bwd_inner_microstep: 246.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:47:43,559] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.67 | bwd_microstep: 245.01 | bwd_inner_microstep: 244.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:47:43,990] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.43 | bwd_microstep: 243.05 | bwd_inner_microstep: 243.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:47:44,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.21 | bwd_microstep: 245.03 | bwd_inner_microstep: 245.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 17:47:44,908] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.97 | bwd_microstep: 306.53 | bwd_inner_microstep: 306.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:47:45,331] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.13 | bwd_microstep: 242.97 | bwd_inner_microstep: 242.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:47:45,753] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.11 | bwd_microstep: 241.75 | bwd_inner_microstep: 241.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:47:46,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.21 | optimizer_gradients: 0.66 | optimizer_step: 3.14 +[2024-12-31 17:47:46,255] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.85 | bwd_microstep: 312.98 | bwd_inner_microstep: 243.10 | bwd_allreduce_microstep: 69.83 | step_microstep: 11.54 +[2024-12-31 17:47:46,257] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2844.11 | bwd: 4267.17 | bwd_inner: 4196.10 | bwd_allreduce: 70.33 | step: 13.79 + 34%|███▍ | 259/759 [37:41<1:01:32, 7.38s/it] {'loss': 1.2503, 'learning_rate': 1.5340957577859605e-05, 'epoch': 0.34} + 34%|███▍ | 259/759 [37:41<1:01:32, 7.38s/it][2024-12-31 17:47:46,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 218.28 | bwd_microstep: 339.05 | bwd_inner_microstep: 338.71 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 17:47:47,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 220.79 | bwd_microstep: 354.61 | bwd_inner_microstep: 354.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:47,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.16 | bwd_microstep: 281.34 | bwd_inner_microstep: 281.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:48,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 185.15 | bwd_microstep: 263.33 | bwd_inner_microstep: 263.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:47:48,835] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.46 | bwd_microstep: 263.12 | bwd_inner_microstep: 263.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:47:49,268] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.89 | bwd_microstep: 251.01 | bwd_inner_microstep: 250.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:47:49,694] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.41 | bwd_microstep: 244.20 | bwd_inner_microstep: 244.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:47:50,118] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.55 | bwd_microstep: 245.27 | bwd_inner_microstep: 245.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:47:50,541] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 245.59 | bwd_inner_microstep: 245.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:47:50,976] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.71 | bwd_microstep: 247.85 | bwd_inner_microstep: 247.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:51,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.55 | bwd_microstep: 244.67 | bwd_inner_microstep: 244.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.11 +[2024-12-31 17:47:51,837] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.28 | bwd_microstep: 244.21 | bwd_inner_microstep: 244.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.10 +[2024-12-31 17:47:52,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.78 | bwd_microstep: 243.41 | bwd_inner_microstep: 243.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:47:52,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 242.49 | bwd_inner_microstep: 242.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:47:53,132] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.19 | bwd_microstep: 243.16 | bwd_inner_microstep: 243.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 17:47:54,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.51 | optimizer_gradients: 0.61 | optimizer_step: 3.09 +[2024-12-31 17:47:54,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.16 | bwd_microstep: 902.49 | bwd_inner_microstep: 244.47 | bwd_allreduce_microstep: 657.97 | step_microstep: 11.18 +[2024-12-31 17:47:54,233] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2878.92 | bwd: 4855.90 | bwd_inner: 4197.07 | bwd_allreduce: 658.22 | step: 13.44 + 34%|███▍ | 260/759 [37:49<1:02:53, 7.56s/it] {'loss': 1.2523, 'learning_rate': 1.530482237706383e-05, 'epoch': 0.34} + 34%|███▍ | 260/759 [37:49<1:02:53, 7.56s/it][2024-12-31 17:47:54,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.05 | bwd_microstep: 339.00 | bwd_inner_microstep: 338.65 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 17:47:55,287] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.43 | bwd_microstep: 281.47 | bwd_inner_microstep: 281.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:55,752] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.40 | bwd_microstep: 266.94 | bwd_inner_microstep: 266.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:47:56,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.89 | bwd_microstep: 268.48 | bwd_inner_microstep: 268.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:47:56,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.88 | bwd_microstep: 250.37 | bwd_inner_microstep: 250.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:47:57,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 184.16 | bwd_microstep: 247.55 | bwd_inner_microstep: 247.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:47:57,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.21 | bwd_microstep: 249.42 | bwd_inner_microstep: 249.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 17:47:57,972] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.69 | bwd_microstep: 249.18 | bwd_inner_microstep: 249.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:47:58,404] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.55 | bwd_microstep: 244.69 | bwd_inner_microstep: 244.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:47:58,835] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.63 | bwd_microstep: 243.39 | bwd_inner_microstep: 243.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:47:59,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.79 | bwd_microstep: 243.30 | bwd_inner_microstep: 243.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:47:59,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 245.41 | bwd_inner_microstep: 245.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:48:00,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.62 | bwd_microstep: 247.06 | bwd_inner_microstep: 247.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:48:00,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.85 | bwd_microstep: 268.43 | bwd_inner_microstep: 268.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:48:01,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.42 | bwd_microstep: 242.05 | bwd_inner_microstep: 242.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:48:01,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.59 | optimizer_gradients: 0.84 | optimizer_step: 3.20 +[2024-12-31 17:48:01,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.15 | bwd_microstep: 492.08 | bwd_inner_microstep: 241.51 | bwd_allreduce_microstep: 250.52 | step_microstep: 10.89 +[2024-12-31 17:48:01,695] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2819.58 | bwd: 4378.95 | bwd_inner: 4127.56 | bwd_allreduce: 250.77 | step: 13.62 + 34%|███▍ | 261/759 [37:56<1:02:30, 7.53s/it] {'loss': 1.2432, 'learning_rate': 1.526859052349827e-05, 'epoch': 0.34} + 34%|███▍ | 261/759 [37:56<1:02:30, 7.53s/it][2024-12-31 17:48:02,265] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.41 | bwd_microstep: 337.21 | bwd_inner_microstep: 336.68 | bwd_allreduce_microstep: 0.20 | step_microstep: 0.29 +[2024-12-31 17:48:02,763] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.28 | bwd_microstep: 288.82 | bwd_inner_microstep: 288.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:03,229] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.22 | bwd_microstep: 267.80 | bwd_inner_microstep: 267.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:48:03,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.66 | bwd_microstep: 262.80 | bwd_inner_microstep: 262.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:48:04,134] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.87 | bwd_microstep: 249.50 | bwd_inner_microstep: 249.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:48:04,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.93 | bwd_microstep: 247.88 | bwd_inner_microstep: 247.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:05,009] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.28 | bwd_microstep: 247.71 | bwd_inner_microstep: 247.57 | bwd_allreduce_microstep: 0.02 | step_microstep: 0.21 +[2024-12-31 17:48:05,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.48 | bwd_microstep: 252.60 | bwd_inner_microstep: 252.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:48:05,880] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.33 | bwd_microstep: 244.46 | bwd_inner_microstep: 244.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:48:06,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.97 | bwd_microstep: 245.05 | bwd_inner_microstep: 245.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:48:06,736] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.19 | bwd_microstep: 243.06 | bwd_inner_microstep: 243.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:07,164] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.48 | bwd_microstep: 241.13 | bwd_inner_microstep: 241.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:07,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.32 | bwd_microstep: 264.27 | bwd_inner_microstep: 264.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:08,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.69 | bwd_microstep: 243.19 | bwd_inner_microstep: 243.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:08,473] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.45 | bwd_microstep: 241.20 | bwd_inner_microstep: 241.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:48:08,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.89 | optimizer_step: 3.15 +[2024-12-31 17:48:08,976] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.76 | bwd_microstep: 306.04 | bwd_inner_microstep: 243.13 | bwd_allreduce_microstep: 62.86 | step_microstep: 11.76 +[2024-12-31 17:48:08,977] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2808.78 | bwd: 4182.89 | bwd_inner: 4118.83 | bwd_allreduce: 63.25 | step: 14.60 + 35%|███▍ | 262/759 [38:03<1:01:45, 7.46s/it] {'loss': 1.2565, 'learning_rate': 1.5232262677299816e-05, 'epoch': 0.35} + 35%|███▍ | 262/759 [38:03<1:01:45, 7.46s/it][2024-12-31 17:48:09,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 226.25 | bwd_microstep: 361.22 | bwd_inner_microstep: 360.85 | bwd_allreduce_microstep: 0.16 | step_microstep: 0.19 +[2024-12-31 17:48:10,101] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 199.84 | bwd_microstep: 304.93 | bwd_inner_microstep: 304.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.31 +[2024-12-31 17:48:10,617] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 194.71 | bwd_microstep: 301.43 | bwd_inner_microstep: 301.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:48:11,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.32 | bwd_microstep: 267.18 | bwd_inner_microstep: 267.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:48:11,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.31 | bwd_microstep: 257.71 | bwd_inner_microstep: 257.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:48:11,990] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.53 | bwd_microstep: 258.98 | bwd_inner_microstep: 258.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:48:12,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.57 | bwd_microstep: 255.88 | bwd_inner_microstep: 255.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:12,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.21 | bwd_microstep: 249.81 | bwd_inner_microstep: 249.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:13,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.51 | bwd_microstep: 246.03 | bwd_inner_microstep: 246.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:48:13,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.42 | bwd_microstep: 246.32 | bwd_inner_microstep: 246.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:14,184] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.58 | bwd_microstep: 245.08 | bwd_inner_microstep: 245.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:14,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.62 | bwd_microstep: 243.62 | bwd_inner_microstep: 243.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:48:15,046] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.14 | bwd_microstep: 243.20 | bwd_inner_microstep: 243.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:15,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.37 | bwd_microstep: 281.88 | bwd_inner_microstep: 281.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:48:15,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.11 | bwd_microstep: 241.21 | bwd_inner_microstep: 241.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:16,421] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.05 | optimizer_gradients: 0.62 | optimizer_step: 3.12 +[2024-12-31 17:48:16,422] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.39 | bwd_microstep: 279.70 | bwd_inner_microstep: 244.35 | bwd_allreduce_microstep: 35.30 | step_microstep: 10.73 +[2024-12-31 17:48:16,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2872.34 | bwd: 4284.27 | bwd_inner: 4248.00 | bwd_allreduce: 35.59 | step: 13.74 + 35%|███▍ | 263/759 [38:11<1:01:36, 7.45s/it] {'loss': 1.219, 'learning_rate': 1.5195839500354337e-05, 'epoch': 0.35} + 35%|███▍ | 263/759 [38:11<1:01:36, 7.45s/it][2024-12-31 17:48:17,019] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 224.74 | bwd_microstep: 357.81 | bwd_inner_microstep: 357.45 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 17:48:17,543] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 200.22 | bwd_microstep: 306.22 | bwd_inner_microstep: 306.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 17:48:18,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.23 | bwd_microstep: 288.18 | bwd_inner_microstep: 288.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:18,528] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.32 | bwd_microstep: 281.82 | bwd_inner_microstep: 281.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:48:19,013] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.51 | bwd_microstep: 292.39 | bwd_inner_microstep: 292.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:48:19,463] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.35 | bwd_microstep: 255.48 | bwd_inner_microstep: 255.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:48:19,916] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.38 | bwd_microstep: 256.18 | bwd_inner_microstep: 256.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:48:20,351] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.16 | bwd_microstep: 247.54 | bwd_inner_microstep: 247.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:48:20,788] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.45 | bwd_microstep: 247.83 | bwd_inner_microstep: 247.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:21,229] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.94 | bwd_microstep: 251.63 | bwd_inner_microstep: 251.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:48:21,662] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.27 | bwd_microstep: 244.84 | bwd_inner_microstep: 244.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:48:22,098] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.37 | bwd_microstep: 246.35 | bwd_inner_microstep: 246.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:48:22,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.27 | bwd_microstep: 246.01 | bwd_inner_microstep: 245.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:48:22,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.19 | bwd_microstep: 243.55 | bwd_inner_microstep: 243.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:48:23,400] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.77 | bwd_microstep: 245.42 | bwd_inner_microstep: 245.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.29 +[2024-12-31 17:48:23,910] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.93 | optimizer_gradients: 1.77 | optimizer_step: 3.09 +[2024-12-31 17:48:23,911] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.71 | bwd_microstep: 302.57 | bwd_inner_microstep: 243.34 | bwd_allreduce_microstep: 59.19 | step_microstep: 12.85 +[2024-12-31 17:48:23,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2881.28 | bwd: 4313.96 | bwd_inner: 4253.83 | bwd_allreduce: 59.44 | step: 16.11 + 35%|███▍ | 264/759 [38:18<1:01:34, 7.46s/it] {'loss': 1.2263, 'learning_rate': 1.5159321656284602e-05, 'epoch': 0.35} + 35%|███▍ | 264/759 [38:18<1:01:34, 7.46s/it][2024-12-31 17:48:24,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 237.82 | bwd_microstep: 391.45 | bwd_inner_microstep: 391.10 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 17:48:25,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.37 | bwd_microstep: 288.28 | bwd_inner_microstep: 288.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:48:25,512] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.12 | bwd_microstep: 263.65 | bwd_inner_microstep: 263.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 17:48:25,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.13 | bwd_microstep: 267.07 | bwd_inner_microstep: 267.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:48:26,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.30 | bwd_microstep: 258.32 | bwd_inner_microstep: 258.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:26,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.76 | bwd_microstep: 255.26 | bwd_inner_microstep: 255.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:27,319] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.19 | bwd_microstep: 254.59 | bwd_inner_microstep: 254.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:27,755] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.27 | bwd_microstep: 247.71 | bwd_inner_microstep: 247.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.10 +[2024-12-31 17:48:28,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.36 | bwd_microstep: 247.59 | bwd_inner_microstep: 247.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:28,646] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.51 | bwd_microstep: 245.37 | bwd_inner_microstep: 245.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:48:29,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.22 | bwd_microstep: 251.84 | bwd_inner_microstep: 251.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:29,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.77 | bwd_microstep: 247.12 | bwd_inner_microstep: 247.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:48:29,952] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.99 | bwd_microstep: 244.93 | bwd_inner_microstep: 244.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:48:30,403] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 263.21 | bwd_inner_microstep: 263.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:48:30,807] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 160.95 | bwd_microstep: 228.12 | bwd_inner_microstep: 228.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:31,275] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 9.43 | optimizer_step: 3.40 +[2024-12-31 17:48:31,275] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.84 | bwd_microstep: 262.10 | bwd_inner_microstep: 248.33 | bwd_allreduce_microstep: 13.69 | step_microstep: 19.63 +[2024-12-31 17:48:31,277] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2848.32 | bwd: 4216.68 | bwd_inner: 4202.17 | bwd_allreduce: 13.95 | step: 22.25 + 35%|███▍ | 265/759 [38:26<1:01:12, 7.43s/it] {'loss': 1.2517, 'learning_rate': 1.5122709810438205e-05, 'epoch': 0.35} + 35%|███▍ | 265/759 [38:26<1:01:12, 7.43s/it][2024-12-31 17:48:31,839] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 211.74 | bwd_microstep: 337.05 | bwd_inner_microstep: 336.69 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.21 +[2024-12-31 17:48:32,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.70 | bwd_microstep: 268.67 | bwd_inner_microstep: 268.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:32,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.30 | bwd_microstep: 255.97 | bwd_inner_microstep: 255.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:48:33,199] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.81 | bwd_microstep: 248.00 | bwd_inner_microstep: 247.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:48:33,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.50 | bwd_microstep: 245.25 | bwd_inner_microstep: 245.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:48:34,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.14 | bwd_microstep: 248.05 | bwd_inner_microstep: 248.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:34,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.79 | bwd_microstep: 253.02 | bwd_inner_microstep: 252.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:34,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.82 | bwd_microstep: 244.27 | bwd_inner_microstep: 244.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:48:35,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 244.45 | bwd_inner_microstep: 244.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:35,799] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.85 | bwd_microstep: 243.49 | bwd_inner_microstep: 243.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:48:36,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.81 | bwd_microstep: 244.59 | bwd_inner_microstep: 244.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:48:36,662] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 243.11 | bwd_inner_microstep: 243.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 17:48:37,085] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.63 | bwd_microstep: 241.18 | bwd_inner_microstep: 241.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:48:37,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.13 | bwd_microstep: 242.60 | bwd_inner_microstep: 242.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:37,941] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.53 | bwd_microstep: 241.13 | bwd_inner_microstep: 241.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:38,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.42 | optimizer_gradients: 0.84 | optimizer_step: 3.17 +[2024-12-31 17:48:38,465] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.44 | bwd_microstep: 324.05 | bwd_inner_microstep: 243.13 | bwd_allreduce_microstep: 80.85 | step_microstep: 12.70 +[2024-12-31 17:48:38,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2787.58 | bwd: 4125.01 | bwd_inner: 4043.26 | bwd_allreduce: 81.12 | step: 15.43 + 35%|███▌ | 266/759 [38:33<1:00:28, 7.36s/it] {'loss': 1.2761, 'learning_rate': 1.5086004629875426e-05, 'epoch': 0.35} + 35%|███▌ | 266/759 [38:33<1:00:28, 7.36s/it][2024-12-31 17:48:39,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 239.94 | bwd_microstep: 395.66 | bwd_inner_microstep: 395.31 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 17:48:39,660] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 205.74 | bwd_microstep: 321.77 | bwd_inner_microstep: 321.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:48:40,134] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.83 | bwd_microstep: 273.91 | bwd_inner_microstep: 273.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:48:40,586] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.86 | bwd_microstep: 257.54 | bwd_inner_microstep: 257.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:48:41,026] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.45 | bwd_microstep: 249.81 | bwd_inner_microstep: 249.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:48:41,469] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.09 | bwd_microstep: 249.20 | bwd_inner_microstep: 249.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:48:41,906] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.31 | bwd_microstep: 247.54 | bwd_inner_microstep: 247.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:48:42,339] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.41 | bwd_microstep: 245.28 | bwd_inner_microstep: 245.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:48:42,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.39 | bwd_microstep: 244.13 | bwd_inner_microstep: 244.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:43,208] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.23 | bwd_microstep: 243.73 | bwd_inner_microstep: 243.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:43,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.65 | bwd_microstep: 252.31 | bwd_inner_microstep: 252.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.10 +[2024-12-31 17:48:44,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.09 | bwd_microstep: 244.36 | bwd_inner_microstep: 244.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:48:44,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 243.75 | bwd_inner_microstep: 243.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:48:44,948] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.34 | bwd_microstep: 244.24 | bwd_inner_microstep: 244.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:45,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.03 | bwd_microstep: 244.10 | bwd_inner_microstep: 244.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:45,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.75 | optimizer_gradients: 0.80 | optimizer_step: 3.33 +[2024-12-31 17:48:45,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.09 | bwd_microstep: 255.58 | bwd_inner_microstep: 241.88 | bwd_allreduce_microstep: 13.58 | step_microstep: 11.54 +[2024-12-31 17:48:45,835] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2860.25 | bwd: 4213.03 | bwd_inner: 4198.53 | bwd_allreduce: 13.86 | step: 14.62 + 35%|███▌ | 267/759 [38:40<1:00:23, 7.36s/it] {'loss': 1.2597, 'learning_rate': 1.5049206783357082e-05, 'epoch': 0.35} + 35%|███▌ | 267/759 [38:40<1:00:23, 7.36s/it][2024-12-31 17:48:46,373] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.57 | bwd_microstep: 317.12 | bwd_inner_microstep: 316.78 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 17:48:46,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.69 | bwd_microstep: 283.65 | bwd_inner_microstep: 283.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:48:47,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.73 | bwd_microstep: 283.25 | bwd_inner_microstep: 283.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 17:48:47,814] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.29 | bwd_microstep: 262.02 | bwd_inner_microstep: 261.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:48:48,281] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 185.72 | bwd_microstep: 262.74 | bwd_inner_microstep: 262.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:48:48,738] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.30 | bwd_microstep: 260.39 | bwd_inner_microstep: 260.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:49,180] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.31 | bwd_microstep: 251.79 | bwd_inner_microstep: 251.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:48:49,618] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.19 | bwd_microstep: 248.21 | bwd_inner_microstep: 248.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:48:50,053] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.33 | bwd_microstep: 247.01 | bwd_inner_microstep: 246.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:48:50,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.06 | bwd_microstep: 253.53 | bwd_inner_microstep: 253.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:50,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.62 | bwd_microstep: 245.09 | bwd_inner_microstep: 245.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:51,364] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.73 | bwd_microstep: 245.94 | bwd_inner_microstep: 245.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:51,795] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.56 | bwd_microstep: 243.38 | bwd_inner_microstep: 243.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:52,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.17 | bwd_microstep: 247.78 | bwd_inner_microstep: 247.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:48:52,662] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.16 | bwd_microstep: 243.27 | bwd_inner_microstep: 243.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:53,123] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.56 | optimizer_gradients: 1.16 | optimizer_step: 3.54 +[2024-12-31 17:48:53,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.12 | bwd_microstep: 255.86 | bwd_inner_microstep: 241.84 | bwd_allreduce_microstep: 13.91 | step_microstep: 12.43 +[2024-12-31 17:48:53,126] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2833.94 | bwd: 4151.18 | bwd_inner: 4136.23 | bwd_allreduce: 14.23 | step: 15.60 + 35%|███▌ | 268/759 [38:48<1:00:04, 7.34s/it] {'loss': 1.2765, 'learning_rate': 1.501231694133235e-05, 'epoch': 0.35} + 35%|███▌ | 268/759 [38:48<1:00:04, 7.34s/it][2024-12-31 17:48:53,660] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 204.74 | bwd_microstep: 313.19 | bwd_inner_microstep: 313.00 | bwd_allreduce_microstep: 0.07 | step_microstep: 0.11 +[2024-12-31 17:48:54,118] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.85 | bwd_microstep: 265.64 | bwd_inner_microstep: 265.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:54,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.22 | bwd_microstep: 257.81 | bwd_inner_microstep: 257.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:55,012] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.06 | bwd_microstep: 249.93 | bwd_inner_microstep: 249.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:55,448] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.50 | bwd_microstep: 248.06 | bwd_inner_microstep: 248.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:55,887] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.02 | bwd_microstep: 248.22 | bwd_inner_microstep: 248.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:48:56,340] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.46 | bwd_microstep: 247.15 | bwd_inner_microstep: 247.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:56,771] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.65 | bwd_microstep: 244.13 | bwd_inner_microstep: 244.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:57,204] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.52 | bwd_microstep: 245.59 | bwd_inner_microstep: 245.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:48:57,662] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.60 | bwd_microstep: 262.99 | bwd_inner_microstep: 262.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:58,089] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.16 | bwd_microstep: 241.72 | bwd_inner_microstep: 241.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:48:58,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.91 | bwd_microstep: 244.20 | bwd_inner_microstep: 244.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:48:58,952] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.33 | bwd_microstep: 243.43 | bwd_inner_microstep: 243.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:48:59,375] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.25 | bwd_microstep: 243.17 | bwd_inner_microstep: 243.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:48:59,840] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 205.95 | bwd_microstep: 241.55 | bwd_inner_microstep: 241.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:49:00,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.51 | optimizer_gradients: 0.61 | optimizer_step: 3.11 +[2024-12-31 17:49:00,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.53 | bwd_microstep: 345.62 | bwd_inner_microstep: 243.44 | bwd_allreduce_microstep: 102.13 | step_microstep: 11.47 +[2024-12-31 17:49:00,382] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2826.18 | bwd: 4142.56 | bwd_inner: 4039.63 | bwd_allreduce: 102.32 | step: 14.24 + 35%|███▌ | 269/759 [38:55<59:44, 7.32s/it] {'loss': 1.2937, 'learning_rate': 1.4975335775926547e-05, 'epoch': 0.35} + 35%|███▌ | 269/759 [38:55<59:44, 7.32s/it][2024-12-31 17:49:01,015] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 234.75 | bwd_microstep: 381.74 | bwd_inner_microstep: 381.37 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 17:49:01,483] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.64 | bwd_microstep: 268.18 | bwd_inner_microstep: 268.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:49:02,013] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 238.33 | bwd_microstep: 268.40 | bwd_inner_microstep: 268.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:02,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.70 | bwd_microstep: 255.60 | bwd_inner_microstep: 255.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:49:02,910] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.41 | bwd_microstep: 254.90 | bwd_inner_microstep: 254.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:49:03,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.46 | bwd_microstep: 255.03 | bwd_inner_microstep: 255.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:03,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.36 | bwd_microstep: 247.71 | bwd_inner_microstep: 247.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:49:04,243] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.15 | bwd_microstep: 260.63 | bwd_inner_microstep: 260.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:04,677] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.61 | bwd_microstep: 245.21 | bwd_inner_microstep: 245.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:05,109] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.66 | bwd_microstep: 244.31 | bwd_inner_microstep: 244.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:05,556] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.28 | bwd_microstep: 257.49 | bwd_inner_microstep: 257.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:05,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.65 | bwd_microstep: 245.09 | bwd_inner_microstep: 245.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:06,422] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.59 | bwd_microstep: 243.64 | bwd_inner_microstep: 243.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:49:06,856] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.80 | bwd_microstep: 244.20 | bwd_inner_microstep: 244.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:49:07,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.98 | bwd_microstep: 245.98 | bwd_inner_microstep: 245.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:07,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.66 | optimizer_gradients: 0.81 | optimizer_step: 3.35 +[2024-12-31 17:49:07,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.50 | bwd_microstep: 255.66 | bwd_inner_microstep: 242.02 | bwd_allreduce_microstep: 13.54 | step_microstep: 11.19 +[2024-12-31 17:49:07,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2889.28 | bwd: 4173.94 | bwd_inner: 4159.39 | bwd_allreduce: 13.83 | step: 14.18 + 36%|███▌ | 270/759 [39:02<59:44, 7.33s/it] {'loss': 1.2656, 'learning_rate': 1.4938263960928878e-05, 'epoch': 0.36} + 36%|███▌ | 270/759 [39:02<59:44, 7.33s/it][2024-12-31 17:49:08,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 275.34 | bwd_microstep: 472.40 | bwd_inner_microstep: 472.03 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 17:49:09,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 239.93 | bwd_microstep: 405.78 | bwd_inner_microstep: 405.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:49:09,667] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.23 | bwd_microstep: 286.87 | bwd_inner_microstep: 286.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:10,126] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.16 | bwd_microstep: 263.61 | bwd_inner_microstep: 263.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:10,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.65 | bwd_microstep: 250.66 | bwd_inner_microstep: 250.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:11,009] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.39 | bwd_microstep: 250.56 | bwd_inner_microstep: 250.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:11,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.39 | bwd_microstep: 254.93 | bwd_inner_microstep: 254.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:49:11,898] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.91 | bwd_microstep: 248.45 | bwd_inner_microstep: 248.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:49:12,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.65 | bwd_microstep: 247.72 | bwd_inner_microstep: 247.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:49:12,782] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.83 | bwd_microstep: 246.48 | bwd_inner_microstep: 246.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:49:13,216] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.37 | bwd_microstep: 245.65 | bwd_inner_microstep: 245.19 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.35 +[2024-12-31 17:49:13,648] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.96 | bwd_microstep: 243.83 | bwd_inner_microstep: 243.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:14,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.72 | bwd_microstep: 244.38 | bwd_inner_microstep: 244.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:14,512] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.24 | bwd_microstep: 244.25 | bwd_inner_microstep: 244.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:14,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.20 | bwd_microstep: 245.38 | bwd_inner_microstep: 245.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:15,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.92 | optimizer_gradients: 0.71 | optimizer_step: 3.17 +[2024-12-31 17:49:15,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 393.36 | bwd_inner_microstep: 243.78 | bwd_allreduce_microstep: 149.52 | step_microstep: 10.75 +[2024-12-31 17:49:15,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2953.67 | bwd: 4544.41 | bwd_inner: 4393.62 | bwd_allreduce: 150.07 | step: 13.96 + 36%|███▌ | 271/759 [39:10<1:00:44, 7.47s/it] {'loss': 1.2233, 'learning_rate': 1.4901102171780175e-05, 'epoch': 0.36} + 36%|███▌ | 271/759 [39:10<1:00:44, 7.47s/it][2024-12-31 17:49:16,118] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 220.27 | bwd_microstep: 344.74 | bwd_inner_microstep: 344.40 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 17:49:16,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 239.62 | bwd_microstep: 391.77 | bwd_inner_microstep: 391.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:49:17,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.17 | bwd_microstep: 263.25 | bwd_inner_microstep: 263.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:17,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.45 | bwd_microstep: 258.08 | bwd_inner_microstep: 258.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:18,127] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.63 | bwd_microstep: 257.70 | bwd_inner_microstep: 257.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:49:18,561] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 246.72 | bwd_inner_microstep: 246.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:19,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.19 | bwd_microstep: 250.47 | bwd_inner_microstep: 250.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:19,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.49 | bwd_microstep: 245.47 | bwd_inner_microstep: 245.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 17:49:19,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.86 | bwd_microstep: 247.40 | bwd_inner_microstep: 247.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:20,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.07 | bwd_microstep: 245.37 | bwd_inner_microstep: 245.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:20,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 196.00 | bwd_microstep: 247.87 | bwd_inner_microstep: 247.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:21,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.21 | bwd_microstep: 242.97 | bwd_inner_microstep: 242.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:21,624] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.84 | bwd_microstep: 241.78 | bwd_inner_microstep: 241.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:22,050] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.61 | bwd_microstep: 240.91 | bwd_inner_microstep: 240.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:22,482] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 243.89 | bwd_inner_microstep: 243.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:49:22,934] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.85 | optimizer_gradients: 1.14 | optimizer_step: 3.29 +[2024-12-31 17:49:22,934] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.31 | bwd_microstep: 255.66 | bwd_inner_microstep: 241.63 | bwd_allreduce_microstep: 13.94 | step_microstep: 11.48 +[2024-12-31 17:49:22,937] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2883.91 | bwd: 4224.35 | bwd_inner: 4209.24 | bwd_allreduce: 14.21 | step: 14.65 + 36%|███▌ | 272/759 [39:17<1:00:27, 7.45s/it] {'loss': 1.2493, 'learning_rate': 1.4863851085560563e-05, 'epoch': 0.36} + 36%|███▌ | 272/759 [39:17<1:00:27, 7.45s/it][2024-12-31 17:49:23,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.44 | bwd_microstep: 337.02 | bwd_inner_microstep: 336.67 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.33 +[2024-12-31 17:49:24,095] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 222.42 | bwd_microstep: 351.80 | bwd_inner_microstep: 351.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:24,562] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.58 | bwd_microstep: 266.89 | bwd_inner_microstep: 266.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:25,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.64 | bwd_microstep: 265.65 | bwd_inner_microstep: 265.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:25,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.32 | bwd_microstep: 254.84 | bwd_inner_microstep: 254.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:25,917] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.56 | bwd_microstep: 246.04 | bwd_inner_microstep: 246.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:49:26,663] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 270.24 | bwd_microstep: 458.19 | bwd_inner_microstep: 458.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:27,117] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.12 | bwd_microstep: 264.42 | bwd_inner_microstep: 264.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:27,558] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.84 | bwd_microstep: 252.14 | bwd_inner_microstep: 252.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:27,992] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.28 | bwd_microstep: 245.33 | bwd_inner_microstep: 245.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:28,438] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.52 | bwd_microstep: 254.05 | bwd_inner_microstep: 254.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:28,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.54 | bwd_microstep: 245.50 | bwd_inner_microstep: 245.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:29,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.44 | bwd_microstep: 241.24 | bwd_inner_microstep: 241.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:29,736] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.92 | bwd_microstep: 243.44 | bwd_inner_microstep: 243.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:49:30,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.48 | bwd_microstep: 243.49 | bwd_inner_microstep: 243.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:30,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.16 | optimizer_gradients: 0.75 | optimizer_step: 3.27 +[2024-12-31 17:49:30,621] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.68 | bwd_microstep: 256.78 | bwd_inner_microstep: 243.17 | bwd_allreduce_microstep: 13.52 | step_microstep: 11.84 +[2024-12-31 17:49:30,623] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2960.39 | bwd: 4426.99 | bwd_inner: 4412.56 | bwd_allreduce: 13.79 | step: 15.09 + 36%|███▌ | 273/759 [39:25<1:00:54, 7.52s/it] {'loss': 1.2665, 'learning_rate': 1.4826511380977155e-05, 'epoch': 0.36} + 36%|███▌ | 273/759 [39:25<1:00:54, 7.52s/it][2024-12-31 17:49:31,198] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.19 | bwd_microstep: 342.08 | bwd_inner_microstep: 341.73 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 17:49:31,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.72 | bwd_microstep: 300.98 | bwd_inner_microstep: 299.68 | bwd_allreduce_microstep: 0.51 | step_microstep: 0.53 +[2024-12-31 17:49:32,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.88 | bwd_microstep: 266.66 | bwd_inner_microstep: 266.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:32,635] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.88 | bwd_microstep: 264.22 | bwd_inner_microstep: 264.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:33,073] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.83 | bwd_microstep: 248.69 | bwd_inner_microstep: 248.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:33,506] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.72 | bwd_microstep: 244.15 | bwd_inner_microstep: 244.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:33,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.83 | bwd_microstep: 245.09 | bwd_inner_microstep: 245.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:49:34,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 204.74 | bwd_microstep: 252.15 | bwd_inner_microstep: 252.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:49:34,840] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.55 | bwd_microstep: 245.16 | bwd_inner_microstep: 245.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:35,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.70 | bwd_microstep: 243.90 | bwd_inner_microstep: 243.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:35,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.09 | bwd_microstep: 243.57 | bwd_inner_microstep: 243.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:36,138] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.39 | bwd_microstep: 243.79 | bwd_inner_microstep: 243.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.29 +[2024-12-31 17:49:36,634] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 235.79 | bwd_microstep: 241.13 | bwd_inner_microstep: 241.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:49:37,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.18 | bwd_microstep: 240.96 | bwd_inner_microstep: 240.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:37,491] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.50 | bwd_microstep: 242.37 | bwd_inner_microstep: 242.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:49:37,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.74 | optimizer_gradients: 0.76 | optimizer_step: 3.40 +[2024-12-31 17:49:37,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.45 | bwd_microstep: 257.20 | bwd_inner_microstep: 243.49 | bwd_allreduce_microstep: 13.60 | step_microstep: 11.23 +[2024-12-31 17:49:37,947] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2915.85 | bwd: 4122.33 | bwd_inner: 4106.67 | bwd_allreduce: 14.28 | step: 14.56 + 36%|███▌ | 274/759 [39:32<1:00:18, 7.46s/it] {'loss': 1.2582, 'learning_rate': 1.478908373835167e-05, 'epoch': 0.36} + 36%|███▌ | 274/759 [39:32<1:00:18, 7.46s/it][2024-12-31 17:49:38,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 219.76 | bwd_microstep: 345.60 | bwd_inner_microstep: 345.26 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 17:49:39,023] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.03 | bwd_microstep: 287.84 | bwd_inner_microstep: 287.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:39,510] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.31 | bwd_microstep: 281.52 | bwd_inner_microstep: 281.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:39,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.86 | bwd_microstep: 261.91 | bwd_inner_microstep: 261.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:40,445] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.57 | bwd_microstep: 262.78 | bwd_inner_microstep: 262.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 17:49:40,884] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.75 | bwd_microstep: 248.65 | bwd_inner_microstep: 248.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:41,326] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.42 | bwd_microstep: 251.30 | bwd_inner_microstep: 251.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:41,771] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.87 | bwd_microstep: 252.76 | bwd_inner_microstep: 252.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:49:42,206] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.82 | bwd_microstep: 246.24 | bwd_inner_microstep: 246.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:42,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 244.04 | bwd_inner_microstep: 244.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:43,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.57 | bwd_microstep: 266.70 | bwd_inner_microstep: 266.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:43,540] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.03 | bwd_microstep: 246.74 | bwd_inner_microstep: 246.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:43,972] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.62 | bwd_microstep: 243.24 | bwd_inner_microstep: 243.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:49:44,397] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.99 | bwd_microstep: 245.07 | bwd_inner_microstep: 245.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:44,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 159.28 | bwd_microstep: 228.16 | bwd_inner_microstep: 228.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:49:45,255] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.56 | optimizer_gradients: 0.86 | optimizer_step: 3.40 +[2024-12-31 17:49:45,257] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.65 | bwd_microstep: 257.06 | bwd_inner_microstep: 243.23 | bwd_allreduce_microstep: 13.67 | step_microstep: 11.99 +[2024-12-31 17:49:45,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2850.61 | bwd: 4169.73 | bwd_inner: 4155.10 | bwd_allreduce: 13.95 | step: 14.94 + 36%|███▌ | 275/759 [39:40<59:49, 7.42s/it] {'loss': 1.2779, 'learning_rate': 1.4751568839608036e-05, 'epoch': 0.36} + 36%|███▌ | 275/759 [39:40<59:49, 7.42s/it][2024-12-31 17:49:45,830] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.05 | bwd_microstep: 339.01 | bwd_inner_microstep: 338.65 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 17:49:46,484] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 240.64 | bwd_microstep: 395.33 | bwd_inner_microstep: 395.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:49:46,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.62 | bwd_microstep: 280.42 | bwd_inner_microstep: 280.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:47,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.72 | bwd_microstep: 264.47 | bwd_inner_microstep: 264.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:49:47,868] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.34 | bwd_microstep: 255.82 | bwd_inner_microstep: 255.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 17:49:48,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.34 | bwd_microstep: 248.76 | bwd_inner_microstep: 248.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:49:48,725] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.67 | bwd_microstep: 246.12 | bwd_inner_microstep: 246.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:49:49,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.28 | bwd_microstep: 246.96 | bwd_inner_microstep: 246.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:49:49,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.08 | bwd_microstep: 245.60 | bwd_inner_microstep: 245.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:50,026] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.87 | bwd_microstep: 245.27 | bwd_inner_microstep: 245.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:50,463] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.68 | bwd_microstep: 245.41 | bwd_inner_microstep: 245.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:49:50,903] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.52 | bwd_microstep: 252.61 | bwd_inner_microstep: 252.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:49:51,328] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.84 | bwd_microstep: 243.07 | bwd_inner_microstep: 243.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:49:51,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.69 | bwd_microstep: 244.14 | bwd_inner_microstep: 244.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:52,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.35 | bwd_microstep: 241.65 | bwd_inner_microstep: 241.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:49:52,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.90 | optimizer_gradients: 0.87 | optimizer_step: 3.19 +[2024-12-31 17:49:52,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 158.31 | bwd_microstep: 370.58 | bwd_inner_microstep: 226.57 | bwd_allreduce_microstep: 143.97 | step_microstep: 10.86 +[2024-12-31 17:49:52,744] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2870.49 | bwd: 4365.31 | bwd_inner: 4220.52 | bwd_allreduce: 144.22 | step: 13.08 + 36%|███▋ | 276/759 [39:47<59:51, 7.44s/it] {'loss': 1.2421, 'learning_rate': 1.4713967368259981e-05, 'epoch': 0.36} + 36%|███▋ | 276/759 [39:47<59:51, 7.44s/it][2024-12-31 17:49:53,289] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 205.65 | bwd_microstep: 318.51 | bwd_inner_microstep: 318.17 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.22 +[2024-12-31 17:49:53,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.99 | bwd_microstep: 305.60 | bwd_inner_microstep: 305.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:54,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.54 | bwd_microstep: 283.61 | bwd_inner_microstep: 283.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:49:54,752] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.59 | bwd_microstep: 263.77 | bwd_inner_microstep: 263.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:55,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.77 | bwd_microstep: 256.62 | bwd_inner_microstep: 256.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:49:55,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.90 | bwd_microstep: 248.87 | bwd_inner_microstep: 248.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:49:56,078] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.96 | bwd_microstep: 247.87 | bwd_inner_microstep: 247.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:56,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.34 | bwd_microstep: 248.60 | bwd_inner_microstep: 248.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:56,948] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 245.64 | bwd_inner_microstep: 245.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:57,382] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.38 | bwd_microstep: 245.28 | bwd_inner_microstep: 245.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:49:57,823] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.92 | bwd_microstep: 253.94 | bwd_inner_microstep: 253.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:58,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.03 | bwd_microstep: 244.79 | bwd_inner_microstep: 244.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:49:58,685] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.36 | bwd_microstep: 243.86 | bwd_inner_microstep: 243.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:49:59,110] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 242.57 | bwd_inner_microstep: 242.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:49:59,540] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.61 | bwd_microstep: 243.17 | bwd_inner_microstep: 243.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:50:00,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.65 | optimizer_gradients: 0.80 | optimizer_step: 3.09 +[2024-12-31 17:50:00,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.73 | bwd_microstep: 442.69 | bwd_inner_microstep: 243.92 | bwd_allreduce_microstep: 198.72 | step_microstep: 22.36 +[2024-12-31 17:50:00,194] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2816.98 | bwd: 4335.52 | bwd_inner: 4135.96 | bwd_allreduce: 198.96 | step: 25.34 + 36%|███▋ | 277/759 [39:55<59:46, 7.44s/it] {'loss': 1.232, 'learning_rate': 1.4676280009398544e-05, 'epoch': 0.36} + 36%|███▋ | 277/759 [39:55<59:46, 7.44s/it][2024-12-31 17:50:00,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.79 | bwd_microstep: 311.28 | bwd_inner_microstep: 310.94 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 17:50:01,334] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 230.53 | bwd_microstep: 365.61 | bwd_inner_microstep: 365.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:50:01,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.98 | bwd_microstep: 267.36 | bwd_inner_microstep: 267.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:50:02,250] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.10 | bwd_microstep: 254.51 | bwd_inner_microstep: 254.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:50:02,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.66 | bwd_microstep: 256.07 | bwd_inner_microstep: 256.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:50:03,143] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.10 | bwd_microstep: 250.65 | bwd_inner_microstep: 250.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:03,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.30 | bwd_microstep: 251.42 | bwd_inner_microstep: 251.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:50:04,020] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.60 | bwd_microstep: 246.18 | bwd_inner_microstep: 246.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:04,463] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 255.14 | bwd_inner_microstep: 255.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 17:50:04,901] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.18 | bwd_microstep: 252.26 | bwd_inner_microstep: 252.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:50:05,323] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.63 | bwd_microstep: 243.13 | bwd_inner_microstep: 243.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:05,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.04 | bwd_microstep: 244.99 | bwd_inner_microstep: 244.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:50:06,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.65 | bwd_microstep: 246.11 | bwd_inner_microstep: 246.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:50:06,622] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.17 | bwd_microstep: 242.87 | bwd_inner_microstep: 242.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:50:07,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.25 | bwd_microstep: 241.45 | bwd_inner_microstep: 241.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:07,500] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.58 | optimizer_gradients: 0.86 | optimizer_step: 3.52 +[2024-12-31 17:50:07,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.33 | bwd_microstep: 255.73 | bwd_inner_microstep: 242.01 | bwd_allreduce_microstep: 13.60 | step_microstep: 11.41 +[2024-12-31 17:50:07,503] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2842.99 | bwd: 4184.88 | bwd_inner: 4170.37 | bwd_allreduce: 13.88 | step: 14.28 + 37%|███▋ | 278/759 [40:02<59:20, 7.40s/it] {'loss': 1.2347, 'learning_rate': 1.4638507449679642e-05, 'epoch': 0.37} + 37%|███▋ | 278/759 [40:02<59:20, 7.40s/it][2024-12-31 17:50:08,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 244.98 | bwd_microstep: 402.45 | bwd_inner_microstep: 402.06 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 17:50:08,678] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.63 | bwd_microstep: 298.30 | bwd_inner_microstep: 298.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:50:09,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.59 | bwd_microstep: 279.66 | bwd_inner_microstep: 279.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:50:09,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.44 | bwd_microstep: 262.36 | bwd_inner_microstep: 262.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:10,062] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.21 | bwd_microstep: 256.71 | bwd_inner_microstep: 256.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:50:10,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.39 | bwd_microstep: 248.90 | bwd_inner_microstep: 248.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:10,960] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.33 | bwd_microstep: 248.35 | bwd_inner_microstep: 248.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:50:11,403] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.95 | bwd_microstep: 247.32 | bwd_inner_microstep: 247.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:11,838] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.00 | bwd_microstep: 245.86 | bwd_inner_microstep: 245.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:12,271] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.44 | bwd_microstep: 244.85 | bwd_inner_microstep: 244.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:12,703] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.80 | bwd_microstep: 243.76 | bwd_inner_microstep: 243.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:13,138] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 245.16 | bwd_inner_microstep: 245.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:50:13,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.63 | bwd_microstep: 241.56 | bwd_inner_microstep: 241.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:50:14,009] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.16 | bwd_microstep: 242.69 | bwd_inner_microstep: 242.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:14,443] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.70 | bwd_microstep: 240.82 | bwd_inner_microstep: 240.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:14,897] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.61 | optimizer_gradients: 0.62 | optimizer_step: 3.36 +[2024-12-31 17:50:14,897] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.23 | bwd_microstep: 257.20 | bwd_inner_microstep: 243.45 | bwd_allreduce_microstep: 13.63 | step_microstep: 11.27 +[2024-12-31 17:50:14,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2899.06 | bwd: 4206.11 | bwd_inner: 4191.50 | bwd_allreduce: 13.93 | step: 14.24 + 37%|███▋ | 279/759 [40:09<59:12, 7.40s/it] {'loss': 1.2076, 'learning_rate': 1.4600650377311523e-05, 'epoch': 0.37} + 37%|███▋ | 279/759 [40:09<59:12, 7.40s/it][2024-12-31 17:50:15,512] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 216.17 | bwd_microstep: 381.64 | bwd_inner_microstep: 381.27 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 17:50:16,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 226.22 | bwd_microstep: 364.52 | bwd_inner_microstep: 364.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:50:16,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.71 | bwd_microstep: 279.54 | bwd_inner_microstep: 279.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:50:17,069] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.34 | bwd_microstep: 258.56 | bwd_inner_microstep: 258.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:50:17,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.79 | bwd_microstep: 255.24 | bwd_inner_microstep: 255.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:50:17,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.35 | bwd_microstep: 249.85 | bwd_inner_microstep: 249.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:50:18,396] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.42 | bwd_microstep: 247.49 | bwd_inner_microstep: 247.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:50:18,827] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.18 | bwd_microstep: 243.71 | bwd_inner_microstep: 243.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:19,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.11 | bwd_microstep: 246.07 | bwd_inner_microstep: 246.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:19,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.14 | bwd_microstep: 250.50 | bwd_inner_microstep: 250.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:50:20,137] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.17 | bwd_microstep: 243.90 | bwd_inner_microstep: 243.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:50:20,569] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.34 | bwd_microstep: 244.64 | bwd_inner_microstep: 244.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:50:20,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.22 | bwd_microstep: 241.53 | bwd_inner_microstep: 241.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:21,432] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.06 | bwd_microstep: 250.48 | bwd_inner_microstep: 250.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:50:21,859] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.13 | bwd_microstep: 241.40 | bwd_inner_microstep: 241.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:50:22,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.58 | optimizer_gradients: 1.00 | optimizer_step: 3.44 +[2024-12-31 17:50:22,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.66 | bwd_microstep: 369.01 | bwd_inner_microstep: 243.57 | bwd_allreduce_microstep: 125.37 | step_microstep: 11.62 +[2024-12-31 17:50:22,420] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2847.44 | bwd: 4368.19 | bwd_inner: 4241.96 | bwd_allreduce: 125.63 | step: 14.57 + 37%|███▋ | 280/759 [40:17<59:22, 7.44s/it] {'loss': 1.2774, 'learning_rate': 1.4562709482042237e-05, 'epoch': 0.37} + 37%|███▋ | 280/759 [40:17<59:22, 7.44s/it][2024-12-31 17:50:23,068] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 229.13 | bwd_microstep: 372.76 | bwd_inner_microstep: 372.40 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.19 +[2024-12-31 17:50:23,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.78 | bwd_microstep: 292.69 | bwd_inner_microstep: 292.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:24,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.61 | bwd_microstep: 283.08 | bwd_inner_microstep: 283.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:50:24,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.48 | bwd_microstep: 257.83 | bwd_inner_microstep: 257.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:24,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.75 | bwd_microstep: 255.61 | bwd_inner_microstep: 255.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:50:25,406] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.08 | bwd_microstep: 250.48 | bwd_inner_microstep: 250.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:50:25,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.49 | bwd_microstep: 250.56 | bwd_inner_microstep: 250.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:50:26,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.82 | bwd_microstep: 268.48 | bwd_inner_microstep: 268.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:50:26,754] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.40 | bwd_microstep: 245.10 | bwd_inner_microstep: 245.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:50:27,189] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.06 | bwd_microstep: 246.21 | bwd_inner_microstep: 246.00 | bwd_allreduce_microstep: 0.05 | step_microstep: 0.31 +[2024-12-31 17:50:27,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.39 | bwd_microstep: 244.87 | bwd_inner_microstep: 244.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:28,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.81 | bwd_microstep: 245.10 | bwd_inner_microstep: 245.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:28,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.43 | bwd_microstep: 244.13 | bwd_inner_microstep: 244.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:50:28,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 243.75 | bwd_inner_microstep: 243.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:50:29,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.30 | bwd_microstep: 244.40 | bwd_inner_microstep: 244.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:50:29,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.64 | optimizer_gradients: 4.04 | optimizer_step: 3.17 +[2024-12-31 17:50:29,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.86 | bwd_microstep: 393.05 | bwd_inner_microstep: 246.38 | bwd_allreduce_microstep: 146.62 | step_microstep: 21.98 +[2024-12-31 17:50:29,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2869.96 | bwd: 4338.29 | bwd_inner: 4190.52 | bwd_allreduce: 147.00 | step: 24.76 + 37%|███▋ | 281/759 [40:24<59:26, 7.46s/it] {'loss': 1.221, 'learning_rate': 1.4524685455147071e-05, 'epoch': 0.37} + 37%|███▋ | 281/759 [40:24<59:26, 7.46s/it][2024-12-31 17:50:30,521] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.66 | bwd_microstep: 338.62 | bwd_inner_microstep: 338.27 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 17:50:31,112] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 221.56 | bwd_microstep: 352.77 | bwd_inner_microstep: 352.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:31,613] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.33 | bwd_microstep: 291.28 | bwd_inner_microstep: 291.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:32,072] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.00 | bwd_microstep: 263.56 | bwd_inner_microstep: 263.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:50:32,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.66 | bwd_microstep: 263.37 | bwd_inner_microstep: 263.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:50:32,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.97 | bwd_microstep: 255.38 | bwd_inner_microstep: 255.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:50:33,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.92 | bwd_microstep: 273.55 | bwd_inner_microstep: 273.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:33,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.22 | bwd_microstep: 245.30 | bwd_inner_microstep: 245.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:50:34,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.35 | bwd_microstep: 245.10 | bwd_inner_microstep: 245.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:50:34,744] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.90 | bwd_microstep: 245.09 | bwd_inner_microstep: 245.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:35,176] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 243.34 | bwd_inner_microstep: 243.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:35,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.96 | bwd_microstep: 243.95 | bwd_inner_microstep: 243.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:50:36,036] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.66 | bwd_microstep: 243.12 | bwd_inner_microstep: 243.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:50:36,467] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.20 | bwd_microstep: 242.66 | bwd_inner_microstep: 242.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:50:36,898] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.87 | bwd_microstep: 243.49 | bwd_inner_microstep: 243.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:37,664] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.18 | optimizer_gradients: 4.38 | optimizer_step: 3.14 +[2024-12-31 17:50:37,665] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.75 | bwd_microstep: 561.74 | bwd_inner_microstep: 242.82 | bwd_allreduce_microstep: 318.88 | step_microstep: 15.93 +[2024-12-31 17:50:37,666] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2884.34 | bwd: 4552.39 | bwd_inner: 4232.71 | bwd_allreduce: 319.15 | step: 18.70 + 37%|███▋ | 282/759 [40:32<59:56, 7.54s/it] {'loss': 1.2758, 'learning_rate': 1.448657898941596e-05, 'epoch': 0.37} + 37%|███▋ | 282/759 [40:32<59:56, 7.54s/it][2024-12-31 17:50:38,246] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.87 | bwd_microstep: 317.08 | bwd_inner_microstep: 316.71 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 17:50:38,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.17 | bwd_microstep: 288.04 | bwd_inner_microstep: 288.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:50:39,229] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.09 | bwd_microstep: 282.48 | bwd_inner_microstep: 282.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:39,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.01 | bwd_microstep: 255.76 | bwd_inner_microstep: 255.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:50:40,137] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.42 | bwd_microstep: 264.60 | bwd_inner_microstep: 264.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:40,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.87 | bwd_microstep: 276.60 | bwd_inner_microstep: 276.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 17:50:41,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.24 | bwd_microstep: 247.65 | bwd_inner_microstep: 247.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:41,474] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.06 | bwd_microstep: 246.78 | bwd_inner_microstep: 246.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:50:41,911] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.41 | bwd_microstep: 249.18 | bwd_inner_microstep: 249.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:42,343] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.04 | bwd_microstep: 244.18 | bwd_inner_microstep: 244.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:42,776] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.58 | bwd_microstep: 244.71 | bwd_inner_microstep: 244.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:43,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.73 | bwd_microstep: 243.69 | bwd_inner_microstep: 243.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:50:43,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.13 | bwd_microstep: 243.84 | bwd_inner_microstep: 243.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:44,069] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.84 | bwd_microstep: 244.05 | bwd_inner_microstep: 244.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:50:44,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.92 | bwd_microstep: 242.99 | bwd_inner_microstep: 242.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:45,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.59 | optimizer_gradients: 0.56 | optimizer_step: 3.09 +[2024-12-31 17:50:45,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.27 | bwd_microstep: 471.32 | bwd_inner_microstep: 243.50 | bwd_allreduce_microstep: 227.78 | step_microstep: 11.52 +[2024-12-31 17:50:45,180] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2826.09 | bwd: 4363.01 | bwd_inner: 4134.42 | bwd_allreduce: 228.03 | step: 14.39 + 37%|███▋ | 283/759 [40:40<59:45, 7.53s/it] {'loss': 1.2019, 'learning_rate': 1.4448390779140844e-05, 'epoch': 0.37} + 37%|███▋ | 283/759 [40:40<59:45, 7.53s/it][2024-12-31 17:50:45,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.31 | bwd_microstep: 313.77 | bwd_inner_microstep: 313.43 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 17:50:46,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.76 | bwd_microstep: 288.89 | bwd_inner_microstep: 288.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:46,670] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.76 | bwd_microstep: 263.51 | bwd_inner_microstep: 263.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:47,109] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 249.67 | bwd_inner_microstep: 249.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.10 +[2024-12-31 17:50:47,544] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.23 | bwd_microstep: 247.10 | bwd_inner_microstep: 247.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:50:47,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.59 | bwd_microstep: 246.85 | bwd_inner_microstep: 246.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:50:48,421] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.58 | bwd_microstep: 249.63 | bwd_inner_microstep: 249.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:50:48,854] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.44 | bwd_microstep: 244.96 | bwd_inner_microstep: 244.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:49,287] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 244.31 | bwd_inner_microstep: 244.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:49,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.90 | bwd_microstep: 246.17 | bwd_inner_microstep: 246.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:50,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 244.65 | bwd_inner_microstep: 244.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:50,587] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.08 | bwd_microstep: 243.64 | bwd_inner_microstep: 243.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:50:51,022] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.67 | bwd_microstep: 244.79 | bwd_inner_microstep: 244.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:50:51,453] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.22 | bwd_microstep: 242.51 | bwd_inner_microstep: 242.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:51,895] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.13 | bwd_microstep: 245.34 | bwd_inner_microstep: 245.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:50:52,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 7.07 | optimizer_gradients: 0.88 | optimizer_step: 3.88 +[2024-12-31 17:50:52,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.65 | bwd_microstep: 255.78 | bwd_inner_microstep: 242.08 | bwd_allreduce_microstep: 13.59 | step_microstep: 64.54 +[2024-12-31 17:50:52,396] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2799.22 | bwd: 4071.74 | bwd_inner: 4057.12 | bwd_allreduce: 13.89 | step: 67.29 + 37%|███▋ | 284/759 [40:47<58:53, 7.44s/it] {'loss': 1.2473, 'learning_rate': 1.4410121520103045e-05, 'epoch': 0.37} + 37%|███▋ | 284/759 [40:47<58:53, 7.44s/it][2024-12-31 17:50:52,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.82 | bwd_microstep: 339.33 | bwd_inner_microstep: 338.96 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 17:50:53,470] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.38 | bwd_microstep: 291.37 | bwd_inner_microstep: 291.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:50:53,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.50 | bwd_microstep: 291.85 | bwd_inner_microstep: 291.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:54,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.99 | bwd_microstep: 266.84 | bwd_inner_microstep: 266.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 17:50:54,903] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.65 | bwd_microstep: 266.91 | bwd_inner_microstep: 266.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:50:55,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.10 | bwd_microstep: 250.29 | bwd_inner_microstep: 250.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:50:55,788] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.35 | bwd_microstep: 250.50 | bwd_inner_microstep: 250.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:50:56,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.73 | bwd_microstep: 248.38 | bwd_inner_microstep: 248.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:56,659] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.63 | bwd_microstep: 245.59 | bwd_inner_microstep: 245.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:50:57,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 243.43 | bwd_inner_microstep: 243.31 | bwd_allreduce_microstep: 0.02 | step_microstep: 0.22 +[2024-12-31 17:50:57,513] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.82 | bwd_microstep: 243.57 | bwd_inner_microstep: 243.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:57,950] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.96 | bwd_microstep: 249.84 | bwd_inner_microstep: 249.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:50:58,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 282.35 | bwd_microstep: 450.66 | bwd_inner_microstep: 450.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:50:59,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.91 | bwd_microstep: 243.46 | bwd_inner_microstep: 243.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:50:59,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.68 | bwd_microstep: 241.45 | bwd_inner_microstep: 241.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:50:59,980] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.62 | optimizer_gradients: 1.07 | optimizer_step: 3.53 +[2024-12-31 17:50:59,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 157.82 | bwd_microstep: 240.26 | bwd_inner_microstep: 226.27 | bwd_allreduce_microstep: 13.90 | step_microstep: 12.02 +[2024-12-31 17:50:59,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2935.89 | bwd: 4363.85 | bwd_inner: 4348.97 | bwd_allreduce: 14.22 | step: 15.03 + 38%|███▊ | 285/759 [40:54<59:06, 7.48s/it] {'loss': 1.2239, 'learning_rate': 1.4371771909560566e-05, 'epoch': 0.38} + 38%|███▊ | 285/759 [40:54<59:06, 7.48s/it][2024-12-31 17:51:00,556] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.05 | bwd_microstep: 336.52 | bwd_inner_microstep: 336.15 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 17:51:01,050] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.24 | bwd_microstep: 287.15 | bwd_inner_microstep: 287.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:51:01,529] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.92 | bwd_microstep: 284.54 | bwd_inner_microstep: 284.19 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.21 +[2024-12-31 17:51:01,982] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.52 | bwd_microstep: 257.52 | bwd_inner_microstep: 257.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:51:02,431] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.51 | bwd_microstep: 254.62 | bwd_inner_microstep: 254.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:02,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.87 | bwd_microstep: 249.23 | bwd_inner_microstep: 249.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 17:51:03,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.30 | bwd_microstep: 247.64 | bwd_inner_microstep: 247.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:03,739] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.68 | bwd_microstep: 245.09 | bwd_inner_microstep: 245.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:04,172] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.40 | bwd_microstep: 243.97 | bwd_inner_microstep: 243.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:04,603] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.90 | bwd_microstep: 244.04 | bwd_inner_microstep: 244.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:05,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.69 | bwd_microstep: 245.94 | bwd_inner_microstep: 245.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:05,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.51 | bwd_microstep: 243.14 | bwd_inner_microstep: 243.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:05,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.75 | bwd_microstep: 241.22 | bwd_inner_microstep: 241.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:06,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.92 | bwd_microstep: 243.03 | bwd_inner_microstep: 243.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:06,766] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.28 | bwd_microstep: 247.60 | bwd_inner_microstep: 247.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:07,611] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.67 | optimizer_gradients: 0.62 | optimizer_step: 3.12 +[2024-12-31 17:51:07,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.64 | bwd_microstep: 648.91 | bwd_inner_microstep: 241.75 | bwd_allreduce_microstep: 407.11 | step_microstep: 13.39 +[2024-12-31 17:51:07,613] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2807.61 | bwd: 4520.30 | bwd_inner: 4112.02 | bwd_allreduce: 407.49 | step: 16.45 + 38%|███▊ | 286/759 [41:02<59:19, 7.53s/it] {'loss': 1.2389, 'learning_rate': 1.4333342646235407e-05, 'epoch': 0.38} + 38%|███▊ | 286/759 [41:02<59:19, 7.53s/it][2024-12-31 17:51:08,143] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.79 | bwd_microstep: 313.07 | bwd_inner_microstep: 312.70 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 17:51:09,058] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.73 | bwd_microstep: 680.83 | bwd_inner_microstep: 680.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.34 +[2024-12-31 17:51:09,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.26 | bwd_microstep: 262.19 | bwd_inner_microstep: 262.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:09,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.45 | bwd_microstep: 262.67 | bwd_inner_microstep: 262.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:51:10,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.72 | bwd_microstep: 248.49 | bwd_inner_microstep: 248.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:10,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.94 | bwd_microstep: 249.20 | bwd_inner_microstep: 249.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:11,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.28 | bwd_microstep: 248.23 | bwd_inner_microstep: 248.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:11,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.89 | bwd_microstep: 247.52 | bwd_inner_microstep: 247.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:12,164] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.74 | bwd_microstep: 245.61 | bwd_inner_microstep: 245.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:12,606] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.79 | bwd_microstep: 244.74 | bwd_inner_microstep: 244.51 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.22 +[2024-12-31 17:51:13,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 244.32 | bwd_inner_microstep: 244.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:13,469] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 243.16 | bwd_inner_microstep: 243.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 17:51:13,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 200.14 | bwd_microstep: 242.76 | bwd_inner_microstep: 242.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:51:14,389] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 207.16 | bwd_microstep: 242.66 | bwd_inner_microstep: 242.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:14,815] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.43 | bwd_microstep: 240.80 | bwd_inner_microstep: 240.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:15,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.87 | optimizer_gradients: 0.82 | optimizer_step: 3.11 +[2024-12-31 17:51:15,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.42 | bwd_microstep: 282.99 | bwd_inner_microstep: 242.98 | bwd_allreduce_microstep: 39.96 | step_microstep: 11.25 +[2024-12-31 17:51:15,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2901.33 | bwd: 4499.33 | bwd_inner: 4458.34 | bwd_allreduce: 40.35 | step: 14.30 + 38%|███▊ | 287/759 [41:10<59:34, 7.57s/it] {'loss': 1.2811, 'learning_rate': 1.4294834430300822e-05, 'epoch': 0.38} + 38%|███▊ | 287/759 [41:10<59:34, 7.57s/it][2024-12-31 17:51:15,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 205.11 | bwd_microstep: 316.67 | bwd_inner_microstep: 316.32 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.21 +[2024-12-31 17:51:16,344] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.57 | bwd_microstep: 296.89 | bwd_inner_microstep: 296.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:16,849] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.92 | bwd_microstep: 298.77 | bwd_inner_microstep: 298.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:17,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.34 | bwd_microstep: 261.96 | bwd_inner_microstep: 261.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:17,747] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.84 | bwd_microstep: 249.00 | bwd_inner_microstep: 248.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:18,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.44 | bwd_microstep: 249.54 | bwd_inner_microstep: 249.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 17:51:18,623] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 246.46 | bwd_inner_microstep: 246.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:19,060] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.70 | bwd_microstep: 248.00 | bwd_inner_microstep: 247.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:51:19,493] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.14 | bwd_microstep: 246.07 | bwd_inner_microstep: 246.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:19,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.90 | bwd_microstep: 245.34 | bwd_inner_microstep: 245.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:51:20,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.65 | bwd_microstep: 243.83 | bwd_inner_microstep: 243.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:20,794] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.76 | bwd_microstep: 243.63 | bwd_inner_microstep: 243.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 17:51:21,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.63 | bwd_microstep: 241.03 | bwd_inner_microstep: 241.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:21,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.69 | bwd_microstep: 243.46 | bwd_inner_microstep: 243.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:22,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.61 | bwd_microstep: 244.28 | bwd_inner_microstep: 244.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:22,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.44 | optimizer_gradients: 0.62 | optimizer_step: 3.09 +[2024-12-31 17:51:22,580] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.53 | bwd_microstep: 299.61 | bwd_inner_microstep: 245.33 | bwd_allreduce_microstep: 54.24 | step_microstep: 11.41 +[2024-12-31 17:51:22,581] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2819.99 | bwd: 4174.59 | bwd_inner: 4119.59 | bwd_allreduce: 54.48 | step: 14.50 + 38%|███▊ | 288/759 [41:17<58:46, 7.49s/it] {'loss': 1.2226, 'learning_rate': 1.425624796336856e-05, 'epoch': 0.38} + 38%|███▊ | 288/759 [41:17<58:46, 7.49s/it][2024-12-31 17:51:23,225] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 236.25 | bwd_microstep: 386.80 | bwd_inner_microstep: 386.45 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.22 +[2024-12-31 17:51:23,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.66 | bwd_microstep: 267.42 | bwd_inner_microstep: 267.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:24,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.83 | bwd_microstep: 262.01 | bwd_inner_microstep: 261.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:24,711] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 282.77 | bwd_microstep: 261.31 | bwd_inner_microstep: 261.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:25,151] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.93 | bwd_microstep: 250.68 | bwd_inner_microstep: 250.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:25,587] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.30 | bwd_microstep: 247.87 | bwd_inner_microstep: 247.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 17:51:26,019] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.99 | bwd_microstep: 245.09 | bwd_inner_microstep: 245.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:26,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.81 | bwd_microstep: 244.21 | bwd_inner_microstep: 244.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:51:26,894] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.03 | bwd_microstep: 253.16 | bwd_inner_microstep: 253.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:51:27,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 194.96 | bwd_microstep: 244.80 | bwd_inner_microstep: 244.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 17:51:27,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.79 | bwd_microstep: 244.52 | bwd_inner_microstep: 244.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:28,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.10 | bwd_microstep: 242.35 | bwd_inner_microstep: 242.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:28,646] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.63 | bwd_microstep: 243.72 | bwd_inner_microstep: 243.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:29,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.20 | bwd_microstep: 244.35 | bwd_inner_microstep: 244.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:29,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.11 | bwd_microstep: 256.86 | bwd_inner_microstep: 256.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:30,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.39 | optimizer_gradients: 0.63 | optimizer_step: 3.08 +[2024-12-31 17:51:30,255] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.78 | bwd_microstep: 508.83 | bwd_inner_microstep: 241.49 | bwd_allreduce_microstep: 266.91 | step_microstep: 11.26 +[2024-12-31 17:51:30,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2947.51 | bwd: 4404.07 | bwd_inner: 4135.97 | bwd_allreduce: 266.84 | step: 14.37 + 38%|███▊ | 289/759 [41:25<59:05, 7.54s/it] {'loss': 1.2374, 'learning_rate': 1.4217583948476094e-05, 'epoch': 0.38} + 38%|███▊ | 289/759 [41:25<59:05, 7.54s/it][2024-12-31 17:51:30,828] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.87 | bwd_microstep: 341.37 | bwd_inner_microstep: 340.89 | bwd_allreduce_microstep: 0.18 | step_microstep: 0.25 +[2024-12-31 17:51:31,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 251.66 | bwd_microstep: 418.17 | bwd_inner_microstep: 418.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:31,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.84 | bwd_microstep: 264.51 | bwd_inner_microstep: 264.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:51:32,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.07 | bwd_microstep: 249.39 | bwd_inner_microstep: 249.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:32,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.12 | bwd_microstep: 291.15 | bwd_inner_microstep: 291.05 | bwd_allreduce_microstep: 0.02 | step_microstep: 0.84 +[2024-12-31 17:51:33,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.80 | bwd_microstep: 247.47 | bwd_inner_microstep: 247.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:33,780] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.63 | bwd_microstep: 246.20 | bwd_inner_microstep: 246.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:34,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.40 | bwd_microstep: 244.83 | bwd_inner_microstep: 244.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:34,648] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 246.15 | bwd_inner_microstep: 246.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:35,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.36 | bwd_microstep: 244.93 | bwd_inner_microstep: 244.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:51:35,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.83 | bwd_microstep: 242.29 | bwd_inner_microstep: 242.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:35,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.10 | bwd_microstep: 251.79 | bwd_inner_microstep: 251.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:36,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 243.46 | bwd_inner_microstep: 243.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:51:36,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.71 | bwd_microstep: 243.16 | bwd_inner_microstep: 243.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:37,242] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.94 | bwd_microstep: 246.38 | bwd_inner_microstep: 246.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:51:37,694] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.71 | optimizer_gradients: 0.73 | optimizer_step: 3.39 +[2024-12-31 17:51:37,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.92 | bwd_microstep: 254.64 | bwd_inner_microstep: 240.99 | bwd_allreduce_microstep: 13.56 | step_microstep: 14.17 +[2024-12-31 17:51:37,699] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2866.71 | bwd: 4276.00 | bwd_inner: 4261.38 | bwd_allreduce: 13.91 | step: 17.99 + 38%|███▊ | 290/759 [41:32<58:43, 7.51s/it] {'loss': 1.2145, 'learning_rate': 1.4178843090073802e-05, 'epoch': 0.38} + 38%|███▊ | 290/759 [41:32<58:43, 7.51s/it][2024-12-31 17:51:38,281] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 219.96 | bwd_microstep: 343.71 | bwd_inner_microstep: 343.32 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.24 +[2024-12-31 17:51:38,923] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 246.02 | bwd_microstep: 379.05 | bwd_inner_microstep: 379.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:39,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.14 | bwd_microstep: 281.24 | bwd_inner_microstep: 281.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:51:39,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.63 | bwd_microstep: 263.63 | bwd_inner_microstep: 263.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:51:40,324] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.60 | bwd_microstep: 258.02 | bwd_inner_microstep: 258.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:40,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.97 | bwd_microstep: 248.37 | bwd_inner_microstep: 248.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:41,244] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.66 | bwd_microstep: 248.62 | bwd_inner_microstep: 248.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:41,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.68 | bwd_microstep: 247.71 | bwd_inner_microstep: 247.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:42,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.56 | bwd_microstep: 246.03 | bwd_inner_microstep: 246.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:42,549] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.41 | bwd_microstep: 244.77 | bwd_inner_microstep: 244.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:51:42,984] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.43 | bwd_microstep: 245.96 | bwd_inner_microstep: 245.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:43,416] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.21 | bwd_microstep: 243.69 | bwd_inner_microstep: 243.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 17:51:43,849] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.32 | bwd_microstep: 244.36 | bwd_inner_microstep: 244.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:51:44,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.79 | bwd_microstep: 244.93 | bwd_inner_microstep: 244.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:44,712] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.54 | bwd_microstep: 240.89 | bwd_inner_microstep: 240.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:45,396] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.78 | optimizer_gradients: 0.71 | optimizer_step: 3.25 +[2024-12-31 17:51:45,396] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 483.25 | bwd_inner_microstep: 241.12 | bwd_allreduce_microstep: 241.59 | step_microstep: 13.85 +[2024-12-31 17:51:45,397] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2933.41 | bwd: 4464.61 | bwd_inner: 4221.27 | bwd_allreduce: 242.22 | step: 16.91 + 38%|███▊ | 291/759 [41:40<59:02, 7.57s/it] {'loss': 1.2324, 'learning_rate': 1.4140026094012136e-05, 'epoch': 0.38} + 38%|███▊ | 291/759 [41:40<59:02, 7.57s/it][2024-12-31 17:51:46,016] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.56 | bwd_microstep: 385.71 | bwd_inner_microstep: 385.34 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 17:51:46,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 216.44 | bwd_microstep: 347.20 | bwd_inner_microstep: 347.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:51:47,093] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.25 | bwd_microstep: 288.01 | bwd_inner_microstep: 287.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:47,561] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.12 | bwd_microstep: 267.55 | bwd_inner_microstep: 267.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:51:48,019] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.43 | bwd_microstep: 261.95 | bwd_inner_microstep: 261.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:48,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.57 | bwd_microstep: 248.73 | bwd_inner_microstep: 248.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:48,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.54 | bwd_microstep: 246.95 | bwd_inner_microstep: 246.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:49,332] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.83 | bwd_microstep: 248.64 | bwd_inner_microstep: 248.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:49,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.14 | bwd_microstep: 297.84 | bwd_inner_microstep: 297.71 | bwd_allreduce_microstep: 0.03 | step_microstep: 0.22 +[2024-12-31 17:51:50,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 252.49 | bwd_inner_microstep: 252.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:51:50,716] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.87 | bwd_microstep: 261.72 | bwd_inner_microstep: 261.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:51:51,153] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.83 | bwd_microstep: 245.34 | bwd_inner_microstep: 245.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 17:51:51,584] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.78 | bwd_microstep: 242.83 | bwd_inner_microstep: 242.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:52,011] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.51 | bwd_microstep: 241.08 | bwd_inner_microstep: 241.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:52,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.84 | bwd_microstep: 241.66 | bwd_inner_microstep: 241.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:51:52,892] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.98 | optimizer_gradients: 0.73 | optimizer_step: 3.32 +[2024-12-31 17:51:52,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.41 | bwd_microstep: 257.70 | bwd_inner_microstep: 243.78 | bwd_allreduce_microstep: 13.80 | step_microstep: 11.29 +[2024-12-31 17:51:52,895] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2858.68 | bwd: 4335.54 | bwd_inner: 4320.71 | bwd_allreduce: 14.16 | step: 14.27 + 38%|███▊ | 292/759 [41:47<58:44, 7.55s/it] {'loss': 1.2144, 'learning_rate': 1.4101133667528761e-05, 'epoch': 0.38} + 38%|███▊ | 292/759 [41:47<58:44, 7.55s/it][2024-12-31 17:51:53,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 216.01 | bwd_microstep: 347.20 | bwd_inner_microstep: 346.83 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 17:51:54,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.47 | bwd_microstep: 312.69 | bwd_inner_microstep: 312.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:51:54,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.75 | bwd_microstep: 283.57 | bwd_inner_microstep: 283.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:51:54,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.71 | bwd_microstep: 255.71 | bwd_inner_microstep: 255.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:55,395] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.55 | bwd_microstep: 256.63 | bwd_inner_microstep: 256.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:55,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.00 | bwd_microstep: 249.37 | bwd_inner_microstep: 249.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:51:56,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.70 | bwd_microstep: 247.98 | bwd_inner_microstep: 247.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:56,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.81 | bwd_microstep: 251.82 | bwd_inner_microstep: 251.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:51:57,146] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.59 | bwd_microstep: 244.33 | bwd_inner_microstep: 244.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:51:57,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.71 | bwd_microstep: 245.10 | bwd_inner_microstep: 245.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:58,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.20 | bwd_microstep: 244.47 | bwd_inner_microstep: 244.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:51:58,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.28 | bwd_microstep: 244.09 | bwd_inner_microstep: 243.86 | bwd_allreduce_microstep: 0.06 | step_microstep: 0.28 +[2024-12-31 17:51:58,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.46 | bwd_microstep: 249.43 | bwd_inner_microstep: 249.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:59,320] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.00 | bwd_microstep: 243.28 | bwd_inner_microstep: 243.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:51:59,746] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.96 | bwd_microstep: 241.14 | bwd_inner_microstep: 241.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:00,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.79 | optimizer_gradients: 0.72 | optimizer_step: 3.27 +[2024-12-31 17:52:00,172] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 157.32 | bwd_microstep: 239.46 | bwd_inner_microstep: 225.73 | bwd_allreduce_microstep: 13.60 | step_microstep: 12.65 +[2024-12-31 17:52:00,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2822.91 | bwd: 4156.47 | bwd_inner: 4141.64 | bwd_allreduce: 14.01 | step: 15.65 + 39%|███▊ | 293/759 [41:55<57:59, 7.47s/it] {'loss': 1.2395, 'learning_rate': 1.4062166519235665e-05, 'epoch': 0.39} + 39%|███▊ | 293/759 [41:55<57:59, 7.47s/it][2024-12-31 17:52:00,898] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 263.86 | bwd_microstep: 441.91 | bwd_inner_microstep: 441.50 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 17:52:01,391] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.76 | bwd_microstep: 288.75 | bwd_inner_microstep: 288.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:01,878] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.19 | bwd_microstep: 281.45 | bwd_inner_microstep: 281.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:52:02,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.50 | bwd_microstep: 261.78 | bwd_inner_microstep: 261.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:52:02,788] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.55 | bwd_microstep: 261.65 | bwd_inner_microstep: 261.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:03,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.77 | bwd_microstep: 246.17 | bwd_inner_microstep: 246.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:52:03,658] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.99 | bwd_microstep: 248.63 | bwd_inner_microstep: 248.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:52:04,094] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.93 | bwd_microstep: 247.55 | bwd_inner_microstep: 247.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:04,529] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.82 | bwd_microstep: 248.13 | bwd_inner_microstep: 248.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:52:04,960] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 243.58 | bwd_inner_microstep: 243.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:52:05,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.04 | bwd_microstep: 243.62 | bwd_inner_microstep: 243.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:52:05,820] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.82 | bwd_microstep: 244.63 | bwd_inner_microstep: 244.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:52:06,241] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 245.00 | bwd_inner_microstep: 244.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:52:06,662] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.78 | bwd_microstep: 243.74 | bwd_inner_microstep: 243.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:52:07,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.02 | bwd_microstep: 243.95 | bwd_inner_microstep: 243.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:52:07,686] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.76 | optimizer_gradients: 0.64 | optimizer_step: 37.13 +[2024-12-31 17:52:07,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.96 | bwd_microstep: 371.72 | bwd_inner_microstep: 241.29 | bwd_allreduce_microstep: 130.39 | step_microstep: 57.74 +[2024-12-31 17:52:07,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2872.55 | bwd: 4362.37 | bwd_inner: 4231.09 | bwd_allreduce: 130.65 | step: 60.03 + 39%|███▊ | 294/759 [42:02<57:59, 7.48s/it] {'loss': 1.2463, 'learning_rate': 1.4023125359106253e-05, 'epoch': 0.39} + 39%|███▊ | 294/759 [42:02<57:59, 7.48s/it][2024-12-31 17:52:08,207] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 198.64 | bwd_microstep: 296.29 | bwd_inner_microstep: 296.17 | bwd_allreduce_microstep: 0.05 | step_microstep: 0.07 +[2024-12-31 17:52:08,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 251.94 | bwd_microstep: 420.34 | bwd_inner_microstep: 420.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:52:09,326] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.82 | bwd_microstep: 256.41 | bwd_inner_microstep: 256.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:52:09,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.99 | bwd_microstep: 266.37 | bwd_inner_microstep: 266.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:52:10,207] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.88 | bwd_microstep: 253.82 | bwd_inner_microstep: 253.70 | bwd_allreduce_microstep: 0.05 | step_microstep: 0.08 +[2024-12-31 17:52:10,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 244.96 | bwd_inner_microstep: 244.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:52:11,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.02 | bwd_microstep: 248.20 | bwd_inner_microstep: 248.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:52:11,512] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.71 | bwd_microstep: 246.92 | bwd_inner_microstep: 246.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:52:11,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.51 | bwd_microstep: 244.68 | bwd_inner_microstep: 244.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:12,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.90 | bwd_microstep: 245.27 | bwd_inner_microstep: 245.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:12,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.38 | bwd_microstep: 253.85 | bwd_inner_microstep: 253.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:52:13,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.15 | bwd_microstep: 244.48 | bwd_inner_microstep: 244.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:52:13,671] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.64 | bwd_microstep: 243.15 | bwd_inner_microstep: 243.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:52:14,097] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.17 | bwd_microstep: 245.51 | bwd_inner_microstep: 245.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:52:14,519] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.00 | bwd_microstep: 243.50 | bwd_inner_microstep: 243.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:52:15,018] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.86 | optimizer_gradients: 10.12 | optimizer_step: 37.52 +[2024-12-31 17:52:15,019] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 256.96 | bwd_inner_microstep: 243.25 | bwd_allreduce_microstep: 13.59 | step_microstep: 55.32 +[2024-12-31 17:52:15,023] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2854.32 | bwd: 4210.82 | bwd_inner: 4196.43 | bwd_allreduce: 13.83 | step: 57.08 + 39%|███▉ | 295/759 [42:09<57:32, 7.44s/it] {'loss': 1.2726, 'learning_rate': 1.3984010898462417e-05, 'epoch': 0.39} + 39%|███▉ | 295/759 [42:10<57:32, 7.44s/it][2024-12-31 17:52:15,807] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 269.14 | bwd_microstep: 451.18 | bwd_inner_microstep: 450.82 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 17:52:16,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.31 | bwd_microstep: 291.45 | bwd_inner_microstep: 291.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:52:16,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 185.67 | bwd_microstep: 257.48 | bwd_inner_microstep: 257.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:17,209] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.14 | bwd_microstep: 255.50 | bwd_inner_microstep: 255.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:17,656] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.87 | bwd_microstep: 254.88 | bwd_inner_microstep: 254.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:18,091] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.67 | bwd_microstep: 245.64 | bwd_inner_microstep: 245.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:18,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.01 | bwd_microstep: 246.11 | bwd_inner_microstep: 246.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:52:18,959] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.18 | bwd_microstep: 246.69 | bwd_inner_microstep: 246.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:19,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 244.02 | bwd_inner_microstep: 243.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:19,823] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.55 | bwd_microstep: 243.78 | bwd_inner_microstep: 243.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:52:20,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 243.76 | bwd_inner_microstep: 243.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:52:20,674] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.62 | bwd_microstep: 242.88 | bwd_inner_microstep: 242.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:21,101] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.85 | bwd_microstep: 240.31 | bwd_inner_microstep: 240.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:52:21,539] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.72 | bwd_microstep: 249.94 | bwd_inner_microstep: 249.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:52:21,974] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.48 | bwd_microstep: 239.99 | bwd_inner_microstep: 239.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:52:22,425] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.02 | optimizer_gradients: 0.75 | optimizer_step: 3.29 +[2024-12-31 17:52:22,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.41 | bwd_microstep: 256.76 | bwd_inner_microstep: 243.10 | bwd_allreduce_microstep: 13.53 | step_microstep: 11.40 +[2024-12-31 17:52:22,428] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2874.00 | bwd: 4210.50 | bwd_inner: 4196.04 | bwd_allreduce: 13.82 | step: 14.11 + 39%|███▉ | 296/759 [42:17<57:18, 7.43s/it] {'loss': 1.2394, 'learning_rate': 1.3944823849961557e-05, 'epoch': 0.39} + 39%|███▉ | 296/759 [42:17<57:18, 7.43s/it][2024-12-31 17:52:22,958] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.38 | bwd_microstep: 312.52 | bwd_inner_microstep: 312.14 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.24 +[2024-12-31 17:52:23,443] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.71 | bwd_microstep: 280.80 | bwd_inner_microstep: 280.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:23,897] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.59 | bwd_microstep: 258.80 | bwd_inner_microstep: 258.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:24,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.50 | bwd_microstep: 260.78 | bwd_inner_microstep: 260.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:52:24,792] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.62 | bwd_microstep: 248.25 | bwd_inner_microstep: 248.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:52:25,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.31 | bwd_microstep: 249.24 | bwd_inner_microstep: 249.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:52:25,666] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.72 | bwd_microstep: 245.86 | bwd_inner_microstep: 245.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:52:26,092] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.69 | bwd_microstep: 247.90 | bwd_inner_microstep: 247.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:52:26,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.65 | bwd_microstep: 254.47 | bwd_inner_microstep: 254.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:52:26,971] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.06 | bwd_microstep: 244.62 | bwd_inner_microstep: 244.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:52:27,403] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 243.92 | bwd_inner_microstep: 243.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:52:27,867] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.40 | bwd_microstep: 276.42 | bwd_inner_microstep: 276.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:52:28,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.79 | bwd_microstep: 241.88 | bwd_inner_microstep: 241.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:52:28,727] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.76 | bwd_microstep: 242.92 | bwd_inner_microstep: 242.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:52:29,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.27 | bwd_microstep: 252.86 | bwd_inner_microstep: 252.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:30,120] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.68 | optimizer_gradients: 0.84 | optimizer_step: 10.22 +[2024-12-31 17:52:30,120] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.08 | bwd_microstep: 750.91 | bwd_inner_microstep: 243.47 | bwd_allreduce_microstep: 507.40 | step_microstep: 18.02 +[2024-12-31 17:52:30,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2792.13 | bwd: 4612.39 | bwd_inner: 4103.98 | bwd_allreduce: 507.64 | step: 21.02 + 39%|███▉ | 297/759 [42:25<57:47, 7.51s/it] {'loss': 1.2473, 'learning_rate': 1.3905564927583625e-05, 'epoch': 0.39} + 39%|███▉ | 297/759 [42:25<57:47, 7.51s/it][2024-12-31 17:52:30,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 235.18 | bwd_microstep: 384.06 | bwd_inner_microstep: 383.71 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.22 +[2024-12-31 17:52:31,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.10 | bwd_microstep: 292.17 | bwd_inner_microstep: 292.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:52:31,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.44 | bwd_microstep: 281.16 | bwd_inner_microstep: 281.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:52:32,260] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.09 | bwd_microstep: 262.68 | bwd_inner_microstep: 262.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:32,699] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.06 | bwd_microstep: 250.32 | bwd_inner_microstep: 250.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:52:33,136] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.60 | bwd_microstep: 247.35 | bwd_inner_microstep: 247.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:33,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.04 | bwd_microstep: 260.92 | bwd_inner_microstep: 260.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:52:34,026] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.00 | bwd_microstep: 244.28 | bwd_inner_microstep: 244.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:52:34,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.01 | bwd_microstep: 243.75 | bwd_inner_microstep: 243.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:34,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.69 | bwd_microstep: 243.43 | bwd_inner_microstep: 243.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:35,318] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.23 | bwd_microstep: 245.00 | bwd_inner_microstep: 244.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:35,749] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.72 | bwd_microstep: 243.71 | bwd_inner_microstep: 243.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:36,209] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.88 | bwd_microstep: 272.23 | bwd_inner_microstep: 272.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:52:36,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.62 | bwd_microstep: 241.19 | bwd_inner_microstep: 241.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:37,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.01 | bwd_microstep: 241.89 | bwd_inner_microstep: 241.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:52:37,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.97 | optimizer_gradients: 0.75 | optimizer_step: 3.31 +[2024-12-31 17:52:37,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.82 | bwd_microstep: 255.39 | bwd_inner_microstep: 241.59 | bwd_allreduce_microstep: 13.70 | step_microstep: 11.18 +[2024-12-31 17:52:37,519] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2850.88 | bwd: 4209.69 | bwd_inner: 4194.94 | bwd_allreduce: 13.99 | step: 14.27 + 39%|███▉ | 298/759 [42:32<57:25, 7.47s/it] {'loss': 1.2524, 'learning_rate': 1.3866234846618083e-05, 'epoch': 0.39} + 39%|███▉ | 298/759 [42:32<57:25, 7.47s/it][2024-12-31 17:52:38,200] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 251.82 | bwd_microstep: 414.76 | bwd_inner_microstep: 414.42 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 17:52:38,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 237.69 | bwd_microstep: 391.02 | bwd_inner_microstep: 391.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:52:39,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 265.49 | bwd_microstep: 446.66 | bwd_inner_microstep: 446.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:52:40,036] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.13 | bwd_microstep: 264.36 | bwd_inner_microstep: 264.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:40,494] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.62 | bwd_microstep: 262.50 | bwd_inner_microstep: 262.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:52:40,935] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.85 | bwd_microstep: 250.46 | bwd_inner_microstep: 250.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:41,373] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.79 | bwd_microstep: 249.28 | bwd_inner_microstep: 249.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:52:41,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.01 | bwd_microstep: 272.84 | bwd_inner_microstep: 272.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:42,269] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.19 | bwd_microstep: 245.73 | bwd_inner_microstep: 245.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:52:42,703] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 245.60 | bwd_inner_microstep: 245.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:52:43,127] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 244.14 | bwd_inner_microstep: 244.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:43,560] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.32 | bwd_microstep: 244.59 | bwd_inner_microstep: 244.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:43,991] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.81 | bwd_microstep: 243.98 | bwd_inner_microstep: 243.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:52:44,422] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.39 | bwd_microstep: 243.98 | bwd_inner_microstep: 243.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:52:44,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.56 | bwd_microstep: 244.51 | bwd_inner_microstep: 244.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:45,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.45 | optimizer_gradients: 0.74 | optimizer_step: 3.34 +[2024-12-31 17:52:45,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.19 | bwd_microstep: 257.87 | bwd_inner_microstep: 244.17 | bwd_allreduce_microstep: 13.58 | step_microstep: 13.01 +[2024-12-31 17:52:45,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2987.17 | bwd: 4522.37 | bwd_inner: 4507.91 | bwd_allreduce: 13.86 | step: 15.86 + 39%|███▉ | 299/759 [42:40<58:01, 7.57s/it] {'loss': 1.2286, 'learning_rate': 1.3826834323650899e-05, 'epoch': 0.39} + 39%|███▉ | 299/759 [42:40<58:01, 7.57s/it][2024-12-31 17:52:45,828] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.54 | bwd_microstep: 297.28 | bwd_inner_microstep: 296.92 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 17:52:46,325] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.74 | bwd_microstep: 287.15 | bwd_inner_microstep: 287.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:52:46,789] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.33 | bwd_microstep: 266.37 | bwd_inner_microstep: 266.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:47,241] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.11 | bwd_microstep: 258.00 | bwd_inner_microstep: 257.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 17:52:47,688] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.81 | bwd_microstep: 254.43 | bwd_inner_microstep: 254.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:52:48,129] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.93 | bwd_microstep: 250.53 | bwd_inner_microstep: 250.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:52:48,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.94 | bwd_microstep: 248.46 | bwd_inner_microstep: 248.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:49,010] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.35 | bwd_microstep: 246.96 | bwd_inner_microstep: 246.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:52:49,465] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.19 | bwd_microstep: 244.22 | bwd_inner_microstep: 244.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:52:49,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.21 | bwd_microstep: 246.03 | bwd_inner_microstep: 246.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:52:50,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.98 | bwd_microstep: 243.85 | bwd_inner_microstep: 243.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:52:50,768] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.29 | bwd_microstep: 243.01 | bwd_inner_microstep: 242.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:52:51,200] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.87 | bwd_microstep: 243.81 | bwd_inner_microstep: 243.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:52:51,629] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.67 | bwd_microstep: 241.51 | bwd_inner_microstep: 241.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:52:52,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.79 | bwd_microstep: 240.61 | bwd_inner_microstep: 240.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:52:52,712] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.71 | optimizer_gradients: 0.63 | optimizer_step: 3.09 +[2024-12-31 17:52:52,712] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.30 | bwd_microstep: 478.48 | bwd_inner_microstep: 257.03 | bwd_allreduce_microstep: 221.41 | step_microstep: 14.10 +[2024-12-31 17:52:52,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2829.43 | bwd: 4290.74 | bwd_inner: 4068.56 | bwd_allreduce: 221.66 | step: 16.78 + 40%|███▉ | 300/759 [42:47<57:31, 7.52s/it] {'loss': 1.2528, 'learning_rate': 1.3787364076551478e-05, 'epoch': 0.4} + 40%|███▉ | 300/759 [42:47<57:31, 7.52s/it][2024-12-31 17:52:53,253] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 204.26 | bwd_microstep: 319.89 | bwd_inner_microstep: 319.78 | bwd_allreduce_microstep: 0.05 | step_microstep: 0.07 +[2024-12-31 17:52:53,766] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 200.37 | bwd_microstep: 306.92 | bwd_inner_microstep: 306.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:52:54,222] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.60 | bwd_microstep: 267.55 | bwd_inner_microstep: 267.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:52:54,669] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.86 | bwd_microstep: 262.10 | bwd_inner_microstep: 262.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:52:55,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.85 | bwd_microstep: 268.76 | bwd_inner_microstep: 268.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:52:55,558] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.67 | bwd_microstep: 254.62 | bwd_inner_microstep: 254.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:52:55,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 247.39 | bwd_inner_microstep: 247.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:52:56,405] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.04 | bwd_microstep: 245.80 | bwd_inner_microstep: 245.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:52:56,825] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.43 | bwd_microstep: 243.84 | bwd_inner_microstep: 243.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:52:57,247] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.39 | bwd_microstep: 243.64 | bwd_inner_microstep: 243.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:52:57,667] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.06 | bwd_microstep: 244.25 | bwd_inner_microstep: 244.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:52:58,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.76 | bwd_microstep: 243.69 | bwd_inner_microstep: 243.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:52:58,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.13 | bwd_microstep: 243.66 | bwd_inner_microstep: 243.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:52:58,955] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.81 | bwd_microstep: 271.62 | bwd_inner_microstep: 271.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:52:59,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.65 | bwd_microstep: 250.11 | bwd_inner_microstep: 250.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:00,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.92 | optimizer_gradients: 0.81 | optimizer_step: 3.13 +[2024-12-31 17:53:00,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.44 | bwd_microstep: 408.61 | bwd_inner_microstep: 242.16 | bwd_allreduce_microstep: 166.40 | step_microstep: 11.15 +[2024-12-31 17:53:00,007] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2811.75 | bwd: 4322.46 | bwd_inner: 4155.52 | bwd_allreduce: 166.55 | step: 11.80 + 40%|███▉ | 301/759 [42:54<56:52, 7.45s/it] {'loss': 1.2363, 'learning_rate': 1.3747824824459577e-05, 'epoch': 0.4} + 40%|███▉ | 301/759 [42:54<56:52, 7.45s/it][2024-12-31 17:53:00,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 235.47 | bwd_microstep: 382.64 | bwd_inner_microstep: 382.53 | bwd_allreduce_microstep: 0.04 | step_microstep: 0.07 +[2024-12-31 17:53:01,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.26 | bwd_microstep: 286.71 | bwd_inner_microstep: 286.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:01,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.07 | bwd_microstep: 280.38 | bwd_inner_microstep: 280.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:02,048] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.82 | bwd_microstep: 264.62 | bwd_inner_microstep: 264.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:02,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.90 | bwd_microstep: 263.44 | bwd_inner_microstep: 263.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:02,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.64 | bwd_microstep: 249.07 | bwd_inner_microstep: 249.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:03,355] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.72 | bwd_microstep: 248.04 | bwd_inner_microstep: 248.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:03,790] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.09 | bwd_microstep: 257.08 | bwd_inner_microstep: 257.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:04,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 239.44 | bwd_microstep: 246.12 | bwd_inner_microstep: 246.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:04,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 245.33 | bwd_inner_microstep: 245.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:53:05,127] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.17 | bwd_microstep: 246.39 | bwd_inner_microstep: 246.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:05,548] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.91 | bwd_microstep: 244.42 | bwd_inner_microstep: 244.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:05,972] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.92 | bwd_microstep: 248.24 | bwd_inner_microstep: 248.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:06,394] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.96 | bwd_microstep: 244.17 | bwd_inner_microstep: 244.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:53:06,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.81 | bwd_microstep: 241.85 | bwd_inner_microstep: 241.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:53:07,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.31 | optimizer_gradients: 0.62 | optimizer_step: 3.15 +[2024-12-31 17:53:07,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.73 | bwd_microstep: 602.15 | bwd_inner_microstep: 248.28 | bwd_allreduce_microstep: 353.82 | step_microstep: 11.48 +[2024-12-31 17:53:07,616] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2917.94 | bwd: 4550.65 | bwd_inner: 4196.28 | bwd_allreduce: 353.98 | step: 12.30 + 40%|███▉ | 302/759 [43:02<57:06, 7.50s/it] {'loss': 1.2293, 'learning_rate': 1.3708217287772227e-05, 'epoch': 0.4} + 40%|███▉ | 302/759 [43:02<57:06, 7.50s/it][2024-12-31 17:53:08,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 224.99 | bwd_microstep: 357.85 | bwd_inner_microstep: 357.49 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 17:53:08,792] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 216.12 | bwd_microstep: 345.56 | bwd_inner_microstep: 345.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:53:09,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.14 | bwd_microstep: 261.51 | bwd_inner_microstep: 261.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:53:09,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.81 | bwd_microstep: 250.85 | bwd_inner_microstep: 250.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:53:10,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.52 | bwd_microstep: 250.13 | bwd_inner_microstep: 250.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:53:10,569] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.25 | bwd_microstep: 247.56 | bwd_inner_microstep: 247.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:53:11,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.48 | bwd_microstep: 243.97 | bwd_inner_microstep: 243.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 17:53:11,438] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.87 | bwd_microstep: 247.42 | bwd_inner_microstep: 247.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:53:11,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.37 | bwd_microstep: 244.53 | bwd_inner_microstep: 244.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:53:12,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.46 | bwd_microstep: 244.97 | bwd_inner_microstep: 244.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:53:12,738] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.05 | bwd_microstep: 244.36 | bwd_inner_microstep: 244.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:53:13,177] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.76 | bwd_microstep: 250.84 | bwd_inner_microstep: 250.56 | bwd_allreduce_microstep: 0.12 | step_microstep: 0.28 +[2024-12-31 17:53:13,604] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.82 | bwd_microstep: 241.69 | bwd_inner_microstep: 241.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:53:14,035] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.34 | bwd_microstep: 243.44 | bwd_inner_microstep: 243.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:53:14,468] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.44 | bwd_microstep: 246.97 | bwd_inner_microstep: 246.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:53:14,947] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.58 | optimizer_gradients: 0.81 | optimizer_step: 5.32 +[2024-12-31 17:53:14,948] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.71 | bwd_microstep: 257.61 | bwd_inner_microstep: 243.93 | bwd_allreduce_microstep: 13.59 | step_microstep: 36.95 +[2024-12-31 17:53:14,952] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2843.56 | bwd: 4179.46 | bwd_inner: 4164.54 | bwd_allreduce: 14.10 | step: 40.00 + 40%|███▉ | 303/759 [43:09<56:37, 7.45s/it] {'loss': 1.2363, 'learning_rate': 1.3668542188130567e-05, 'epoch': 0.4} + 40%|███▉ | 303/759 [43:09<56:37, 7.45s/it][2024-12-31 17:53:15,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.05 | bwd_microstep: 337.83 | bwd_inner_microstep: 337.47 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 17:53:16,128] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 226.68 | bwd_microstep: 363.18 | bwd_inner_microstep: 363.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:53:16,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.14 | bwd_microstep: 268.42 | bwd_inner_microstep: 268.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:53:17,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.59 | bwd_microstep: 262.68 | bwd_inner_microstep: 262.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:53:17,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.15 | bwd_microstep: 256.44 | bwd_inner_microstep: 256.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:17,954] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.08 | bwd_microstep: 254.67 | bwd_inner_microstep: 254.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:18,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.04 | bwd_microstep: 247.84 | bwd_inner_microstep: 247.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:18,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 247.64 | bwd_inner_microstep: 247.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:53:19,239] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.61 | bwd_microstep: 257.56 | bwd_inner_microstep: 257.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:19,662] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.49 | bwd_microstep: 245.42 | bwd_inner_microstep: 245.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:20,083] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 244.42 | bwd_inner_microstep: 244.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:20,502] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.95 | bwd_microstep: 243.59 | bwd_inner_microstep: 243.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:20,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 243.14 | bwd_inner_microstep: 243.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:21,339] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.98 | bwd_microstep: 241.97 | bwd_inner_microstep: 241.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:21,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.41 | bwd_microstep: 242.71 | bwd_inner_microstep: 242.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:22,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.88 | optimizer_gradients: 0.68 | optimizer_step: 3.14 +[2024-12-31 17:53:22,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.66 | bwd_microstep: 257.94 | bwd_inner_microstep: 244.38 | bwd_allreduce_microstep: 13.52 | step_microstep: 10.75 +[2024-12-31 17:53:22,204] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2868.20 | bwd: 4215.51 | bwd_inner: 4201.23 | bwd_allreduce: 13.76 | step: 12.05 + 40%|████ | 304/759 [43:17<56:02, 7.39s/it] {'loss': 1.2268, 'learning_rate': 1.3628800248406738e-05, 'epoch': 0.4} + 40%|████ | 304/759 [43:17<56:02, 7.39s/it][2024-12-31 17:53:22,771] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.16 | bwd_microstep: 338.46 | bwd_inner_microstep: 338.35 | bwd_allreduce_microstep: 0.05 | step_microstep: 0.07 +[2024-12-31 17:53:23,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.72 | bwd_microstep: 288.61 | bwd_inner_microstep: 288.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:23,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 216.93 | bwd_microstep: 293.99 | bwd_inner_microstep: 293.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:24,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.90 | bwd_microstep: 269.28 | bwd_inner_microstep: 269.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:24,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.09 | bwd_microstep: 264.51 | bwd_inner_microstep: 264.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:25,122] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.26 | bwd_microstep: 255.96 | bwd_inner_microstep: 255.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:25,548] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.51 | bwd_microstep: 247.96 | bwd_inner_microstep: 247.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:25,971] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 245.38 | bwd_inner_microstep: 245.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:26,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.17 | bwd_microstep: 245.73 | bwd_inner_microstep: 245.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:26,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.56 | bwd_microstep: 245.01 | bwd_inner_microstep: 244.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:27,238] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.25 | bwd_microstep: 244.58 | bwd_inner_microstep: 244.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:27,658] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.83 | bwd_microstep: 243.73 | bwd_inner_microstep: 243.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:28,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.98 | bwd_microstep: 244.65 | bwd_inner_microstep: 244.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:28,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.66 | bwd_microstep: 260.79 | bwd_inner_microstep: 260.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:28,938] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.42 | bwd_microstep: 243.82 | bwd_inner_microstep: 243.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:29,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.17 | optimizer_gradients: 0.64 | optimizer_step: 3.15 +[2024-12-31 17:53:29,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.01 | bwd_microstep: 744.59 | bwd_inner_microstep: 242.70 | bwd_allreduce_microstep: 501.83 | step_microstep: 17.02 +[2024-12-31 17:53:29,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2858.37 | bwd: 4677.09 | bwd_inner: 4174.70 | bwd_allreduce: 501.99 | step: 17.68 + 40%|████ | 305/759 [43:24<56:33, 7.47s/it] {'loss': 1.2158, 'learning_rate': 1.3588992192690683e-05, 'epoch': 0.4} + 40%|████ | 305/759 [43:24<56:33, 7.47s/it][2024-12-31 17:53:30,438] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.68 | bwd_microstep: 336.18 | bwd_inner_microstep: 336.07 | bwd_allreduce_microstep: 0.04 | step_microstep: 0.07 +[2024-12-31 17:53:30,924] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.70 | bwd_microstep: 287.51 | bwd_inner_microstep: 287.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:31,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.06 | bwd_microstep: 281.88 | bwd_inner_microstep: 281.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:31,847] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.41 | bwd_microstep: 261.29 | bwd_inner_microstep: 261.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:32,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.98 | bwd_microstep: 266.88 | bwd_inner_microstep: 266.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:32,724] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.79 | bwd_microstep: 248.17 | bwd_inner_microstep: 248.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:33,151] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.77 | bwd_microstep: 248.46 | bwd_inner_microstep: 248.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:53:33,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.88 | bwd_microstep: 261.50 | bwd_inner_microstep: 261.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:53:34,013] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.94 | bwd_microstep: 244.24 | bwd_inner_microstep: 244.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:34,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.42 | bwd_microstep: 241.06 | bwd_inner_microstep: 241.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:34,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.09 | bwd_microstep: 242.83 | bwd_inner_microstep: 242.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:35,268] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.06 | bwd_microstep: 243.71 | bwd_inner_microstep: 243.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:35,688] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.13 | bwd_microstep: 243.04 | bwd_inner_microstep: 243.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:36,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.58 | bwd_microstep: 240.34 | bwd_inner_microstep: 240.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:36,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.27 | bwd_microstep: 240.64 | bwd_inner_microstep: 240.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:37,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.55 | optimizer_gradients: 0.64 | optimizer_step: 6.41 +[2024-12-31 17:53:37,361] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.20 | bwd_microstep: 649.60 | bwd_inner_microstep: 242.14 | bwd_allreduce_microstep: 407.42 | step_microstep: 13.99 +[2024-12-31 17:53:37,362] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2818.78 | bwd: 4537.36 | bwd_inner: 4129.40 | bwd_allreduce: 407.57 | step: 14.64 + 40%|████ | 306/759 [43:32<56:27, 7.48s/it] {'loss': 1.2529, 'learning_rate': 1.3549118746276968e-05, 'epoch': 0.4} + 40%|████ | 306/759 [43:32<56:27, 7.48s/it][2024-12-31 17:53:37,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.67 | bwd_microstep: 310.82 | bwd_inner_microstep: 310.72 | bwd_allreduce_microstep: 0.04 | step_microstep: 0.07 +[2024-12-31 17:53:38,400] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 198.47 | bwd_microstep: 301.59 | bwd_inner_microstep: 301.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:38,854] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.31 | bwd_microstep: 266.09 | bwd_inner_microstep: 266.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:39,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.48 | bwd_microstep: 256.57 | bwd_inner_microstep: 256.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:39,722] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.15 | bwd_microstep: 249.67 | bwd_inner_microstep: 249.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:40,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.40 | bwd_microstep: 246.72 | bwd_inner_microstep: 246.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:40,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.67 | bwd_microstep: 245.85 | bwd_inner_microstep: 245.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:53:40,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.39 | bwd_microstep: 245.11 | bwd_inner_microstep: 245.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:41,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.05 | bwd_microstep: 243.94 | bwd_inner_microstep: 243.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:41,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.33 | bwd_microstep: 244.17 | bwd_inner_microstep: 244.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:42,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.51 | bwd_microstep: 243.45 | bwd_inner_microstep: 243.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:42,677] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.43 | bwd_microstep: 246.41 | bwd_inner_microstep: 246.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:43,096] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.87 | bwd_microstep: 242.72 | bwd_inner_microstep: 242.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:43,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.10 | bwd_microstep: 244.19 | bwd_inner_microstep: 244.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:43,934] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.98 | bwd_microstep: 243.25 | bwd_inner_microstep: 241.64 | bwd_allreduce_microstep: 1.57 | step_microstep: 0.07 +[2024-12-31 17:53:44,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.60 | optimizer_gradients: 0.61 | optimizer_step: 3.09 +[2024-12-31 17:53:44,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.31 | bwd_microstep: 450.04 | bwd_inner_microstep: 243.99 | bwd_allreduce_microstep: 206.01 | step_microstep: 11.65 +[2024-12-31 17:53:44,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2798.91 | bwd: 4280.62 | bwd_inner: 4072.48 | bwd_allreduce: 207.73 | step: 12.32 + 40%|████ | 307/759 [43:39<55:43, 7.40s/it] {'loss': 1.2637, 'learning_rate': 1.350918063565157e-05, 'epoch': 0.4} + 40%|████ | 307/759 [43:39<55:43, 7.40s/it][2024-12-31 17:53:45,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 239.57 | bwd_microstep: 394.12 | bwd_inner_microstep: 394.01 | bwd_allreduce_microstep: 0.04 | step_microstep: 0.07 +[2024-12-31 17:53:45,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.89 | bwd_microstep: 289.29 | bwd_inner_microstep: 289.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:46,184] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.78 | bwd_microstep: 281.92 | bwd_inner_microstep: 281.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:46,625] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.89 | bwd_microstep: 256.83 | bwd_inner_microstep: 256.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:47,094] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.05 | bwd_microstep: 285.68 | bwd_inner_microstep: 285.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:47,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.04 | bwd_microstep: 246.86 | bwd_inner_microstep: 246.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:53:47,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.80 | bwd_microstep: 247.10 | bwd_inner_microstep: 247.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:48,371] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.52 | bwd_microstep: 248.29 | bwd_inner_microstep: 248.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:48,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.33 | bwd_microstep: 252.84 | bwd_inner_microstep: 252.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:49,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.61 | bwd_microstep: 253.06 | bwd_inner_microstep: 253.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:49,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.53 | bwd_microstep: 246.06 | bwd_inner_microstep: 246.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:50,075] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.57 | bwd_microstep: 244.13 | bwd_inner_microstep: 244.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:50,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.74 | bwd_microstep: 242.68 | bwd_inner_microstep: 242.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:50,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.79 | bwd_microstep: 243.59 | bwd_inner_microstep: 243.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:51,336] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.95 | bwd_microstep: 245.93 | bwd_inner_microstep: 245.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:51,895] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.36 | optimizer_gradients: 0.71 | optimizer_step: 3.10 +[2024-12-31 17:53:51,895] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.59 | bwd_microstep: 372.47 | bwd_inner_microstep: 244.91 | bwd_allreduce_microstep: 127.52 | step_microstep: 11.32 +[2024-12-31 17:53:51,897] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2846.46 | bwd: 4350.88 | bwd_inner: 4222.81 | bwd_allreduce: 127.68 | step: 11.96 + 41%|████ | 308/759 [43:46<55:26, 7.38s/it] {'loss': 1.25, 'learning_rate': 1.3469178588478621e-05, 'epoch': 0.41} + 41%|████ | 308/759 [43:46<55:26, 7.38s/it][2024-12-31 17:53:52,544] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 236.91 | bwd_microstep: 388.50 | bwd_inner_microstep: 388.39 | bwd_allreduce_microstep: 0.04 | step_microstep: 0.07 +[2024-12-31 17:53:53,044] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.95 | bwd_microstep: 295.96 | bwd_inner_microstep: 295.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:53:53,503] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.68 | bwd_microstep: 268.83 | bwd_inner_microstep: 268.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:53,952] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.63 | bwd_microstep: 263.49 | bwd_inner_microstep: 263.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:54,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.16 | bwd_microstep: 256.69 | bwd_inner_microstep: 256.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:54,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.49 | bwd_microstep: 256.67 | bwd_inner_microstep: 256.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:55,263] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.54 | bwd_microstep: 252.45 | bwd_inner_microstep: 252.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:55,688] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.94 | bwd_microstep: 247.69 | bwd_inner_microstep: 247.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:56,119] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.32 | bwd_microstep: 253.55 | bwd_inner_microstep: 253.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:56,543] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.96 | bwd_microstep: 245.92 | bwd_inner_microstep: 245.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:56,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.21 | bwd_microstep: 244.06 | bwd_inner_microstep: 244.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:57,390] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.53 | bwd_microstep: 249.48 | bwd_inner_microstep: 249.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:57,810] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.59 | bwd_microstep: 243.34 | bwd_inner_microstep: 243.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:58,229] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.17 | bwd_microstep: 243.09 | bwd_inner_microstep: 243.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:58,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.50 | bwd_microstep: 243.54 | bwd_inner_microstep: 243.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:53:59,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.79 | optimizer_gradients: 0.62 | optimizer_step: 3.14 +[2024-12-31 17:53:59,361] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.59 | bwd_microstep: 524.25 | bwd_inner_microstep: 244.56 | bwd_allreduce_microstep: 279.65 | step_microstep: 12.09 +[2024-12-31 17:53:59,362] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2852.96 | bwd: 4477.52 | bwd_inner: 4197.33 | bwd_allreduce: 279.80 | step: 12.74 + 41%|████ | 309/759 [43:54<55:31, 7.40s/it] {'loss': 1.2452, 'learning_rate': 1.3429113333587181e-05, 'epoch': 0.41} + 41%|████ | 309/759 [43:54<55:31, 7.40s/it][2024-12-31 17:53:59,953] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 219.64 | bwd_microstep: 349.98 | bwd_inner_microstep: 349.87 | bwd_allreduce_microstep: 0.04 | step_microstep: 0.07 +[2024-12-31 17:54:00,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.68 | bwd_microstep: 302.47 | bwd_inner_microstep: 302.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:00,949] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.78 | bwd_microstep: 292.06 | bwd_inner_microstep: 292.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:01,397] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.65 | bwd_microstep: 264.23 | bwd_inner_microstep: 264.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:01,826] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.01 | bwd_microstep: 249.99 | bwd_inner_microstep: 249.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:02,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.49 | bwd_microstep: 251.10 | bwd_inner_microstep: 251.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:02,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.09 | bwd_microstep: 249.44 | bwd_inner_microstep: 249.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:03,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.59 | bwd_microstep: 246.07 | bwd_inner_microstep: 246.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:03,541] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.83 | bwd_microstep: 245.14 | bwd_inner_microstep: 245.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:03,961] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.00 | bwd_microstep: 243.95 | bwd_inner_microstep: 243.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:04,382] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.70 | bwd_microstep: 243.58 | bwd_inner_microstep: 243.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:04,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.34 | bwd_microstep: 244.17 | bwd_inner_microstep: 244.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:05,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.64 | bwd_microstep: 244.91 | bwd_inner_microstep: 244.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:05,645] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.83 | bwd_microstep: 245.45 | bwd_inner_microstep: 245.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:06,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.53 | bwd_microstep: 243.72 | bwd_inner_microstep: 243.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:06,960] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.84 | optimizer_gradients: 0.80 | optimizer_step: 3.13 +[2024-12-31 17:54:06,961] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.11 | bwd_microstep: 709.19 | bwd_inner_microstep: 241.98 | bwd_allreduce_microstep: 467.16 | step_microstep: 12.06 +[2024-12-31 17:54:06,962] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2839.72 | bwd: 4625.47 | bwd_inner: 4157.76 | bwd_allreduce: 467.31 | step: 12.69 + 41%|████ | 310/759 [44:01<55:50, 7.46s/it] {'loss': 1.2186, 'learning_rate': 1.3388985600957922e-05, 'epoch': 0.41} + 41%|████ | 310/759 [44:01<55:50, 7.46s/it][2024-12-31 17:54:07,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 342.09 | bwd_microstep: 334.70 | bwd_inner_microstep: 334.58 | bwd_allreduce_microstep: 0.04 | step_microstep: 0.07 +[2024-12-31 17:54:08,144] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.35 | bwd_microstep: 292.34 | bwd_inner_microstep: 292.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:54:08,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.61 | bwd_microstep: 286.83 | bwd_inner_microstep: 286.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:09,073] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.45 | bwd_microstep: 260.48 | bwd_inner_microstep: 260.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:09,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.40 | bwd_microstep: 260.99 | bwd_inner_microstep: 260.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:09,947] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.19 | bwd_microstep: 250.26 | bwd_inner_microstep: 250.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:10,374] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.66 | bwd_microstep: 249.01 | bwd_inner_microstep: 248.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:10,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.90 | bwd_microstep: 247.44 | bwd_inner_microstep: 247.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:11,225] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.12 | bwd_microstep: 245.88 | bwd_inner_microstep: 245.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:11,648] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.01 | bwd_microstep: 244.88 | bwd_inner_microstep: 244.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:12,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.43 | bwd_microstep: 245.22 | bwd_inner_microstep: 245.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:12,491] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.99 | bwd_microstep: 244.34 | bwd_inner_microstep: 244.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:12,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.13 | bwd_microstep: 244.07 | bwd_inner_microstep: 244.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:13,336] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.64 | bwd_microstep: 244.94 | bwd_inner_microstep: 244.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:13,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.74 | bwd_microstep: 241.15 | bwd_inner_microstep: 241.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:14,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.13 | optimizer_gradients: 0.86 | optimizer_step: 3.15 +[2024-12-31 17:54:14,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.69 | bwd_microstep: 468.72 | bwd_inner_microstep: 243.91 | bwd_allreduce_microstep: 224.75 | step_microstep: 11.75 +[2024-12-31 17:54:14,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2958.22 | bwd: 4361.26 | bwd_inner: 4135.97 | bwd_allreduce: 224.91 | step: 12.38 + 41%|████ | 311/759 [44:09<55:40, 7.46s/it] {'loss': 1.2639, 'learning_rate': 1.3348796121709862e-05, 'epoch': 0.41} + 41%|████ | 311/759 [44:09<55:40, 7.46s/it][2024-12-31 17:54:14,996] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 221.95 | bwd_microstep: 351.61 | bwd_inner_microstep: 351.50 | bwd_allreduce_microstep: 0.04 | step_microstep: 0.07 +[2024-12-31 17:54:15,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 198.90 | bwd_microstep: 304.75 | bwd_inner_microstep: 304.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:15,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.20 | bwd_microstep: 281.70 | bwd_inner_microstep: 281.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:16,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.04 | bwd_microstep: 262.82 | bwd_inner_microstep: 262.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:16,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.46 | bwd_microstep: 257.47 | bwd_inner_microstep: 257.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:17,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.72 | bwd_microstep: 261.23 | bwd_inner_microstep: 261.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:54:17,740] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.09 | bwd_microstep: 247.85 | bwd_inner_microstep: 247.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:18,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.49 | bwd_microstep: 245.35 | bwd_inner_microstep: 245.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:18,601] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 185.76 | bwd_microstep: 245.83 | bwd_inner_microstep: 245.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:19,021] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.57 | bwd_microstep: 244.09 | bwd_inner_microstep: 244.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:19,443] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.54 | bwd_microstep: 245.12 | bwd_inner_microstep: 245.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:19,863] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.62 | bwd_microstep: 243.39 | bwd_inner_microstep: 243.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:20,289] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.86 | bwd_microstep: 243.42 | bwd_inner_microstep: 243.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:20,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.63 | bwd_microstep: 243.39 | bwd_inner_microstep: 243.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:21,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.21 | bwd_microstep: 247.40 | bwd_inner_microstep: 247.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:21,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 7.51 | optimizer_gradients: 0.84 | optimizer_step: 3.15 +[2024-12-31 17:54:21,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.23 | bwd_microstep: 351.23 | bwd_inner_microstep: 242.09 | bwd_allreduce_microstep: 109.10 | step_microstep: 13.97 +[2024-12-31 17:54:21,674] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2853.05 | bwd: 4276.69 | bwd_inner: 4167.05 | bwd_allreduce: 109.25 | step: 14.61 + 41%|████ | 312/759 [44:16<55:07, 7.40s/it] {'loss': 1.253, 'learning_rate': 1.3308545628087029e-05, 'epoch': 0.41} + 41%|████ | 312/759 [44:16<55:07, 7.40s/it][2024-12-31 17:54:22,332] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 244.15 | bwd_microstep: 399.49 | bwd_inner_microstep: 399.37 | bwd_allreduce_microstep: 0.05 | step_microstep: 0.07 +[2024-12-31 17:54:22,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.69 | bwd_microstep: 288.34 | bwd_inner_microstep: 288.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:23,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.38 | bwd_microstep: 281.12 | bwd_inner_microstep: 281.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:23,750] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.55 | bwd_microstep: 267.14 | bwd_inner_microstep: 267.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:24,188] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.33 | bwd_microstep: 255.16 | bwd_inner_microstep: 255.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:54:24,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.63 | bwd_microstep: 256.67 | bwd_inner_microstep: 256.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:25,058] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.69 | bwd_microstep: 250.80 | bwd_inner_microstep: 250.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:25,487] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.50 | bwd_microstep: 250.15 | bwd_inner_microstep: 250.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:54:25,918] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.70 | bwd_microstep: 252.28 | bwd_inner_microstep: 252.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:54:26,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.04 | bwd_microstep: 245.12 | bwd_inner_microstep: 245.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:54:26,786] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.34 | bwd_microstep: 244.18 | bwd_inner_microstep: 244.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:54:27,219] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.87 | bwd_microstep: 245.82 | bwd_inner_microstep: 245.57 | bwd_allreduce_microstep: 0.06 | step_microstep: 0.29 +[2024-12-31 17:54:27,675] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.46 | bwd_microstep: 265.18 | bwd_inner_microstep: 265.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 17:54:28,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.65 | bwd_microstep: 241.62 | bwd_inner_microstep: 241.27 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 17:54:28,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.83 | bwd_microstep: 242.29 | bwd_inner_microstep: 242.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:54:29,018] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.34 | optimizer_gradients: 0.62 | optimizer_step: 3.12 +[2024-12-31 17:54:29,018] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.00 | bwd_microstep: 291.42 | bwd_inner_microstep: 242.85 | bwd_allreduce_microstep: 48.53 | step_microstep: 11.38 +[2024-12-31 17:54:29,020] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2860.44 | bwd: 4276.90 | bwd_inner: 4227.23 | bwd_allreduce: 48.97 | step: 13.29 + 41%|████ | 313/759 [44:23<54:53, 7.38s/it] {'loss': 1.242, 'learning_rate': 1.3268234853445113e-05, 'epoch': 0.41} + 41%|████ | 313/759 [44:23<54:53, 7.38s/it][2024-12-31 17:54:29,606] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.80 | bwd_microstep: 352.14 | bwd_inner_microstep: 346.32 | bwd_allreduce_microstep: 5.59 | step_microstep: 0.20 +[2024-12-31 17:54:30,096] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.13 | bwd_microstep: 283.84 | bwd_inner_microstep: 283.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:54:30,547] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.02 | bwd_microstep: 256.53 | bwd_inner_microstep: 256.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:54:30,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.87 | bwd_microstep: 254.30 | bwd_inner_microstep: 254.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:54:31,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.38 | bwd_microstep: 248.16 | bwd_inner_microstep: 248.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 17:54:31,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.20 | bwd_microstep: 246.74 | bwd_inner_microstep: 246.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 17:54:32,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.05 | bwd_microstep: 245.12 | bwd_inner_microstep: 245.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:54:32,740] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.85 | bwd_microstep: 245.02 | bwd_inner_microstep: 244.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 17:54:33,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.89 | bwd_microstep: 246.06 | bwd_inner_microstep: 246.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:54:33,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.62 | bwd_microstep: 242.84 | bwd_inner_microstep: 242.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 17:54:34,044] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.06 | bwd_microstep: 243.74 | bwd_inner_microstep: 243.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:54:34,482] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.21 | bwd_microstep: 248.66 | bwd_inner_microstep: 248.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:54:34,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.72 | bwd_microstep: 248.86 | bwd_inner_microstep: 248.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:54:35,336] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.14 | bwd_microstep: 241.49 | bwd_inner_microstep: 241.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:54:35,763] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.21 | bwd_microstep: 242.40 | bwd_inner_microstep: 242.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:54:36,404] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.39 | optimizer_gradients: 0.61 | optimizer_step: 3.12 +[2024-12-31 17:54:36,405] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.79 | bwd_microstep: 446.48 | bwd_inner_microstep: 242.45 | bwd_allreduce_microstep: 203.99 | step_microstep: 11.09 +[2024-12-31 17:54:36,406] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2803.36 | bwd: 4292.51 | bwd_inner: 4082.18 | bwd_allreduce: 209.70 | step: 14.04 + 41%|████▏ | 314/759 [44:31<54:45, 7.38s/it] {'loss': 1.2593, 'learning_rate': 1.3227864532238113e-05, 'epoch': 0.41} + 41%|████▏ | 314/759 [44:31<54:45, 7.38s/it][2024-12-31 17:54:37,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 232.40 | bwd_microstep: 359.76 | bwd_inner_microstep: 359.41 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 17:54:37,513] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.04 | bwd_microstep: 292.09 | bwd_inner_microstep: 292.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:54:37,974] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.21 | bwd_microstep: 266.38 | bwd_inner_microstep: 266.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:54:38,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.89 | bwd_microstep: 249.36 | bwd_inner_microstep: 249.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:54:38,854] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.97 | bwd_microstep: 249.54 | bwd_inner_microstep: 249.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:54:39,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.34 | bwd_microstep: 248.45 | bwd_inner_microstep: 248.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:54:39,728] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.70 | bwd_microstep: 246.31 | bwd_inner_microstep: 246.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:54:40,160] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.40 | bwd_microstep: 247.16 | bwd_inner_microstep: 246.95 | bwd_allreduce_microstep: 0.11 | step_microstep: 0.19 +[2024-12-31 17:54:40,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.84 | bwd_microstep: 245.55 | bwd_inner_microstep: 245.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:54:41,027] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.69 | bwd_microstep: 245.31 | bwd_inner_microstep: 245.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:54:41,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.67 | bwd_microstep: 243.91 | bwd_inner_microstep: 243.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:54:41,905] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.99 | bwd_microstep: 244.35 | bwd_inner_microstep: 244.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:54:42,336] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 243.13 | bwd_inner_microstep: 243.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:54:42,770] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.25 | bwd_microstep: 244.25 | bwd_inner_microstep: 244.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:54:43,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.56 | bwd_microstep: 244.48 | bwd_inner_microstep: 244.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:54:43,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.95 | optimizer_gradients: 0.72 | optimizer_step: 3.32 +[2024-12-31 17:54:43,653] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.05 | bwd_microstep: 255.24 | bwd_inner_microstep: 241.56 | bwd_allreduce_microstep: 13.59 | step_microstep: 11.55 +[2024-12-31 17:54:43,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2833.63 | bwd: 4125.43 | bwd_inner: 4110.65 | bwd_allreduce: 14.00 | step: 14.38 + 42%|████▏ | 315/759 [44:38<54:20, 7.34s/it] {'loss': 1.2187, 'learning_rate': 1.318743540000496e-05, 'epoch': 0.41} + 42%|████▏ | 315/759 [44:38<54:20, 7.34s/it][2024-12-31 17:54:44,227] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.65 | bwd_microstep: 342.96 | bwd_inner_microstep: 342.59 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 17:54:44,725] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.52 | bwd_microstep: 288.42 | bwd_inner_microstep: 288.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:54:45,216] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 205.63 | bwd_microstep: 268.47 | bwd_inner_microstep: 268.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:54:45,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.11 | bwd_microstep: 281.76 | bwd_inner_microstep: 281.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 17:54:46,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.63 | bwd_microstep: 257.16 | bwd_inner_microstep: 257.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:54:46,603] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.24 | bwd_microstep: 257.86 | bwd_inner_microstep: 257.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:54:47,043] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.12 | bwd_microstep: 249.41 | bwd_inner_microstep: 249.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:54:47,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.16 | bwd_microstep: 245.94 | bwd_inner_microstep: 245.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:54:47,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.93 | bwd_microstep: 247.57 | bwd_inner_microstep: 247.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:54:48,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 244.41 | bwd_inner_microstep: 244.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:54:48,790] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.36 | bwd_microstep: 245.59 | bwd_inner_microstep: 245.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:54:49,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.00 | bwd_microstep: 243.06 | bwd_inner_microstep: 243.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:54:49,656] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.77 | bwd_microstep: 243.69 | bwd_inner_microstep: 243.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:54:50,086] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.70 | bwd_microstep: 242.83 | bwd_inner_microstep: 242.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:54:50,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.92 | bwd_microstep: 244.78 | bwd_inner_microstep: 244.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:54:50,971] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.74 | optimizer_step: 3.35 +[2024-12-31 17:54:50,974] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.92 | bwd_microstep: 258.01 | bwd_inner_microstep: 244.38 | bwd_allreduce_microstep: 13.53 | step_microstep: 12.98 +[2024-12-31 17:54:50,976] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2877.45 | bwd: 4162.00 | bwd_inner: 4147.59 | bwd_allreduce: 13.80 | step: 15.92 + 42%|████▏ | 316/759 [44:45<54:10, 7.34s/it] {'loss': 1.2392, 'learning_rate': 1.3146948193356105e-05, 'epoch': 0.42} + 42%|████▏ | 316/759 [44:45<54:10, 7.34s/it][2024-12-31 17:54:51,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 231.83 | bwd_microstep: 365.92 | bwd_inner_microstep: 365.57 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 17:54:52,076] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.93 | bwd_microstep: 280.83 | bwd_inner_microstep: 280.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:54:52,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.12 | bwd_microstep: 262.30 | bwd_inner_microstep: 262.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:54:52,974] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.36 | bwd_microstep: 249.58 | bwd_inner_microstep: 249.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:54:53,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.26 | bwd_microstep: 246.74 | bwd_inner_microstep: 246.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:54:53,841] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 243.67 | bwd_inner_microstep: 243.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:54:54,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 245.85 | bwd_inner_microstep: 245.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:54:54,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 184.23 | bwd_microstep: 244.80 | bwd_inner_microstep: 244.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:54:55,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.32 | bwd_microstep: 243.76 | bwd_inner_microstep: 243.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:54:55,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.61 | bwd_microstep: 244.67 | bwd_inner_microstep: 244.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:54:56,018] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.04 | bwd_microstep: 244.43 | bwd_inner_microstep: 244.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:54:56,485] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.45 | bwd_microstep: 279.78 | bwd_inner_microstep: 279.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:54:56,919] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.72 | bwd_microstep: 241.72 | bwd_inner_microstep: 241.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:54:57,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.30 | bwd_microstep: 242.35 | bwd_inner_microstep: 242.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:54:57,783] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.38 | bwd_microstep: 242.95 | bwd_inner_microstep: 242.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:54:58,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.40 | optimizer_gradients: 0.62 | optimizer_step: 3.10 +[2024-12-31 17:54:58,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.41 | bwd_microstep: 377.69 | bwd_inner_microstep: 249.14 | bwd_allreduce_microstep: 128.50 | step_microstep: 11.03 +[2024-12-31 17:54:58,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2835.59 | bwd: 4257.14 | bwd_inner: 4127.80 | bwd_allreduce: 128.76 | step: 14.01 + 42%|████▏ | 317/759 [44:53<54:08, 7.35s/it] {'loss': 1.2819, 'learning_rate': 1.3106403649960109e-05, 'epoch': 0.42} + 42%|████▏ | 317/759 [44:53<54:08, 7.35s/it][2024-12-31 17:54:58,930] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.36 | bwd_microstep: 342.73 | bwd_inner_microstep: 342.37 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.19 +[2024-12-31 17:54:59,468] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 204.35 | bwd_microstep: 315.84 | bwd_inner_microstep: 315.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:54:59,934] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.23 | bwd_microstep: 267.49 | bwd_inner_microstep: 267.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:55:00,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.31 | bwd_microstep: 298.47 | bwd_inner_microstep: 298.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:55:00,868] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.39 | bwd_microstep: 249.67 | bwd_inner_microstep: 249.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:01,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.06 | bwd_microstep: 248.39 | bwd_inner_microstep: 248.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:55:01,747] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.90 | bwd_microstep: 247.85 | bwd_inner_microstep: 247.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:55:02,182] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.23 | bwd_microstep: 246.91 | bwd_inner_microstep: 246.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:55:02,616] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.15 | bwd_microstep: 245.64 | bwd_inner_microstep: 245.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:03,048] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.29 | bwd_microstep: 245.20 | bwd_inner_microstep: 245.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:03,478] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 243.26 | bwd_inner_microstep: 243.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:03,913] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.44 | bwd_microstep: 246.74 | bwd_inner_microstep: 246.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:04,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.10 | bwd_microstep: 243.66 | bwd_inner_microstep: 243.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:55:04,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.35 | bwd_microstep: 241.85 | bwd_inner_microstep: 241.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:05,200] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.66 | bwd_microstep: 241.20 | bwd_inner_microstep: 241.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:05,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.58 | optimizer_gradients: 0.67 | optimizer_step: 3.23 +[2024-12-31 17:55:05,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 157.70 | bwd_microstep: 267.04 | bwd_inner_microstep: 226.27 | bwd_allreduce_microstep: 40.72 | step_microstep: 10.60 +[2024-12-31 17:55:05,653] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2816.18 | bwd: 4192.06 | bwd_inner: 4150.45 | bwd_allreduce: 40.97 | step: 13.62 + 42%|████▏ | 318/759 [45:00<53:54, 7.33s/it] {'loss': 1.2638, 'learning_rate': 1.3065802508530186e-05, 'epoch': 0.42} + 42%|████▏ | 318/759 [45:00<53:54, 7.33s/it][2024-12-31 17:55:06,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.75 | bwd_microstep: 342.12 | bwd_inner_microstep: 341.78 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 17:55:06,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 194.01 | bwd_microstep: 292.53 | bwd_inner_microstep: 292.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:55:07,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.79 | bwd_microstep: 281.60 | bwd_inner_microstep: 281.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:55:07,675] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.82 | bwd_microstep: 261.11 | bwd_inner_microstep: 261.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:55:08,117] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.66 | bwd_microstep: 250.66 | bwd_inner_microstep: 250.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:55:08,556] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.95 | bwd_microstep: 248.61 | bwd_inner_microstep: 248.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:55:09,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.24 | bwd_microstep: 258.99 | bwd_inner_microstep: 258.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:55:09,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.43 | bwd_microstep: 243.96 | bwd_inner_microstep: 243.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:55:09,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 246.06 | bwd_inner_microstep: 246.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:10,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.74 | bwd_microstep: 244.88 | bwd_inner_microstep: 244.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:10,739] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.41 | bwd_microstep: 245.31 | bwd_inner_microstep: 245.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:55:11,172] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 243.78 | bwd_inner_microstep: 243.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:11,600] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.22 | bwd_microstep: 241.97 | bwd_inner_microstep: 241.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:12,026] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.57 | bwd_microstep: 241.63 | bwd_inner_microstep: 241.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:55:12,459] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.89 | bwd_microstep: 244.78 | bwd_inner_microstep: 244.58 | bwd_allreduce_microstep: 0.09 | step_microstep: 0.22 +[2024-12-31 17:55:12,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.54 | optimizer_gradients: 0.60 | optimizer_step: 3.09 +[2024-12-31 17:55:12,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.46 | bwd_microstep: 307.20 | bwd_inner_microstep: 245.41 | bwd_allreduce_microstep: 61.75 | step_microstep: 11.10 +[2024-12-31 17:55:12,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2818.85 | bwd: 4195.27 | bwd_inner: 4132.56 | bwd_allreduce: 62.12 | step: 14.22 + 42%|████▏ | 319/759 [45:07<53:43, 7.33s/it] {'loss': 1.2472, 'learning_rate': 1.302514550881076e-05, 'epoch': 0.42} + 42%|████▏ | 319/759 [45:07<53:43, 7.33s/it][2024-12-31 17:55:13,657] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 302.22 | bwd_microstep: 375.80 | bwd_inner_microstep: 375.44 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.21 +[2024-12-31 17:55:14,153] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.10 | bwd_microstep: 284.20 | bwd_inner_microstep: 284.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:55:14,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.91 | bwd_microstep: 268.04 | bwd_inner_microstep: 268.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 17:55:15,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.33 | bwd_microstep: 263.79 | bwd_inner_microstep: 263.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 17:55:15,531] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.24 | bwd_microstep: 257.17 | bwd_inner_microstep: 257.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:55:15,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.72 | bwd_microstep: 248.93 | bwd_inner_microstep: 248.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:55:16,416] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.71 | bwd_microstep: 259.09 | bwd_inner_microstep: 259.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:16,854] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.42 | bwd_microstep: 248.70 | bwd_inner_microstep: 248.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:55:17,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.27 | bwd_microstep: 247.18 | bwd_inner_microstep: 247.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:17,722] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.15 | bwd_microstep: 245.99 | bwd_inner_microstep: 245.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:18,153] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.99 | bwd_microstep: 244.56 | bwd_inner_microstep: 244.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:55:18,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.28 | bwd_microstep: 250.93 | bwd_inner_microstep: 250.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:55:19,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.51 | bwd_microstep: 245.20 | bwd_inner_microstep: 245.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:55:19,455] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.60 | bwd_microstep: 243.03 | bwd_inner_microstep: 243.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:55:19,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.70 | bwd_microstep: 244.38 | bwd_inner_microstep: 244.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:55:20,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.23 | optimizer_gradients: 0.76 | optimizer_step: 3.16 +[2024-12-31 17:55:20,438] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 350.10 | bwd_inner_microstep: 243.99 | bwd_allreduce_microstep: 106.07 | step_microstep: 11.53 +[2024-12-31 17:55:20,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2919.36 | bwd: 4277.17 | bwd_inner: 4170.29 | bwd_allreduce: 106.32 | step: 14.27 + 42%|████▏ | 320/759 [45:15<53:55, 7.37s/it] {'loss': 1.2181, 'learning_rate': 1.2984433391563984e-05, 'epoch': 0.42} + 42%|████▏ | 320/759 [45:15<53:55, 7.37s/it][2024-12-31 17:55:21,009] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.75 | bwd_microstep: 338.98 | bwd_inner_microstep: 338.64 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 17:55:21,534] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 200.15 | bwd_microstep: 306.90 | bwd_inner_microstep: 306.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:55:22,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 196.62 | bwd_microstep: 293.72 | bwd_inner_microstep: 293.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:22,500] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.81 | bwd_microstep: 262.82 | bwd_inner_microstep: 262.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:55:22,949] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.17 | bwd_microstep: 255.54 | bwd_inner_microstep: 255.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:55:23,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.57 | bwd_microstep: 255.33 | bwd_inner_microstep: 255.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 17:55:23,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.76 | bwd_microstep: 255.67 | bwd_inner_microstep: 255.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:55:24,285] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.82 | bwd_microstep: 247.94 | bwd_inner_microstep: 247.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:24,719] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.86 | bwd_microstep: 246.19 | bwd_inner_microstep: 246.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:25,153] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 244.42 | bwd_inner_microstep: 244.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:55:25,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.77 | bwd_microstep: 243.48 | bwd_inner_microstep: 243.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:26,019] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.94 | bwd_microstep: 243.81 | bwd_inner_microstep: 243.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:55:26,445] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.03 | bwd_microstep: 244.61 | bwd_inner_microstep: 244.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:26,878] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.20 | bwd_microstep: 245.43 | bwd_inner_microstep: 245.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:55:27,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.75 | bwd_microstep: 241.32 | bwd_inner_microstep: 241.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:55:27,761] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 7.01 | optimizer_gradients: 0.73 | optimizer_step: 3.33 +[2024-12-31 17:55:27,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.38 | bwd_microstep: 258.33 | bwd_inner_microstep: 244.04 | bwd_allreduce_microstep: 14.17 | step_microstep: 13.61 +[2024-12-31 17:55:27,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2853.54 | bwd: 4184.63 | bwd_inner: 4169.53 | bwd_allreduce: 14.45 | step: 16.44 + 42%|████▏ | 321/759 [45:22<53:42, 7.36s/it] {'loss': 1.2298, 'learning_rate': 1.294366689855624e-05, 'epoch': 0.42} + 42%|████▏ | 321/759 [45:22<53:42, 7.36s/it][2024-12-31 17:55:28,382] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 226.65 | bwd_microstep: 363.29 | bwd_inner_microstep: 362.94 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 17:55:28,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.16 | bwd_microstep: 282.17 | bwd_inner_microstep: 282.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:29,326] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.70 | bwd_microstep: 259.16 | bwd_inner_microstep: 259.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:29,776] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.45 | bwd_microstep: 255.45 | bwd_inner_microstep: 255.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:55:30,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.51 | bwd_microstep: 252.77 | bwd_inner_microstep: 252.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:55:30,666] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.14 | bwd_microstep: 247.48 | bwd_inner_microstep: 247.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:31,101] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.10 | bwd_microstep: 247.45 | bwd_inner_microstep: 247.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 17:55:31,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.00 | bwd_microstep: 246.13 | bwd_inner_microstep: 246.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:31,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 251.20 | bwd_inner_microstep: 250.66 | bwd_allreduce_microstep: 0.27 | step_microstep: 0.34 +[2024-12-31 17:55:32,421] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.10 | bwd_microstep: 254.92 | bwd_inner_microstep: 254.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:32,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.71 | bwd_microstep: 244.72 | bwd_inner_microstep: 244.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:55:33,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.25 | bwd_microstep: 243.10 | bwd_inner_microstep: 243.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.11 +[2024-12-31 17:55:33,719] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.55 | bwd_microstep: 243.04 | bwd_inner_microstep: 243.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:34,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.82 | bwd_microstep: 243.30 | bwd_inner_microstep: 243.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:55:34,582] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.38 | bwd_microstep: 244.02 | bwd_inner_microstep: 243.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:55:35,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.76 | optimizer_gradients: 0.87 | optimizer_step: 3.26 +[2024-12-31 17:55:35,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.79 | bwd_microstep: 446.95 | bwd_inner_microstep: 245.65 | bwd_allreduce_microstep: 201.24 | step_microstep: 11.24 +[2024-12-31 17:55:35,216] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2824.80 | bwd: 4325.24 | bwd_inner: 4122.74 | bwd_allreduce: 201.77 | step: 14.24 + 42%|████▏ | 322/759 [45:30<53:47, 7.39s/it] {'loss': 1.2335, 'learning_rate': 1.2902846772544625e-05, 'epoch': 0.42} + 42%|████▏ | 322/759 [45:30<53:47, 7.39s/it][2024-12-31 17:55:35,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 229.41 | bwd_microstep: 344.51 | bwd_inner_microstep: 344.14 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.21 +[2024-12-31 17:55:36,318] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.79 | bwd_microstep: 299.05 | bwd_inner_microstep: 299.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:55:36,813] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.58 | bwd_microstep: 286.98 | bwd_inner_microstep: 286.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 17:55:37,278] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.66 | bwd_microstep: 266.39 | bwd_inner_microstep: 266.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:55:37,718] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.61 | bwd_microstep: 250.13 | bwd_inner_microstep: 250.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:38,156] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.39 | bwd_microstep: 249.39 | bwd_inner_microstep: 249.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:55:38,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.30 | bwd_microstep: 246.88 | bwd_inner_microstep: 246.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:39,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.53 | bwd_microstep: 252.70 | bwd_inner_microstep: 252.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:55:39,469] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.24 | bwd_microstep: 248.03 | bwd_inner_microstep: 248.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:55:39,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.24 | bwd_microstep: 244.28 | bwd_inner_microstep: 244.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:40,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 243.29 | bwd_inner_microstep: 243.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:40,770] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.75 | bwd_microstep: 242.55 | bwd_inner_microstep: 242.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:41,199] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.30 | bwd_microstep: 242.86 | bwd_inner_microstep: 242.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:41,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.03 | bwd_microstep: 263.31 | bwd_inner_microstep: 263.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:55:42,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.04 | bwd_microstep: 242.59 | bwd_inner_microstep: 242.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:55:42,587] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.76 | optimizer_gradients: 0.74 | optimizer_step: 3.15 +[2024-12-31 17:55:42,587] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.79 | bwd_microstep: 312.21 | bwd_inner_microstep: 240.99 | bwd_allreduce_microstep: 71.17 | step_microstep: 10.48 +[2024-12-31 17:55:42,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2855.96 | bwd: 4235.18 | bwd_inner: 4163.22 | bwd_allreduce: 71.42 | step: 13.36 + 43%|████▎ | 323/759 [45:37<53:38, 7.38s/it] {'loss': 1.2078, 'learning_rate': 1.2861973757263416e-05, 'epoch': 0.43} + 43%|████▎ | 323/759 [45:37<53:38, 7.38s/it][2024-12-31 17:55:43,240] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 240.56 | bwd_microstep: 396.31 | bwd_inner_microstep: 395.94 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 17:55:43,749] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 200.87 | bwd_microstep: 290.87 | bwd_inner_microstep: 290.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:55:44,222] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.69 | bwd_microstep: 273.59 | bwd_inner_microstep: 273.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:55:44,686] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.26 | bwd_microstep: 265.38 | bwd_inner_microstep: 265.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:55:45,135] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.82 | bwd_microstep: 254.67 | bwd_inner_microstep: 254.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:55:45,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.87 | bwd_microstep: 249.36 | bwd_inner_microstep: 249.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:55:46,016] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.89 | bwd_microstep: 247.67 | bwd_inner_microstep: 247.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:55:46,453] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.85 | bwd_microstep: 246.91 | bwd_inner_microstep: 246.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:55:46,887] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.94 | bwd_microstep: 245.89 | bwd_inner_microstep: 245.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:55:47,323] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.23 | bwd_microstep: 248.12 | bwd_inner_microstep: 248.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:47,754] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.85 | bwd_microstep: 244.00 | bwd_inner_microstep: 243.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:55:48,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.94 | bwd_microstep: 243.68 | bwd_inner_microstep: 243.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:55:48,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.96 | bwd_microstep: 243.78 | bwd_inner_microstep: 243.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:55:49,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 244.21 | bwd_inner_microstep: 244.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:55:49,454] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.64 | bwd_microstep: 244.92 | bwd_inner_microstep: 244.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:55:49,918] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.34 | optimizer_gradients: 0.57 | optimizer_step: 3.11 +[2024-12-31 17:55:49,918] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.74 | bwd_microstep: 277.75 | bwd_inner_microstep: 242.73 | bwd_allreduce_microstep: 34.96 | step_microstep: 11.42 +[2024-12-31 17:55:49,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2859.44 | bwd: 4217.14 | bwd_inner: 4181.40 | bwd_allreduce: 35.21 | step: 13.91 + 43%|████▎ | 324/759 [45:44<53:24, 7.37s/it] {'loss': 1.2237, 'learning_rate': 1.282104859741052e-05, 'epoch': 0.43} + 43%|████▎ | 324/759 [45:44<53:24, 7.37s/it][2024-12-31 17:55:50,484] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.47 | bwd_microstep: 336.82 | bwd_inner_microstep: 336.53 | bwd_allreduce_microstep: 0.11 | step_microstep: 0.19 +[2024-12-31 17:55:50,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.45 | bwd_microstep: 279.90 | bwd_inner_microstep: 279.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:51,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.32 | bwd_microstep: 277.77 | bwd_inner_microstep: 277.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:55:51,897] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.22 | bwd_microstep: 255.82 | bwd_inner_microstep: 255.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:55:52,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.11 | bwd_microstep: 248.84 | bwd_inner_microstep: 248.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:52,807] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.65 | bwd_microstep: 276.48 | bwd_inner_microstep: 276.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:53,255] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.92 | bwd_microstep: 254.47 | bwd_inner_microstep: 254.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:55:53,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.69 | bwd_microstep: 248.87 | bwd_inner_microstep: 248.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:54,125] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 245.44 | bwd_inner_microstep: 245.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:55:54,564] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.14 | bwd_microstep: 250.93 | bwd_inner_microstep: 250.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:55:54,996] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.21 | bwd_microstep: 244.50 | bwd_inner_microstep: 244.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:55,428] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.85 | bwd_microstep: 244.19 | bwd_inner_microstep: 244.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:55,861] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.03 | bwd_microstep: 245.91 | bwd_inner_microstep: 245.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:56,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.46 | bwd_microstep: 244.66 | bwd_inner_microstep: 244.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:55:56,719] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.21 | bwd_microstep: 241.99 | bwd_inner_microstep: 241.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:57,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.58 | optimizer_gradients: 1.63 | optimizer_step: 3.50 +[2024-12-31 17:55:57,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.74 | bwd_microstep: 268.33 | bwd_inner_microstep: 254.38 | bwd_allreduce_microstep: 13.88 | step_microstep: 13.19 +[2024-12-31 17:55:57,218] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2850.68 | bwd: 4164.98 | bwd_inner: 4150.36 | bwd_allreduce: 14.12 | step: 16.08 + 43%|████▎ | 325/759 [45:52<53:08, 7.35s/it] {'loss': 1.2552, 'learning_rate': 1.2780072038633913e-05, 'epoch': 0.43} + 43%|████▎ | 325/759 [45:52<53:08, 7.35s/it][2024-12-31 17:55:57,809] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 221.20 | bwd_microstep: 353.33 | bwd_inner_microstep: 352.99 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 17:55:58,310] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.07 | bwd_microstep: 290.29 | bwd_inner_microstep: 290.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:55:58,779] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.80 | bwd_microstep: 268.53 | bwd_inner_microstep: 268.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 17:55:59,243] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.53 | bwd_microstep: 261.84 | bwd_inner_microstep: 261.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:55:59,695] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.86 | bwd_microstep: 256.01 | bwd_inner_microstep: 255.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:56:00,144] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.56 | bwd_microstep: 248.60 | bwd_inner_microstep: 248.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:56:00,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 185.24 | bwd_microstep: 245.25 | bwd_inner_microstep: 245.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 17:56:01,027] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.40 | bwd_microstep: 244.54 | bwd_inner_microstep: 244.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:56:01,459] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 245.21 | bwd_inner_microstep: 245.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:56:01,894] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.20 | bwd_microstep: 246.90 | bwd_inner_microstep: 246.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:02,325] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.41 | bwd_microstep: 244.51 | bwd_inner_microstep: 244.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:56:02,760] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.40 | bwd_microstep: 246.85 | bwd_inner_microstep: 246.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:03,216] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.14 | bwd_microstep: 248.94 | bwd_inner_microstep: 248.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:56:03,648] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.31 | bwd_microstep: 244.68 | bwd_inner_microstep: 244.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:56:04,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 243.56 | bwd_inner_microstep: 243.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:04,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 7.02 | optimizer_gradients: 0.62 | optimizer_step: 75.67 +[2024-12-31 17:56:04,616] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 258.91 | bwd_inner_microstep: 245.15 | bwd_allreduce_microstep: 13.65 | step_microstep: 91.21 +[2024-12-31 17:56:04,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2872.02 | bwd: 4148.17 | bwd_inner: 4133.38 | bwd_allreduce: 13.93 | step: 94.40 + 43%|████▎ | 326/759 [45:59<53:17, 7.38s/it] {'loss': 1.2443, 'learning_rate': 1.2739044827518043e-05, 'epoch': 0.43} + 43%|████▎ | 326/759 [45:59<53:17, 7.38s/it][2024-12-31 17:56:05,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.10 | bwd_microstep: 343.66 | bwd_inner_microstep: 343.20 | bwd_allreduce_microstep: 0.17 | step_microstep: 0.32 +[2024-12-31 17:56:05,953] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 245.92 | bwd_microstep: 407.49 | bwd_inner_microstep: 407.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:56:06,422] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.29 | bwd_microstep: 267.54 | bwd_inner_microstep: 267.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:06,873] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.15 | bwd_microstep: 257.33 | bwd_inner_microstep: 257.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:56:07,330] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.80 | bwd_microstep: 261.85 | bwd_inner_microstep: 261.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:07,767] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.88 | bwd_microstep: 248.37 | bwd_inner_microstep: 248.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:08,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.97 | bwd_microstep: 246.19 | bwd_inner_microstep: 246.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:56:08,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.91 | bwd_microstep: 258.64 | bwd_inner_microstep: 258.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:56:09,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.74 | bwd_microstep: 245.57 | bwd_inner_microstep: 245.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:09,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.97 | bwd_microstep: 244.21 | bwd_inner_microstep: 244.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:56:09,948] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 244.39 | bwd_inner_microstep: 244.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:56:10,387] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.63 | bwd_microstep: 251.73 | bwd_inner_microstep: 251.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:56:10,814] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.82 | bwd_microstep: 245.17 | bwd_inner_microstep: 245.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:56:11,244] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.69 | bwd_microstep: 243.71 | bwd_inner_microstep: 243.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:56:11,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.98 | bwd_microstep: 241.67 | bwd_inner_microstep: 241.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:12,322] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.20 | optimizer_gradients: 0.62 | optimizer_step: 3.24 +[2024-12-31 17:56:12,323] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.08 | bwd_microstep: 453.08 | bwd_inner_microstep: 248.90 | bwd_allreduce_microstep: 204.14 | step_microstep: 11.38 +[2024-12-31 17:56:12,324] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2871.69 | bwd: 4460.75 | bwd_inner: 4255.59 | bwd_allreduce: 204.45 | step: 14.44 + 43%|████▎ | 327/759 [46:07<53:42, 7.46s/it] {'loss': 1.2096, 'learning_rate': 1.2697967711570243e-05, 'epoch': 0.43} + 43%|████▎ | 327/759 [46:07<53:42, 7.46s/it][2024-12-31 17:56:12,917] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 223.09 | bwd_microstep: 353.45 | bwd_inner_microstep: 353.11 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 17:56:13,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 218.54 | bwd_microstep: 289.44 | bwd_inner_microstep: 289.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:13,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.06 | bwd_microstep: 266.95 | bwd_inner_microstep: 266.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:56:14,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.96 | bwd_microstep: 264.49 | bwd_inner_microstep: 264.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:56:14,796] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.56 | bwd_microstep: 248.18 | bwd_inner_microstep: 248.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:56:15,240] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 255.28 | bwd_inner_microstep: 255.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:56:15,675] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.20 | bwd_microstep: 246.72 | bwd_inner_microstep: 246.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:56:16,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 244.77 | bwd_inner_microstep: 244.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:56:16,548] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.22 | bwd_microstep: 243.63 | bwd_inner_microstep: 243.39 | bwd_allreduce_microstep: 0.11 | step_microstep: 0.23 +[2024-12-31 17:56:16,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.40 | bwd_microstep: 245.26 | bwd_inner_microstep: 245.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 17:56:17,420] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 243.64 | bwd_inner_microstep: 243.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:17,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.81 | bwd_microstep: 242.93 | bwd_inner_microstep: 242.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:56:18,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 185.47 | bwd_microstep: 244.87 | bwd_inner_microstep: 244.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 17:56:18,734] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.03 | bwd_microstep: 241.51 | bwd_inner_microstep: 241.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:19,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.07 | bwd_microstep: 244.25 | bwd_inner_microstep: 244.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:56:19,622] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.70 | optimizer_gradients: 0.76 | optimizer_step: 3.32 +[2024-12-31 17:56:19,623] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 257.46 | bwd_inner_microstep: 243.80 | bwd_allreduce_microstep: 13.57 | step_microstep: 12.11 +[2024-12-31 17:56:19,625] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2868.81 | bwd: 4132.93 | bwd_inner: 4118.28 | bwd_allreduce: 14.01 | step: 15.19 + 43%|████▎ | 328/759 [46:14<53:14, 7.41s/it] {'loss': 1.2598, 'learning_rate': 1.2656841439207093e-05, 'epoch': 0.43} + 43%|████▎ | 328/759 [46:14<53:14, 7.41s/it][2024-12-31 17:56:20,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 228.17 | bwd_microstep: 366.57 | bwd_inner_microstep: 366.19 | bwd_allreduce_microstep: 0.17 | step_microstep: 0.20 +[2024-12-31 17:56:20,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.36 | bwd_microstep: 316.82 | bwd_inner_microstep: 316.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 17:56:21,243] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.54 | bwd_microstep: 262.29 | bwd_inner_microstep: 262.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:56:21,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.85 | bwd_microstep: 255.05 | bwd_inner_microstep: 254.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 17:56:22,135] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.79 | bwd_microstep: 247.71 | bwd_inner_microstep: 247.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 17:56:22,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.08 | bwd_microstep: 246.11 | bwd_inner_microstep: 246.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:56:23,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.88 | bwd_microstep: 245.01 | bwd_inner_microstep: 244.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:23,444] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.70 | bwd_microstep: 245.10 | bwd_inner_microstep: 245.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:23,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.76 | bwd_microstep: 242.91 | bwd_inner_microstep: 242.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:56:24,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.27 | bwd_microstep: 250.39 | bwd_inner_microstep: 250.26 | bwd_allreduce_microstep: 0.02 | step_microstep: 0.25 +[2024-12-31 17:56:24,747] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.49 | bwd_microstep: 243.18 | bwd_inner_microstep: 243.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:56:25,175] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.63 | bwd_microstep: 241.10 | bwd_inner_microstep: 241.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:56:25,606] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.32 | bwd_microstep: 242.96 | bwd_inner_microstep: 242.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:56:26,033] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.39 | bwd_microstep: 242.00 | bwd_inner_microstep: 241.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:56:26,459] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.10 | bwd_microstep: 240.90 | bwd_inner_microstep: 240.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:26,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.31 | optimizer_gradients: 0.60 | optimizer_step: 3.09 +[2024-12-31 17:56:26,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.52 | bwd_microstep: 332.25 | bwd_inner_microstep: 241.57 | bwd_allreduce_microstep: 90.64 | step_microstep: 11.16 +[2024-12-31 17:56:26,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2837.23 | bwd: 4220.59 | bwd_inner: 4128.76 | bwd_allreduce: 90.98 | step: 14.40 + 43%|████▎ | 329/759 [46:21<53:00, 7.40s/it] {'loss': 1.2469, 'learning_rate': 1.2615666759740788e-05, 'epoch': 0.43} + 43%|████▎ | 329/759 [46:21<53:00, 7.40s/it][2024-12-31 17:56:27,554] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.68 | bwd_microstep: 336.03 | bwd_inner_microstep: 335.66 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 17:56:28,139] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 217.25 | bwd_microstep: 348.91 | bwd_inner_microstep: 348.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:56:28,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.46 | bwd_microstep: 263.33 | bwd_inner_microstep: 263.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:56:29,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.80 | bwd_microstep: 285.54 | bwd_inner_microstep: 285.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:56:29,538] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.20 | bwd_microstep: 265.66 | bwd_inner_microstep: 265.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:56:29,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.18 | bwd_microstep: 249.94 | bwd_inner_microstep: 249.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:56:30,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.66 | bwd_microstep: 248.12 | bwd_inner_microstep: 247.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:30,856] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.38 | bwd_microstep: 252.74 | bwd_inner_microstep: 252.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:31,289] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.84 | bwd_microstep: 244.73 | bwd_inner_microstep: 244.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:31,722] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.25 | bwd_microstep: 243.95 | bwd_inner_microstep: 243.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:56:32,157] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.21 | bwd_microstep: 245.02 | bwd_inner_microstep: 244.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:32,588] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.98 | bwd_microstep: 244.44 | bwd_inner_microstep: 244.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:33,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.55 | bwd_microstep: 243.10 | bwd_inner_microstep: 243.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:33,468] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.98 | bwd_microstep: 243.08 | bwd_inner_microstep: 243.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:33,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.22 | bwd_microstep: 244.02 | bwd_inner_microstep: 243.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:56:34,373] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.77 | optimizer_gradients: 0.79 | optimizer_step: 3.38 +[2024-12-31 17:56:34,374] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.19 | bwd_microstep: 277.73 | bwd_inner_microstep: 264.05 | bwd_allreduce_microstep: 13.56 | step_microstep: 11.36 +[2024-12-31 17:56:34,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2860.25 | bwd: 4236.52 | bwd_inner: 4221.77 | bwd_allreduce: 13.95 | step: 14.35 + 43%|████▎ | 330/759 [46:29<52:52, 7.39s/it] {'loss': 1.2612, 'learning_rate': 1.2574444423365503e-05, 'epoch': 0.43} + 43%|████▎ | 330/759 [46:29<52:52, 7.39s/it][2024-12-31 17:56:34,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.11 | bwd_microstep: 335.26 | bwd_inner_microstep: 334.91 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 17:56:35,550] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 226.42 | bwd_microstep: 360.60 | bwd_inner_microstep: 360.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:56:36,023] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.71 | bwd_microstep: 273.04 | bwd_inner_microstep: 273.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:56:36,491] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.43 | bwd_microstep: 267.32 | bwd_inner_microstep: 267.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:56:36,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.30 | bwd_microstep: 278.68 | bwd_inner_microstep: 278.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:56:37,403] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.18 | bwd_microstep: 247.19 | bwd_inner_microstep: 247.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:37,843] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.94 | bwd_microstep: 247.33 | bwd_inner_microstep: 247.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:38,278] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.03 | bwd_microstep: 246.92 | bwd_inner_microstep: 246.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:56:38,715] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.16 | bwd_microstep: 247.35 | bwd_inner_microstep: 247.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:39,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.83 | bwd_microstep: 260.29 | bwd_inner_microstep: 260.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:39,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 243.55 | bwd_inner_microstep: 243.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:40,026] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.43 | bwd_microstep: 244.50 | bwd_inner_microstep: 244.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:40,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.34 | bwd_microstep: 242.93 | bwd_inner_microstep: 242.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:40,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.51 | bwd_microstep: 242.69 | bwd_inner_microstep: 242.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:56:41,321] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.71 | bwd_microstep: 242.68 | bwd_inner_microstep: 242.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:56:41,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.62 | optimizer_gradients: 0.76 | optimizer_step: 3.92 +[2024-12-31 17:56:41,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.24 | bwd_microstep: 257.27 | bwd_inner_microstep: 243.61 | bwd_allreduce_microstep: 13.55 | step_microstep: 12.12 +[2024-12-31 17:56:41,778] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2860.46 | bwd: 4237.78 | bwd_inner: 4223.03 | bwd_allreduce: 13.92 | step: 15.19 + 44%|████▎ | 331/759 [46:36<52:45, 7.40s/it] {'loss': 1.2247, 'learning_rate': 1.2533175181143704e-05, 'epoch': 0.44} + 44%|████▎ | 331/759 [46:36<52:45, 7.40s/it][2024-12-31 17:56:42,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 245.40 | bwd_microstep: 450.05 | bwd_inner_microstep: 449.67 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 17:56:43,016] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 198.52 | bwd_microstep: 304.04 | bwd_inner_microstep: 304.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 17:56:43,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.44 | bwd_microstep: 283.86 | bwd_inner_microstep: 283.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:56:43,982] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 184.22 | bwd_microstep: 267.94 | bwd_inner_microstep: 267.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:44,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 184.96 | bwd_microstep: 257.07 | bwd_inner_microstep: 257.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:56:44,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.88 | bwd_microstep: 255.93 | bwd_inner_microstep: 255.71 | bwd_allreduce_microstep: 0.10 | step_microstep: 0.19 +[2024-12-31 17:56:45,331] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.11 | bwd_microstep: 247.50 | bwd_inner_microstep: 247.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:45,770] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.06 | bwd_microstep: 248.71 | bwd_inner_microstep: 248.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 17:56:46,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.52 | bwd_microstep: 245.65 | bwd_inner_microstep: 245.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:46,636] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.24 | bwd_microstep: 244.28 | bwd_inner_microstep: 244.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:47,071] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.86 | bwd_microstep: 245.76 | bwd_inner_microstep: 245.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:47,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 258.57 | bwd_inner_microstep: 258.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:56:47,956] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 247.48 | bwd_inner_microstep: 247.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 17:56:48,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 243.22 | bwd_inner_microstep: 243.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:56:48,814] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.74 | bwd_microstep: 240.20 | bwd_inner_microstep: 240.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:56:49,350] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.85 | optimizer_gradients: 8.01 | optimizer_step: 11.90 +[2024-12-31 17:56:49,351] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.08 | bwd_microstep: 327.54 | bwd_inner_microstep: 241.38 | bwd_allreduce_microstep: 86.12 | step_microstep: 29.53 +[2024-12-31 17:56:49,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2883.05 | bwd: 4367.96 | bwd_inner: 4280.75 | bwd_allreduce: 86.51 | step: 32.39 + 44%|████▎ | 332/759 [46:44<53:00, 7.45s/it] {'loss': 1.2132, 'learning_rate': 1.2491859784992477e-05, 'epoch': 0.44} + 44%|████▎ | 332/759 [46:44<53:00, 7.45s/it][2024-12-31 17:56:49,884] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 206.26 | bwd_microstep: 312.82 | bwd_inner_microstep: 312.47 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.19 +[2024-12-31 17:56:50,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.05 | bwd_microstep: 288.43 | bwd_inner_microstep: 288.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:50,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.55 | bwd_microstep: 266.91 | bwd_inner_microstep: 266.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:51,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.84 | bwd_microstep: 256.07 | bwd_inner_microstep: 256.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:51,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.48 | bwd_microstep: 254.08 | bwd_inner_microstep: 254.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:52,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.28 | bwd_microstep: 247.19 | bwd_inner_microstep: 247.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:56:52,622] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.74 | bwd_microstep: 251.60 | bwd_inner_microstep: 251.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:53,055] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 243.89 | bwd_inner_microstep: 243.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:56:53,489] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 245.24 | bwd_inner_microstep: 245.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:53,923] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.87 | bwd_microstep: 245.12 | bwd_inner_microstep: 245.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:56:54,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 245.21 | bwd_inner_microstep: 245.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:54,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.25 | bwd_microstep: 243.16 | bwd_inner_microstep: 242.98 | bwd_allreduce_microstep: 0.08 | step_microstep: 0.19 +[2024-12-31 17:56:55,219] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.72 | bwd_microstep: 243.24 | bwd_inner_microstep: 243.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:56:55,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.35 | bwd_microstep: 243.42 | bwd_inner_microstep: 243.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:56,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.67 | bwd_microstep: 245.05 | bwd_inner_microstep: 245.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:56:56,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.31 | optimizer_gradients: 0.61 | optimizer_step: 3.09 +[2024-12-31 17:56:56,691] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.33 | bwd_microstep: 410.31 | bwd_inner_microstep: 244.62 | bwd_allreduce_microstep: 165.65 | step_microstep: 12.98 +[2024-12-31 17:56:56,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2803.42 | bwd: 4241.87 | bwd_inner: 4075.20 | bwd_allreduce: 166.02 | step: 15.97 + 44%|████▍ | 333/759 [46:51<52:39, 7.42s/it] {'loss': 1.2206, 'learning_rate': 1.245049898766982e-05, 'epoch': 0.44} + 44%|████▍ | 333/759 [46:51<52:39, 7.42s/it][2024-12-31 17:56:57,285] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.35 | bwd_microstep: 354.66 | bwd_inner_microstep: 354.32 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.24 +[2024-12-31 17:56:57,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.91 | bwd_microstep: 289.09 | bwd_inner_microstep: 289.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:56:58,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.81 | bwd_microstep: 268.51 | bwd_inner_microstep: 268.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:56:58,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.85 | bwd_microstep: 264.23 | bwd_inner_microstep: 264.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:59,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.27 | bwd_microstep: 251.46 | bwd_inner_microstep: 251.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:56:59,592] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.65 | bwd_microstep: 249.61 | bwd_inner_microstep: 249.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:57:00,024] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 244.94 | bwd_inner_microstep: 244.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:00,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.05 | bwd_microstep: 246.30 | bwd_inner_microstep: 246.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:57:00,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.80 | bwd_microstep: 245.99 | bwd_inner_microstep: 245.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:01,342] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.01 | bwd_microstep: 243.86 | bwd_inner_microstep: 243.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:01,773] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.94 | bwd_microstep: 244.35 | bwd_inner_microstep: 244.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:57:02,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.55 | bwd_microstep: 252.10 | bwd_inner_microstep: 252.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:57:02,645] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.42 | bwd_microstep: 244.02 | bwd_inner_microstep: 243.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:57:03,075] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.37 | bwd_microstep: 244.10 | bwd_inner_microstep: 244.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:57:03,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.94 | bwd_microstep: 241.40 | bwd_inner_microstep: 241.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:04,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.20 | optimizer_gradients: 0.67 | optimizer_step: 3.11 +[2024-12-31 17:57:04,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.50 | bwd_microstep: 302.31 | bwd_inner_microstep: 246.47 | bwd_allreduce_microstep: 55.79 | step_microstep: 11.48 +[2024-12-31 17:57:04,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2829.96 | bwd: 4187.03 | bwd_inner: 4130.35 | bwd_allreduce: 56.06 | step: 14.52 + 44%|████▍ | 334/759 [46:58<52:18, 7.39s/it] {'loss': 1.2533, 'learning_rate': 1.2409093542760925e-05, 'epoch': 0.44} + 44%|████▍ | 334/759 [46:58<52:18, 7.39s/it][2024-12-31 17:57:04,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 221.15 | bwd_microstep: 334.24 | bwd_inner_microstep: 333.90 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 17:57:05,072] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.30 | bwd_microstep: 289.98 | bwd_inner_microstep: 289.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:57:05,564] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.20 | bwd_microstep: 283.90 | bwd_inner_microstep: 283.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:06,023] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 184.83 | bwd_microstep: 255.63 | bwd_inner_microstep: 255.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:57:06,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.31 | bwd_microstep: 250.13 | bwd_inner_microstep: 250.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:57:06,897] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.06 | bwd_microstep: 245.94 | bwd_inner_microstep: 245.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:07,330] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.49 | bwd_microstep: 245.52 | bwd_inner_microstep: 245.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:57:07,766] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.59 | bwd_microstep: 246.40 | bwd_inner_microstep: 246.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:08,198] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.40 | bwd_microstep: 244.72 | bwd_inner_microstep: 244.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:08,631] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.75 | bwd_microstep: 243.50 | bwd_inner_microstep: 243.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:09,062] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 243.34 | bwd_inner_microstep: 243.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:57:09,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.38 | bwd_microstep: 243.46 | bwd_inner_microstep: 243.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 17:57:09,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 243.52 | bwd_inner_microstep: 243.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:10,384] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 196.91 | bwd_microstep: 243.16 | bwd_inner_microstep: 243.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:57:10,811] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.56 | bwd_microstep: 241.77 | bwd_inner_microstep: 241.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:57:11,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.58 | optimizer_gradients: 0.74 | optimizer_step: 3.30 +[2024-12-31 17:57:11,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.38 | bwd_microstep: 256.31 | bwd_inner_microstep: 242.71 | bwd_allreduce_microstep: 13.51 | step_microstep: 10.64 +[2024-12-31 17:57:11,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2855.22 | bwd: 4111.66 | bwd_inner: 4097.13 | bwd_allreduce: 13.82 | step: 13.75 + 44%|████▍ | 335/759 [47:06<51:55, 7.35s/it] {'loss': 1.2584, 'learning_rate': 1.2367644204664468e-05, 'epoch': 0.44} + 44%|████▍ | 335/759 [47:06<51:55, 7.35s/it][2024-12-31 17:57:11,836] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.72 | bwd_microstep: 341.53 | bwd_inner_microstep: 341.15 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 17:57:12,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.01 | bwd_microstep: 297.76 | bwd_inner_microstep: 297.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:57:12,843] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.50 | bwd_microstep: 285.98 | bwd_inner_microstep: 285.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:13,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.48 | bwd_microstep: 268.75 | bwd_inner_microstep: 268.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:57:13,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.86 | bwd_microstep: 256.08 | bwd_inner_microstep: 256.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:14,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.04 | bwd_microstep: 255.61 | bwd_inner_microstep: 255.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:14,668] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.41 | bwd_microstep: 247.17 | bwd_inner_microstep: 247.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 17:57:15,102] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.87 | bwd_microstep: 247.91 | bwd_inner_microstep: 247.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:57:15,538] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.91 | bwd_microstep: 245.96 | bwd_inner_microstep: 245.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 17:57:15,972] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.65 | bwd_microstep: 246.01 | bwd_inner_microstep: 245.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:57:16,406] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.56 | bwd_microstep: 246.39 | bwd_inner_microstep: 246.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:16,849] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 255.29 | bwd_inner_microstep: 255.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:17,280] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 243.21 | bwd_inner_microstep: 243.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:57:17,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.22 | bwd_microstep: 243.99 | bwd_inner_microstep: 243.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:57:18,142] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.33 | bwd_microstep: 242.83 | bwd_inner_microstep: 242.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:57:18,610] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.12 | optimizer_gradients: 0.65 | optimizer_step: 3.16 +[2024-12-31 17:57:18,610] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.63 | bwd_microstep: 270.96 | bwd_inner_microstep: 244.16 | bwd_allreduce_microstep: 26.73 | step_microstep: 11.23 +[2024-12-31 17:57:18,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2863.00 | bwd: 4195.48 | bwd_inner: 4167.93 | bwd_allreduce: 27.02 | step: 14.25 + 44%|████▍ | 336/759 [47:13<51:48, 7.35s/it] {'loss': 1.2136, 'learning_rate': 1.2326151728578839e-05, 'epoch': 0.44} + 44%|████▍ | 336/759 [47:13<51:48, 7.35s/it][2024-12-31 17:57:19,198] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 221.27 | bwd_microstep: 351.56 | bwd_inner_microstep: 351.19 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 17:57:19,698] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.92 | bwd_microstep: 290.14 | bwd_inner_microstep: 290.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:20,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.73 | bwd_microstep: 268.28 | bwd_inner_microstep: 268.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:57:20,618] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.03 | bwd_microstep: 256.03 | bwd_inner_microstep: 256.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:21,056] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.15 | bwd_microstep: 249.60 | bwd_inner_microstep: 249.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:57:21,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.08 | bwd_microstep: 250.62 | bwd_inner_microstep: 250.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:57:21,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.93 | bwd_microstep: 245.14 | bwd_inner_microstep: 245.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:57:22,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.02 | bwd_microstep: 246.95 | bwd_inner_microstep: 246.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:22,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.63 | bwd_microstep: 250.59 | bwd_inner_microstep: 250.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:57:23,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.63 | bwd_microstep: 244.71 | bwd_inner_microstep: 244.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:57:23,667] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 243.10 | bwd_inner_microstep: 243.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:57:24,096] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.72 | bwd_microstep: 243.01 | bwd_inner_microstep: 242.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:24,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.85 | bwd_microstep: 242.75 | bwd_inner_microstep: 242.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:57:24,955] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.23 | bwd_microstep: 251.91 | bwd_inner_microstep: 251.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:25,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.18 | bwd_microstep: 240.92 | bwd_inner_microstep: 240.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:57:26,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.46 | optimizer_gradients: 0.62 | optimizer_step: 3.09 +[2024-12-31 17:57:26,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.22 | bwd_microstep: 450.06 | bwd_inner_microstep: 243.45 | bwd_allreduce_microstep: 206.56 | step_microstep: 11.50 +[2024-12-31 17:57:26,035] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2824.30 | bwd: 4325.43 | bwd_inner: 4118.07 | bwd_allreduce: 206.83 | step: 14.30 + 44%|████▍ | 337/759 [47:20<51:50, 7.37s/it] {'loss': 1.2554, 'learning_rate': 1.228461687048839e-05, 'epoch': 0.44} + 44%|████▍ | 337/759 [47:20<51:50, 7.37s/it][2024-12-31 17:57:26,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 245.60 | bwd_microstep: 388.97 | bwd_inner_microstep: 388.61 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 17:57:27,182] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.90 | bwd_microstep: 288.49 | bwd_inner_microstep: 288.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:27,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.15 | bwd_microstep: 262.96 | bwd_inner_microstep: 262.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:28,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.94 | bwd_microstep: 262.77 | bwd_inner_microstep: 262.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:28,547] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.36 | bwd_microstep: 257.45 | bwd_inner_microstep: 257.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:57:28,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.43 | bwd_microstep: 255.46 | bwd_inner_microstep: 255.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:29,430] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.42 | bwd_microstep: 247.31 | bwd_inner_microstep: 247.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:29,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.51 | bwd_microstep: 249.10 | bwd_inner_microstep: 249.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:57:30,316] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.18 | bwd_microstep: 248.96 | bwd_inner_microstep: 248.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:57:30,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.45 | bwd_microstep: 248.41 | bwd_inner_microstep: 248.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:57:31,198] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 245.10 | bwd_inner_microstep: 245.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:31,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.48 | bwd_microstep: 246.25 | bwd_inner_microstep: 246.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:32,064] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.58 | bwd_microstep: 244.06 | bwd_inner_microstep: 244.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:57:32,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.96 | bwd_microstep: 248.71 | bwd_inner_microstep: 248.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:32,933] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.30 | bwd_microstep: 243.53 | bwd_inner_microstep: 243.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:33,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.07 | optimizer_gradients: 0.85 | optimizer_step: 3.25 +[2024-12-31 17:57:33,533] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.04 | bwd_microstep: 402.88 | bwd_inner_microstep: 243.88 | bwd_allreduce_microstep: 158.93 | step_microstep: 12.43 +[2024-12-31 17:57:33,534] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2873.05 | bwd: 4340.50 | bwd_inner: 4180.73 | bwd_allreduce: 159.19 | step: 15.27 + 45%|████▍ | 338/759 [47:28<51:59, 7.41s/it] {'loss': 1.2313, 'learning_rate': 1.2243040387149682e-05, 'epoch': 0.45} + 45%|████▍ | 338/759 [47:28<51:59, 7.41s/it][2024-12-31 17:57:34,169] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 235.38 | bwd_microstep: 384.69 | bwd_inner_microstep: 384.32 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 17:57:34,840] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 246.83 | bwd_microstep: 407.02 | bwd_inner_microstep: 406.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:57:35,351] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.42 | bwd_microstep: 310.80 | bwd_inner_microstep: 310.63 | bwd_allreduce_microstep: 0.06 | step_microstep: 0.19 +[2024-12-31 17:57:35,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.51 | bwd_microstep: 254.79 | bwd_inner_microstep: 254.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:36,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.81 | bwd_microstep: 261.28 | bwd_inner_microstep: 261.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:57:36,724] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 184.74 | bwd_microstep: 265.12 | bwd_inner_microstep: 265.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:37,159] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.10 | bwd_microstep: 246.64 | bwd_inner_microstep: 246.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:37,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.88 | bwd_microstep: 245.98 | bwd_inner_microstep: 245.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:57:38,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.28 | bwd_microstep: 243.56 | bwd_inner_microstep: 243.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:38,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.08 | bwd_microstep: 244.73 | bwd_inner_microstep: 244.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:57:38,903] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.91 | bwd_microstep: 259.53 | bwd_inner_microstep: 259.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:57:39,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.36 | bwd_microstep: 243.73 | bwd_inner_microstep: 243.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:39,766] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.46 | bwd_microstep: 243.26 | bwd_inner_microstep: 243.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:57:40,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.23 | bwd_microstep: 243.71 | bwd_inner_microstep: 243.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:40,625] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.69 | bwd_microstep: 242.95 | bwd_inner_microstep: 242.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:57:41,076] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.58 | optimizer_gradients: 0.78 | optimizer_step: 3.60 +[2024-12-31 17:57:41,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.53 | bwd_microstep: 255.67 | bwd_inner_microstep: 242.00 | bwd_allreduce_microstep: 13.56 | step_microstep: 11.61 +[2024-12-31 17:57:41,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2900.62 | bwd: 4353.62 | bwd_inner: 4338.95 | bwd_allreduce: 13.94 | step: 14.65 + 45%|████▍ | 339/759 [47:36<52:09, 7.45s/it] {'loss': 1.2444, 'learning_rate': 1.2201423036077657e-05, 'epoch': 0.45} + 45%|████▍ | 339/759 [47:36<52:09, 7.45s/it][2024-12-31 17:57:41,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 212.81 | bwd_microstep: 334.74 | bwd_inner_microstep: 334.38 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 17:57:42,109] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.41 | bwd_microstep: 268.62 | bwd_inner_microstep: 268.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:42,565] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.20 | bwd_microstep: 261.56 | bwd_inner_microstep: 261.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:57:43,021] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.55 | bwd_microstep: 261.48 | bwd_inner_microstep: 261.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:57:43,461] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.86 | bwd_microstep: 249.60 | bwd_inner_microstep: 249.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:57:43,897] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.81 | bwd_microstep: 247.85 | bwd_inner_microstep: 247.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 17:57:44,342] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 253.61 | bwd_inner_microstep: 253.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:44,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 226.38 | bwd_microstep: 246.74 | bwd_inner_microstep: 246.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 17:57:45,268] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.34 | bwd_microstep: 245.72 | bwd_inner_microstep: 245.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:45,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.78 | bwd_microstep: 248.96 | bwd_inner_microstep: 248.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:57:46,136] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.47 | bwd_microstep: 242.93 | bwd_inner_microstep: 242.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:46,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.53 | bwd_microstep: 245.17 | bwd_inner_microstep: 245.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:47,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.34 | bwd_microstep: 242.87 | bwd_inner_microstep: 242.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:47,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.23 | bwd_microstep: 244.12 | bwd_inner_microstep: 243.93 | bwd_allreduce_microstep: 0.03 | step_microstep: 0.31 +[2024-12-31 17:57:47,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.48 | bwd_microstep: 240.73 | bwd_inner_microstep: 240.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:57:48,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 7.65 | optimizer_gradients: 0.63 | optimizer_step: 3.11 +[2024-12-31 17:57:48,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.79 | bwd_microstep: 328.73 | bwd_inner_microstep: 254.20 | bwd_allreduce_microstep: 74.48 | step_microstep: 15.49 +[2024-12-31 17:57:48,400] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2850.71 | bwd: 4163.54 | bwd_inner: 4088.05 | bwd_allreduce: 74.82 | step: 18.72 + 45%|████▍ | 340/759 [47:43<51:45, 7.41s/it] {'loss': 1.2384, 'learning_rate': 1.2159765575531877e-05, 'epoch': 0.45} + 45%|████▍ | 340/759 [47:43<51:45, 7.41s/it][2024-12-31 17:57:48,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 218.94 | bwd_microstep: 346.72 | bwd_inner_microstep: 346.38 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 17:57:49,584] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 222.52 | bwd_microstep: 362.17 | bwd_inner_microstep: 362.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:57:50,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.37 | bwd_microstep: 267.63 | bwd_inner_microstep: 267.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 17:57:50,500] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.59 | bwd_microstep: 256.67 | bwd_inner_microstep: 256.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:57:50,953] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.97 | bwd_microstep: 259.29 | bwd_inner_microstep: 259.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:57:51,389] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.03 | bwd_microstep: 247.98 | bwd_inner_microstep: 247.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:51,825] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.90 | bwd_microstep: 247.44 | bwd_inner_microstep: 247.10 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.34 +[2024-12-31 17:57:52,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.33 | bwd_microstep: 244.66 | bwd_inner_microstep: 244.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:52,697] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.70 | bwd_microstep: 245.79 | bwd_inner_microstep: 245.47 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.34 +[2024-12-31 17:57:53,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.16 | bwd_microstep: 244.71 | bwd_inner_microstep: 244.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:53,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 244.48 | bwd_inner_microstep: 244.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:57:53,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.90 | bwd_microstep: 243.60 | bwd_inner_microstep: 243.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:57:54,445] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.32 | bwd_microstep: 257.25 | bwd_inner_microstep: 257.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:57:54,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.65 | bwd_microstep: 244.39 | bwd_inner_microstep: 244.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:55,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.18 | bwd_microstep: 241.08 | bwd_inner_microstep: 241.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:57:55,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.75 | optimizer_step: 3.30 +[2024-12-31 17:57:55,760] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.42 | bwd_microstep: 258.57 | bwd_inner_microstep: 244.63 | bwd_allreduce_microstep: 13.81 | step_microstep: 11.09 +[2024-12-31 17:57:55,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2859.27 | bwd: 4212.68 | bwd_inner: 4197.24 | bwd_allreduce: 14.53 | step: 14.22 + 45%|████▍ | 341/759 [47:50<51:31, 7.40s/it] {'loss': 1.2357, 'learning_rate': 1.2118068764502677e-05, 'epoch': 0.45} + 45%|████▍ | 341/759 [47:50<51:31, 7.40s/it][2024-12-31 17:57:56,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 212.77 | bwd_microstep: 312.88 | bwd_inner_microstep: 312.52 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 17:57:56,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 196.26 | bwd_microstep: 295.60 | bwd_inner_microstep: 295.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 17:57:57,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.75 | bwd_microstep: 263.06 | bwd_inner_microstep: 263.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 17:57:57,755] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.98 | bwd_microstep: 255.26 | bwd_inner_microstep: 255.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:57:58,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.87 | bwd_microstep: 251.72 | bwd_inner_microstep: 251.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:58,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.56 | bwd_microstep: 247.05 | bwd_inner_microstep: 247.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 17:57:59,075] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.03 | bwd_microstep: 252.02 | bwd_inner_microstep: 252.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:59,510] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 246.78 | bwd_inner_microstep: 246.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:57:59,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.56 | bwd_microstep: 245.04 | bwd_inner_microstep: 245.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:00,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.99 | bwd_microstep: 245.17 | bwd_inner_microstep: 245.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:58:00,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.59 | bwd_microstep: 244.12 | bwd_inner_microstep: 244.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:58:01,239] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.52 | bwd_microstep: 243.48 | bwd_inner_microstep: 243.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:01,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 243.94 | bwd_inner_microstep: 243.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:58:02,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.96 | bwd_microstep: 244.08 | bwd_inner_microstep: 244.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:02,531] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.02 | bwd_microstep: 241.35 | bwd_inner_microstep: 241.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:58:03,024] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.24 | optimizer_gradients: 0.64 | optimizer_step: 3.13 +[2024-12-31 17:58:03,024] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.24 | bwd_microstep: 296.80 | bwd_inner_microstep: 244.01 | bwd_allreduce_microstep: 52.75 | step_microstep: 11.36 +[2024-12-31 17:58:03,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2807.58 | bwd: 4128.45 | bwd_inner: 4074.88 | bwd_allreduce: 53.01 | step: 14.39 + 45%|████▌ | 342/759 [47:57<51:07, 7.36s/it] {'loss': 1.2348, 'learning_rate': 1.2076333362697358e-05, 'epoch': 0.45} + 45%|████▌ | 342/759 [47:57<51:07, 7.36s/it][2024-12-31 17:58:03,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 305.88 | bwd_microstep: 500.29 | bwd_inner_microstep: 499.91 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 17:58:04,355] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.72 | bwd_microstep: 292.07 | bwd_inner_microstep: 292.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:04,822] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.54 | bwd_microstep: 267.48 | bwd_inner_microstep: 267.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:58:05,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.24 | bwd_microstep: 366.41 | bwd_inner_microstep: 366.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:58:05,840] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.51 | bwd_microstep: 258.04 | bwd_inner_microstep: 258.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:58:06,287] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.41 | bwd_microstep: 255.21 | bwd_inner_microstep: 254.96 | bwd_allreduce_microstep: 0.08 | step_microstep: 0.31 +[2024-12-31 17:58:06,725] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.13 | bwd_microstep: 248.30 | bwd_inner_microstep: 248.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:58:07,158] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.09 | bwd_microstep: 247.92 | bwd_inner_microstep: 247.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:07,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.89 | bwd_microstep: 247.69 | bwd_inner_microstep: 247.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:58:08,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.37 | bwd_microstep: 245.85 | bwd_inner_microstep: 245.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:08,482] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.02 | bwd_microstep: 251.85 | bwd_inner_microstep: 251.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:58:08,906] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.39 | bwd_microstep: 244.40 | bwd_inner_microstep: 244.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:09,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.50 | bwd_microstep: 242.71 | bwd_inner_microstep: 242.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:58:09,768] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.03 | bwd_microstep: 244.33 | bwd_inner_microstep: 244.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:10,210] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.77 | bwd_microstep: 252.86 | bwd_inner_microstep: 252.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:58:10,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.81 | optimizer_gradients: 5.26 | optimizer_step: 12.87 +[2024-12-31 17:58:10,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.81 | bwd_microstep: 255.67 | bwd_inner_microstep: 241.33 | bwd_allreduce_microstep: 14.22 | step_microstep: 27.64 +[2024-12-31 17:58:10,675] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2930.73 | bwd: 4421.24 | bwd_inner: 4405.82 | bwd_allreduce: 14.67 | step: 30.25 + 45%|████▌ | 343/759 [48:05<51:37, 7.45s/it] {'loss': 1.2197, 'learning_rate': 1.2034560130526341e-05, 'epoch': 0.45} + 45%|████▌ | 343/759 [48:05<51:37, 7.45s/it][2024-12-31 17:58:11,265] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 220.61 | bwd_microstep: 352.02 | bwd_inner_microstep: 351.67 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.22 +[2024-12-31 17:58:11,768] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.88 | bwd_microstep: 292.53 | bwd_inner_microstep: 292.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:58:12,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.57 | bwd_microstep: 282.53 | bwd_inner_microstep: 282.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 17:58:12,716] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.59 | bwd_microstep: 264.30 | bwd_inner_microstep: 264.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:13,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.55 | bwd_microstep: 254.92 | bwd_inner_microstep: 254.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:58:13,634] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.33 | bwd_microstep: 273.85 | bwd_inner_microstep: 273.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:14,072] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.47 | bwd_microstep: 248.35 | bwd_inner_microstep: 248.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:14,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.13 | bwd_microstep: 246.94 | bwd_inner_microstep: 246.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:14,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.84 | bwd_microstep: 249.18 | bwd_inner_microstep: 249.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:15,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.75 | bwd_microstep: 246.22 | bwd_inner_microstep: 246.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:15,827] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.82 | bwd_microstep: 246.00 | bwd_inner_microstep: 245.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:58:16,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.82 | bwd_microstep: 244.21 | bwd_inner_microstep: 244.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:16,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.55 | bwd_microstep: 242.88 | bwd_inner_microstep: 242.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:17,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.03 | bwd_microstep: 244.05 | bwd_inner_microstep: 244.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:17,569] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.14 | bwd_microstep: 260.13 | bwd_inner_microstep: 260.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:18,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.10 | optimizer_gradients: 0.72 | optimizer_step: 3.37 +[2024-12-31 17:58:18,033] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.22 | bwd_microstep: 259.89 | bwd_inner_microstep: 244.47 | bwd_allreduce_microstep: 15.33 | step_microstep: 11.78 +[2024-12-31 17:58:18,035] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2835.71 | bwd: 4208.13 | bwd_inner: 4191.86 | bwd_allreduce: 15.60 | step: 14.77 + 45%|████▌ | 344/759 [48:12<51:18, 7.42s/it] {'loss': 1.2181, 'learning_rate': 1.199274982908929e-05, 'epoch': 0.45} + 45%|████▌ | 344/759 [48:13<51:18, 7.42s/it][2024-12-31 17:58:18,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 269.78 | bwd_microstep: 451.97 | bwd_inner_microstep: 451.63 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 17:58:19,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.93 | bwd_microstep: 335.99 | bwd_inner_microstep: 335.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:19,895] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.91 | bwd_microstep: 288.93 | bwd_inner_microstep: 288.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:58:20,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.29 | bwd_microstep: 264.87 | bwd_inner_microstep: 264.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:58:20,810] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.47 | bwd_microstep: 257.85 | bwd_inner_microstep: 257.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:58:21,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.13 | bwd_microstep: 257.01 | bwd_inner_microstep: 256.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:58:21,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.80 | bwd_microstep: 249.76 | bwd_inner_microstep: 249.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:22,146] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.80 | bwd_microstep: 250.39 | bwd_inner_microstep: 250.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:58:22,580] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.54 | bwd_microstep: 246.11 | bwd_inner_microstep: 245.93 | bwd_allreduce_microstep: 0.03 | step_microstep: 0.27 +[2024-12-31 17:58:23,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.24 | bwd_microstep: 257.36 | bwd_inner_microstep: 257.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:58:23,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.66 | bwd_microstep: 245.22 | bwd_inner_microstep: 245.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.11 +[2024-12-31 17:58:23,887] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.25 | bwd_microstep: 244.27 | bwd_inner_microstep: 244.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:24,321] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.27 | bwd_microstep: 244.18 | bwd_inner_microstep: 244.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:58:24,747] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.86 | bwd_microstep: 244.85 | bwd_inner_microstep: 244.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:25,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.57 | bwd_microstep: 245.10 | bwd_inner_microstep: 245.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:25,632] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.56 | optimizer_gradients: 0.75 | optimizer_step: 3.26 +[2024-12-31 17:58:25,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.45 | bwd_microstep: 256.93 | bwd_inner_microstep: 243.26 | bwd_allreduce_microstep: 13.55 | step_microstep: 10.87 +[2024-12-31 17:58:25,635] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2918.38 | bwd: 4340.93 | bwd_inner: 4326.31 | bwd_allreduce: 13.91 | step: 13.59 + 45%|████▌ | 345/759 [48:20<51:34, 7.47s/it] {'loss': 1.236, 'learning_rate': 1.1950903220161286e-05, 'epoch': 0.45} + 45%|████▌ | 345/759 [48:20<51:34, 7.47s/it][2024-12-31 17:58:26,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 236.90 | bwd_microstep: 315.72 | bwd_inner_microstep: 315.36 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 17:58:26,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.48 | bwd_microstep: 280.99 | bwd_inner_microstep: 280.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:58:27,147] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.00 | bwd_microstep: 262.24 | bwd_inner_microstep: 262.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:58:27,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.62 | bwd_microstep: 248.97 | bwd_inner_microstep: 248.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:58:28,024] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.89 | bwd_microstep: 248.09 | bwd_inner_microstep: 248.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:58:28,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.73 | bwd_microstep: 247.88 | bwd_inner_microstep: 247.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:28,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.98 | bwd_microstep: 247.46 | bwd_inner_microstep: 247.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:58:29,341] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.87 | bwd_microstep: 246.04 | bwd_inner_microstep: 246.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:58:29,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.57 | bwd_microstep: 245.40 | bwd_inner_microstep: 245.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:58:30,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.33 | bwd_microstep: 246.65 | bwd_inner_microstep: 246.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:58:30,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 243.69 | bwd_inner_microstep: 243.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:58:31,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.82 | bwd_microstep: 243.97 | bwd_inner_microstep: 243.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:58:31,519] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.38 | bwd_microstep: 245.67 | bwd_inner_microstep: 245.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:58:31,951] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.51 | bwd_microstep: 245.69 | bwd_inner_microstep: 245.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:58:32,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.31 | bwd_microstep: 242.30 | bwd_inner_microstep: 242.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:32,962] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 1.21 | optimizer_step: 3.12 +[2024-12-31 17:58:32,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.09 | bwd_microstep: 387.38 | bwd_inner_microstep: 242.10 | bwd_allreduce_microstep: 145.24 | step_microstep: 10.92 +[2024-12-31 17:58:32,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2835.83 | bwd: 4198.34 | bwd_inner: 4052.15 | bwd_allreduce: 145.50 | step: 13.91 + 46%|████▌ | 346/759 [48:27<51:08, 7.43s/it] {'loss': 1.2416, 'learning_rate': 1.1909021066178906e-05, 'epoch': 0.46} + 46%|████▌ | 346/759 [48:27<51:08, 7.43s/it][2024-12-31 17:58:33,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.40 | bwd_microstep: 315.21 | bwd_inner_microstep: 314.85 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 17:58:33,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.94 | bwd_microstep: 285.00 | bwd_inner_microstep: 284.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 17:58:34,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.10 | bwd_microstep: 279.95 | bwd_inner_microstep: 279.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:58:34,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.80 | bwd_microstep: 258.05 | bwd_inner_microstep: 258.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:58:35,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.29 | bwd_microstep: 256.98 | bwd_inner_microstep: 256.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 17:58:35,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.87 | bwd_microstep: 248.47 | bwd_inner_microstep: 248.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:36,242] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.76 | bwd_microstep: 248.08 | bwd_inner_microstep: 248.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:58:36,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 245.78 | bwd_inner_microstep: 245.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:58:37,118] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.58 | bwd_microstep: 253.05 | bwd_inner_microstep: 253.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:37,549] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.02 | bwd_microstep: 244.22 | bwd_inner_microstep: 244.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:37,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.10 | bwd_microstep: 244.36 | bwd_inner_microstep: 244.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:38,416] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.04 | bwd_microstep: 241.22 | bwd_inner_microstep: 241.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:38,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.65 | bwd_microstep: 243.59 | bwd_inner_microstep: 243.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:58:39,276] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.50 | bwd_microstep: 241.45 | bwd_inner_microstep: 241.24 | bwd_allreduce_microstep: 0.11 | step_microstep: 0.20 +[2024-12-31 17:58:39,728] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.55 | bwd_microstep: 264.17 | bwd_inner_microstep: 264.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:40,502] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.55 | optimizer_gradients: 0.74 | optimizer_step: 10.61 +[2024-12-31 17:58:40,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.65 | bwd_microstep: 570.52 | bwd_inner_microstep: 242.93 | bwd_allreduce_microstep: 327.55 | step_microstep: 19.05 +[2024-12-31 17:58:40,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2799.01 | bwd: 4440.25 | bwd_inner: 4111.63 | bwd_allreduce: 327.92 | step: 22.20 + 46%|████▌ | 347/759 [48:35<51:14, 7.46s/it] {'loss': 1.2631, 'learning_rate': 1.1867104130226363e-05, 'epoch': 0.46} + 46%|████▌ | 347/759 [48:35<51:14, 7.46s/it][2024-12-31 17:58:41,115] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 232.69 | bwd_microstep: 364.55 | bwd_inner_microstep: 364.19 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 17:58:41,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.05 | bwd_microstep: 281.28 | bwd_inner_microstep: 281.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:58:42,064] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.90 | bwd_microstep: 265.12 | bwd_inner_microstep: 265.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:42,514] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.67 | bwd_microstep: 255.83 | bwd_inner_microstep: 255.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:58:42,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.13 | bwd_microstep: 258.85 | bwd_inner_microstep: 258.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:58:43,404] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.44 | bwd_microstep: 247.90 | bwd_inner_microstep: 247.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:43,838] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.68 | bwd_microstep: 247.38 | bwd_inner_microstep: 247.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:44,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.46 | bwd_microstep: 246.20 | bwd_inner_microstep: 246.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:44,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.43 | bwd_microstep: 244.30 | bwd_inner_microstep: 244.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:58:45,138] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.60 | bwd_microstep: 246.45 | bwd_inner_microstep: 246.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:45,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.18 | bwd_microstep: 243.80 | bwd_inner_microstep: 243.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:58:45,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.05 | bwd_microstep: 246.71 | bwd_inner_microstep: 246.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:58:46,440] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.31 | bwd_microstep: 244.05 | bwd_inner_microstep: 244.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:46,897] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.90 | bwd_microstep: 269.57 | bwd_inner_microstep: 269.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:47,325] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.56 | bwd_microstep: 241.54 | bwd_inner_microstep: 241.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:58:47,797] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.82 | optimizer_gradients: 0.73 | optimizer_step: 3.29 +[2024-12-31 17:58:47,798] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 194.91 | bwd_microstep: 258.52 | bwd_inner_microstep: 243.68 | bwd_allreduce_microstep: 14.71 | step_microstep: 11.25 +[2024-12-31 17:58:47,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2860.40 | bwd: 4162.13 | bwd_inner: 4146.53 | bwd_allreduce: 15.00 | step: 14.05 + 46%|████▌ | 348/759 [48:42<50:46, 7.41s/it] {'loss': 1.2359, 'learning_rate': 1.1825153176021591e-05, 'epoch': 0.46} + 46%|████▌ | 348/759 [48:42<50:46, 7.41s/it][2024-12-31 17:58:48,371] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.04 | bwd_microstep: 340.29 | bwd_inner_microstep: 339.94 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 17:58:48,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 200.24 | bwd_microstep: 313.06 | bwd_inner_microstep: 313.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 17:58:49,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.44 | bwd_microstep: 283.21 | bwd_inner_microstep: 283.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:58:49,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.03 | bwd_microstep: 262.37 | bwd_inner_microstep: 262.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:58:50,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.77 | bwd_microstep: 257.18 | bwd_inner_microstep: 257.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:58:50,747] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.74 | bwd_microstep: 249.62 | bwd_inner_microstep: 249.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:51,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.68 | bwd_microstep: 255.28 | bwd_inner_microstep: 255.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 17:58:51,629] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.65 | bwd_microstep: 247.43 | bwd_inner_microstep: 247.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:58:52,081] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 261.97 | bwd_inner_microstep: 261.62 | bwd_allreduce_microstep: 0.20 | step_microstep: 0.20 +[2024-12-31 17:58:52,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.52 | bwd_microstep: 246.31 | bwd_inner_microstep: 246.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:52,951] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.69 | bwd_microstep: 246.10 | bwd_inner_microstep: 246.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:53,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.10 | bwd_microstep: 243.89 | bwd_inner_microstep: 243.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:58:53,807] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.11 | bwd_microstep: 243.40 | bwd_inner_microstep: 243.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 17:58:54,237] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.41 | bwd_microstep: 243.25 | bwd_inner_microstep: 243.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:58:54,664] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.27 | bwd_microstep: 240.93 | bwd_inner_microstep: 240.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:55,120] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.09 | optimizer_gradients: 0.57 | optimizer_step: 3.14 +[2024-12-31 17:58:55,120] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.70 | bwd_microstep: 255.31 | bwd_inner_microstep: 241.76 | bwd_allreduce_microstep: 13.49 | step_microstep: 11.19 +[2024-12-31 17:58:55,122] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2846.31 | bwd: 4189.74 | bwd_inner: 4175.08 | bwd_allreduce: 13.96 | step: 14.09 + 46%|████▌ | 349/759 [48:50<50:28, 7.39s/it] {'loss': 1.2483, 'learning_rate': 1.1783168967902314e-05, 'epoch': 0.46} + 46%|████▌ | 349/759 [48:50<50:28, 7.39s/it][2024-12-31 17:58:55,714] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 221.13 | bwd_microstep: 354.72 | bwd_inner_microstep: 354.36 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 17:58:56,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.22 | bwd_microstep: 340.08 | bwd_inner_microstep: 340.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:56,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.12 | bwd_microstep: 262.68 | bwd_inner_microstep: 262.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:58:57,198] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.95 | bwd_microstep: 257.45 | bwd_inner_microstep: 257.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:58:57,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.33 | bwd_microstep: 256.26 | bwd_inner_microstep: 256.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:58:58,085] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.50 | bwd_microstep: 248.86 | bwd_inner_microstep: 248.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:58:58,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.80 | bwd_microstep: 246.00 | bwd_inner_microstep: 245.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 17:58:58,976] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.43 | bwd_microstep: 268.73 | bwd_inner_microstep: 268.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:58:59,420] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.95 | bwd_microstep: 254.37 | bwd_inner_microstep: 254.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:58:59,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.41 | bwd_microstep: 245.48 | bwd_inner_microstep: 245.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:00,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.98 | bwd_microstep: 246.23 | bwd_inner_microstep: 246.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:00,716] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.27 | bwd_microstep: 243.06 | bwd_inner_microstep: 243.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:59:01,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.41 | bwd_microstep: 246.10 | bwd_inner_microstep: 246.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:01,583] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 245.07 | bwd_inner_microstep: 245.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:02,017] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.55 | bwd_microstep: 247.38 | bwd_inner_microstep: 247.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:59:02,491] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 1.55 | optimizer_step: 3.30 +[2024-12-31 17:59:02,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.24 | bwd_microstep: 274.68 | bwd_inner_microstep: 243.78 | bwd_allreduce_microstep: 30.83 | step_microstep: 11.99 +[2024-12-31 17:59:02,493] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2845.78 | bwd: 4237.27 | bwd_inner: 4205.58 | bwd_allreduce: 31.09 | step: 15.01 + 46%|████▌ | 350/759 [48:57<50:18, 7.38s/it] {'loss': 1.2347, 'learning_rate': 1.1741152270812155e-05, 'epoch': 0.46} + 46%|████▌ | 350/759 [48:57<50:18, 7.38s/it][2024-12-31 17:59:03,094] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 226.40 | bwd_microstep: 360.15 | bwd_inner_microstep: 359.77 | bwd_allreduce_microstep: 0.16 | step_microstep: 0.20 +[2024-12-31 17:59:03,668] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.55 | bwd_microstep: 341.04 | bwd_inner_microstep: 341.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:59:04,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.09 | bwd_microstep: 280.67 | bwd_inner_microstep: 280.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:59:04,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.47 | bwd_microstep: 258.88 | bwd_inner_microstep: 258.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:59:05,048] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.91 | bwd_microstep: 249.91 | bwd_inner_microstep: 249.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:59:05,489] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.78 | bwd_microstep: 250.16 | bwd_inner_microstep: 250.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:05,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.85 | bwd_microstep: 245.34 | bwd_inner_microstep: 245.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:59:06,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.44 | bwd_microstep: 300.88 | bwd_inner_microstep: 300.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:59:06,851] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.30 | bwd_microstep: 246.53 | bwd_inner_microstep: 246.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:59:07,276] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.43 | bwd_microstep: 245.22 | bwd_inner_microstep: 245.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:59:07,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.39 | bwd_microstep: 243.89 | bwd_inner_microstep: 243.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:59:08,138] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.27 | bwd_microstep: 244.42 | bwd_inner_microstep: 244.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:59:08,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.46 | bwd_microstep: 244.47 | bwd_inner_microstep: 244.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:09,013] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.01 | bwd_microstep: 251.84 | bwd_inner_microstep: 251.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:59:09,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.52 | bwd_microstep: 265.98 | bwd_inner_microstep: 265.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:59:09,911] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.59 | optimizer_gradients: 0.66 | optimizer_step: 3.42 +[2024-12-31 17:59:09,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.66 | bwd_microstep: 257.35 | bwd_inner_microstep: 243.62 | bwd_allreduce_microstep: 13.62 | step_microstep: 11.72 +[2024-12-31 17:59:09,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2867.98 | bwd: 4286.84 | bwd_inner: 4272.29 | bwd_allreduce: 13.92 | step: 14.19 + 46%|████▌ | 351/759 [49:04<50:16, 7.39s/it] {'loss': 1.2457, 'learning_rate': 1.1699103850286668e-05, 'epoch': 0.46} + 46%|████▌ | 351/759 [49:04<50:16, 7.39s/it][2024-12-31 17:59:10,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 205.45 | bwd_microstep: 316.89 | bwd_inner_microstep: 316.53 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 17:59:10,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.99 | bwd_microstep: 293.13 | bwd_inner_microstep: 293.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:59:11,445] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.23 | bwd_microstep: 282.33 | bwd_inner_microstep: 282.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:59:11,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.14 | bwd_microstep: 261.20 | bwd_inner_microstep: 261.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:12,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.97 | bwd_microstep: 258.14 | bwd_inner_microstep: 258.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:59:12,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.47 | bwd_microstep: 250.55 | bwd_inner_microstep: 250.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:13,238] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.05 | bwd_microstep: 248.05 | bwd_inner_microstep: 248.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:59:13,670] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.45 | bwd_microstep: 244.00 | bwd_inner_microstep: 243.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 17:59:14,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.28 | bwd_microstep: 245.70 | bwd_inner_microstep: 245.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:14,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.53 | bwd_microstep: 243.73 | bwd_inner_microstep: 243.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:59:14,977] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 250.48 | bwd_inner_microstep: 250.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:15,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.83 | bwd_microstep: 242.91 | bwd_inner_microstep: 242.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:15,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.82 | bwd_microstep: 244.44 | bwd_inner_microstep: 244.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:16,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.19 | bwd_microstep: 251.11 | bwd_inner_microstep: 251.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:59:16,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.32 | bwd_microstep: 241.35 | bwd_inner_microstep: 241.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:59:17,241] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.23 | optimizer_gradients: 0.62 | optimizer_step: 14.05 +[2024-12-31 17:59:17,242] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.86 | bwd_microstep: 321.06 | bwd_inner_microstep: 269.01 | bwd_allreduce_microstep: 52.00 | step_microstep: 24.48 +[2024-12-31 17:59:17,243] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2828.33 | bwd: 4195.19 | bwd_inner: 4142.27 | bwd_allreduce: 52.25 | step: 27.37 + 46%|████▋ | 352/759 [49:12<50:01, 7.37s/it] {'loss': 1.243, 'learning_rate': 1.1657024472439402e-05, 'epoch': 0.46} + 46%|████▋ | 352/759 [49:12<50:01, 7.37s/it][2024-12-31 17:59:17,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 236.25 | bwd_microstep: 387.93 | bwd_inner_microstep: 387.54 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 17:59:18,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 229.91 | bwd_microstep: 373.56 | bwd_inner_microstep: 373.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:18,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.21 | bwd_microstep: 262.73 | bwd_inner_microstep: 262.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:19,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.87 | bwd_microstep: 254.41 | bwd_inner_microstep: 254.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:59:19,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.58 | bwd_microstep: 248.27 | bwd_inner_microstep: 248.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:59:20,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.01 | bwd_microstep: 243.72 | bwd_inner_microstep: 243.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:59:20,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.62 | bwd_microstep: 244.51 | bwd_inner_microstep: 244.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:21,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.53 | bwd_microstep: 243.85 | bwd_inner_microstep: 243.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:59:21,617] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 244.24 | bwd_inner_microstep: 244.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:22,047] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.78 | bwd_microstep: 243.23 | bwd_inner_microstep: 243.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:22,478] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.43 | bwd_microstep: 244.61 | bwd_inner_microstep: 244.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:59:22,909] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.55 | bwd_microstep: 244.40 | bwd_inner_microstep: 244.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:59:23,336] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.24 | bwd_microstep: 241.23 | bwd_inner_microstep: 241.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:23,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.62 | bwd_microstep: 246.95 | bwd_inner_microstep: 246.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:24,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.81 | bwd_microstep: 257.30 | bwd_inner_microstep: 257.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:59:24,664] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.71 | optimizer_step: 3.35 +[2024-12-31 17:59:24,665] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.07 | bwd_microstep: 254.91 | bwd_inner_microstep: 241.19 | bwd_allreduce_microstep: 13.60 | step_microstep: 11.10 +[2024-12-31 17:59:24,667] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2901.22 | bwd: 4235.92 | bwd_inner: 4221.41 | bwd_allreduce: 13.91 | step: 14.09 + 47%|████▋ | 353/759 [49:19<50:00, 7.39s/it] {'loss': 1.2467, 'learning_rate': 1.1614914903947952e-05, 'epoch': 0.47} + 47%|████▋ | 353/759 [49:19<50:00, 7.39s/it][2024-12-31 17:59:25,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 212.66 | bwd_microstep: 335.31 | bwd_inner_microstep: 334.97 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 17:59:25,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.28 | bwd_microstep: 291.10 | bwd_inner_microstep: 291.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 17:59:26,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.47 | bwd_microstep: 268.67 | bwd_inner_microstep: 268.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:26,667] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.68 | bwd_microstep: 265.67 | bwd_inner_microstep: 265.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:59:27,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.28 | bwd_microstep: 255.18 | bwd_inner_microstep: 255.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:59:27,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.84 | bwd_microstep: 249.21 | bwd_inner_microstep: 249.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:59:27,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.72 | bwd_microstep: 250.93 | bwd_inner_microstep: 250.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:28,432] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.80 | bwd_microstep: 248.95 | bwd_inner_microstep: 248.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:59:28,868] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.16 | bwd_microstep: 246.53 | bwd_inner_microstep: 246.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:29,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.57 | bwd_microstep: 302.39 | bwd_inner_microstep: 302.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:29,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.54 | bwd_microstep: 244.75 | bwd_inner_microstep: 244.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:30,222] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.83 | bwd_microstep: 244.45 | bwd_inner_microstep: 244.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:30,653] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.42 | bwd_microstep: 243.73 | bwd_inner_microstep: 243.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:59:31,095] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 254.09 | bwd_inner_microstep: 254.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:31,528] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.60 | bwd_microstep: 244.36 | bwd_inner_microstep: 244.21 | bwd_allreduce_microstep: 0.03 | step_microstep: 0.19 +[2024-12-31 17:59:32,024] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 11.10 | optimizer_gradients: 13.05 | optimizer_step: 3.10 +[2024-12-31 17:59:32,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.53 | bwd_microstep: 280.49 | bwd_inner_microstep: 243.88 | bwd_allreduce_microstep: 36.56 | step_microstep: 29.76 +[2024-12-31 17:59:32,026] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2818.65 | bwd: 4225.86 | bwd_inner: 4188.38 | bwd_allreduce: 36.86 | step: 32.70 + 47%|████▋ | 354/759 [49:26<49:49, 7.38s/it] {'loss': 1.254, 'learning_rate': 1.157277591203996e-05, 'epoch': 0.47} + 47%|████▋ | 354/759 [49:26<49:49, 7.38s/it][2024-12-31 17:59:32,691] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 256.71 | bwd_microstep: 389.51 | bwd_inner_microstep: 389.17 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 17:59:33,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.33 | bwd_microstep: 290.18 | bwd_inner_microstep: 290.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:33,643] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.06 | bwd_microstep: 256.83 | bwd_inner_microstep: 256.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:59:34,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.33 | bwd_microstep: 263.45 | bwd_inner_microstep: 263.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:59:34,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.86 | bwd_microstep: 248.55 | bwd_inner_microstep: 248.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:59:34,972] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.79 | bwd_microstep: 247.66 | bwd_inner_microstep: 247.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:35,404] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 245.08 | bwd_inner_microstep: 245.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:59:35,837] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.69 | bwd_microstep: 244.69 | bwd_inner_microstep: 244.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:59:36,280] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.31 | bwd_microstep: 248.64 | bwd_inner_microstep: 248.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:36,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 243.32 | bwd_inner_microstep: 243.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:59:37,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.69 | bwd_microstep: 242.54 | bwd_inner_microstep: 242.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:59:37,582] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.85 | bwd_microstep: 254.07 | bwd_inner_microstep: 254.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 17:59:38,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.54 | bwd_microstep: 251.68 | bwd_inner_microstep: 251.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:59:38,463] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.98 | bwd_microstep: 244.66 | bwd_inner_microstep: 244.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:38,890] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.33 | bwd_microstep: 240.77 | bwd_inner_microstep: 240.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:59:39,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.14 | optimizer_gradients: 0.72 | optimizer_step: 3.29 +[2024-12-31 17:59:39,355] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.81 | bwd_microstep: 268.15 | bwd_inner_microstep: 254.55 | bwd_allreduce_microstep: 13.51 | step_microstep: 11.30 +[2024-12-31 17:59:39,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2862.87 | bwd: 4179.85 | bwd_inner: 4165.52 | bwd_allreduce: 13.78 | step: 14.00 + 47%|████▋ | 355/759 [49:34<49:35, 7.37s/it] {'loss': 1.242, 'learning_rate': 1.153060826447918e-05, 'epoch': 0.47} + 47%|████▋ | 355/759 [49:34<49:35, 7.37s/it][2024-12-31 17:59:39,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.30 | bwd_microstep: 314.14 | bwd_inner_microstep: 313.79 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 17:59:40,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.08 | bwd_microstep: 312.91 | bwd_inner_microstep: 312.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:40,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.77 | bwd_microstep: 256.36 | bwd_inner_microstep: 256.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:41,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.25 | bwd_microstep: 250.21 | bwd_inner_microstep: 250.00 | bwd_allreduce_microstep: 0.06 | step_microstep: 0.27 +[2024-12-31 17:59:41,754] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.42 | bwd_microstep: 247.80 | bwd_inner_microstep: 247.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:59:42,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.51 | bwd_microstep: 252.27 | bwd_inner_microstep: 252.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:59:42,638] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.20 | bwd_microstep: 253.98 | bwd_inner_microstep: 253.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:43,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.63 | bwd_microstep: 244.91 | bwd_inner_microstep: 244.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:59:43,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.46 | bwd_microstep: 245.33 | bwd_inner_microstep: 245.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:43,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.11 | bwd_microstep: 247.27 | bwd_inner_microstep: 247.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:44,373] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.69 | bwd_microstep: 245.84 | bwd_inner_microstep: 245.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:44,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.19 | bwd_microstep: 243.65 | bwd_inner_microstep: 243.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:59:45,236] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.86 | bwd_microstep: 243.26 | bwd_inner_microstep: 243.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:45,666] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.19 | bwd_microstep: 243.31 | bwd_inner_microstep: 243.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 17:59:46,094] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.61 | bwd_microstep: 244.71 | bwd_inner_microstep: 244.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 17:59:46,583] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.39 | optimizer_gradients: 0.60 | optimizer_step: 3.09 +[2024-12-31 17:59:46,584] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.56 | bwd_microstep: 297.47 | bwd_inner_microstep: 243.69 | bwd_allreduce_microstep: 53.73 | step_microstep: 11.49 +[2024-12-31 17:59:46,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2805.23 | bwd: 4143.62 | bwd_inner: 4088.79 | bwd_allreduce: 54.10 | step: 14.31 + 47%|████▋ | 356/759 [49:41<49:11, 7.32s/it] {'loss': 1.2393, 'learning_rate': 1.1488412729551449e-05, 'epoch': 0.47} + 47%|████▋ | 356/759 [49:41<49:11, 7.32s/it][2024-12-31 17:59:47,109] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.43 | bwd_microstep: 308.01 | bwd_inner_microstep: 307.64 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 17:59:47,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.35 | bwd_microstep: 290.04 | bwd_inner_microstep: 289.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.29 +[2024-12-31 17:59:48,102] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.25 | bwd_microstep: 280.72 | bwd_inner_microstep: 280.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 17:59:48,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.58 | bwd_microstep: 280.78 | bwd_inner_microstep: 280.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 17:59:49,036] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.89 | bwd_microstep: 262.04 | bwd_inner_microstep: 262.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 17:59:49,487] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.70 | bwd_microstep: 256.52 | bwd_inner_microstep: 256.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:49,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.02 | bwd_microstep: 248.57 | bwd_inner_microstep: 248.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:59:50,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.79 | bwd_microstep: 267.00 | bwd_inner_microstep: 266.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 17:59:50,815] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.35 | bwd_microstep: 246.15 | bwd_inner_microstep: 246.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 17:59:51,246] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.89 | bwd_microstep: 243.52 | bwd_inner_microstep: 243.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:51,677] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.55 | bwd_microstep: 245.47 | bwd_inner_microstep: 245.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 17:59:52,098] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.92 | bwd_microstep: 244.00 | bwd_inner_microstep: 243.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:59:52,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.32 | bwd_microstep: 242.98 | bwd_inner_microstep: 242.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 17:59:52,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.67 | bwd_microstep: 250.35 | bwd_inner_microstep: 250.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:53,374] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 244.97 | bwd_inner_microstep: 244.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:59:53,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.60 | optimizer_gradients: 0.63 | optimizer_step: 7.80 +[2024-12-31 17:59:53,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 157.77 | bwd_microstep: 295.33 | bwd_inner_microstep: 227.28 | bwd_allreduce_microstep: 68.00 | step_microstep: 28.15 +[2024-12-31 17:59:53,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2805.76 | bwd: 4206.60 | bwd_inner: 4137.60 | bwd_allreduce: 68.29 | step: 30.66 + 47%|████▋ | 357/759 [49:48<48:59, 7.31s/it] {'loss': 1.2137, 'learning_rate': 1.144619007605071e-05, 'epoch': 0.47} + 47%|████▋ | 357/759 [49:48<48:59, 7.31s/it][2024-12-31 17:59:54,475] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 225.41 | bwd_microstep: 363.81 | bwd_inner_microstep: 363.45 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.22 +[2024-12-31 17:59:54,977] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.16 | bwd_microstep: 291.72 | bwd_inner_microstep: 291.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:55,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.50 | bwd_microstep: 285.51 | bwd_inner_microstep: 285.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:55,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 184.17 | bwd_microstep: 266.80 | bwd_inner_microstep: 266.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:56,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.05 | bwd_microstep: 256.80 | bwd_inner_microstep: 256.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 17:59:56,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.44 | bwd_microstep: 257.62 | bwd_inner_microstep: 257.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 17:59:57,287] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.12 | bwd_microstep: 247.90 | bwd_inner_microstep: 247.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:59:57,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.19 | bwd_microstep: 245.72 | bwd_inner_microstep: 245.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:58,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.64 | bwd_microstep: 271.06 | bwd_inner_microstep: 271.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 17:59:58,613] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.64 | bwd_microstep: 244.62 | bwd_inner_microstep: 244.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 17:59:59,044] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 244.80 | bwd_inner_microstep: 244.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 17:59:59,489] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.51 | bwd_microstep: 244.69 | bwd_inner_microstep: 244.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 17:59:59,921] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.39 | bwd_microstep: 245.46 | bwd_inner_microstep: 245.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:00:00,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.89 | bwd_microstep: 260.71 | bwd_inner_microstep: 260.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:00:00,798] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.37 | bwd_microstep: 243.04 | bwd_inner_microstep: 243.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:01,252] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.27 | optimizer_gradients: 0.80 | optimizer_step: 3.28 +[2024-12-31 18:00:01,253] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.41 | bwd_microstep: 256.83 | bwd_inner_microstep: 243.12 | bwd_allreduce_microstep: 13.59 | step_microstep: 11.76 +[2024-12-31 18:00:01,255] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2870.39 | bwd: 4227.19 | bwd_inner: 4212.70 | bwd_allreduce: 13.88 | step: 14.72 + 47%|████▋ | 358/759 [49:56<49:01, 7.33s/it] {'loss': 1.2508, 'learning_rate': 1.1403941073265014e-05, 'epoch': 0.47} + 47%|████▋ | 358/759 [49:56<49:01, 7.33s/it][2024-12-31 18:00:01,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 198.82 | bwd_microstep: 304.46 | bwd_inner_microstep: 304.11 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.22 +[2024-12-31 18:00:02,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 200.78 | bwd_microstep: 311.58 | bwd_inner_microstep: 311.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:00:02,776] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.65 | bwd_microstep: 272.38 | bwd_inner_microstep: 271.23 | bwd_allreduce_microstep: 0.48 | step_microstep: 0.44 +[2024-12-31 18:00:03,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.40 | bwd_microstep: 257.85 | bwd_inner_microstep: 257.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:03,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 186.93 | bwd_microstep: 247.22 | bwd_inner_microstep: 247.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:04,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.07 | bwd_microstep: 247.93 | bwd_inner_microstep: 247.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:04,551] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.15 | bwd_microstep: 246.08 | bwd_inner_microstep: 246.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:04,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.36 | bwd_microstep: 254.92 | bwd_inner_microstep: 254.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:05,428] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.18 | bwd_microstep: 244.99 | bwd_inner_microstep: 244.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:05,859] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.64 | bwd_microstep: 243.39 | bwd_inner_microstep: 243.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:00:06,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.59 | bwd_microstep: 244.76 | bwd_inner_microstep: 244.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:00:06,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.17 | bwd_microstep: 242.43 | bwd_inner_microstep: 242.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:00:07,164] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.21 | bwd_microstep: 244.84 | bwd_inner_microstep: 244.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:07,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.83 | bwd_microstep: 241.27 | bwd_inner_microstep: 241.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:08,017] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.85 | bwd_microstep: 240.93 | bwd_inner_microstep: 240.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:00:08,531] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 7.14 | optimizer_gradients: 0.64 | optimizer_step: 11.32 +[2024-12-31 18:00:08,531] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.00 | bwd_microstep: 308.73 | bwd_inner_microstep: 243.83 | bwd_allreduce_microstep: 64.84 | step_microstep: 21.16 +[2024-12-31 18:00:08,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2826.06 | bwd: 4153.92 | bwd_inner: 4087.14 | bwd_allreduce: 65.62 | step: 24.51 + 47%|████▋ | 359/759 [50:03<48:48, 7.32s/it] {'loss': 1.2404, 'learning_rate': 1.1361666490962468e-05, 'epoch': 0.47} + 47%|████▋ | 359/759 [50:03<48:48, 7.32s/it][2024-12-31 18:00:09,122] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.06 | bwd_microstep: 337.78 | bwd_inner_microstep: 337.42 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:00:09,617] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.95 | bwd_microstep: 287.20 | bwd_inner_microstep: 287.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:00:10,076] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.42 | bwd_microstep: 262.78 | bwd_inner_microstep: 262.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:00:10,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.06 | bwd_microstep: 250.53 | bwd_inner_microstep: 250.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:00:10,948] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.72 | bwd_microstep: 248.69 | bwd_inner_microstep: 248.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:00:11,386] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.75 | bwd_microstep: 248.02 | bwd_inner_microstep: 247.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.30 +[2024-12-31 18:00:11,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.50 | bwd_microstep: 246.70 | bwd_inner_microstep: 246.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:12,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 243.81 | bwd_inner_microstep: 243.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:12,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.49 | bwd_microstep: 245.75 | bwd_inner_microstep: 245.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:00:13,123] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.60 | bwd_microstep: 246.10 | bwd_inner_microstep: 246.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:13,560] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.83 | bwd_microstep: 243.02 | bwd_inner_microstep: 242.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:13,992] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.20 | bwd_microstep: 243.09 | bwd_inner_microstep: 243.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:00:14,419] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.62 | bwd_microstep: 240.93 | bwd_inner_microstep: 240.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:00:14,838] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.89 | bwd_microstep: 240.78 | bwd_inner_microstep: 240.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:00:15,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.32 | bwd_microstep: 240.62 | bwd_inner_microstep: 240.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:00:15,770] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.59 | optimizer_gradients: 0.83 | optimizer_step: 3.14 +[2024-12-31 18:00:15,770] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.64 | bwd_microstep: 310.27 | bwd_inner_microstep: 241.49 | bwd_allreduce_microstep: 68.73 | step_microstep: 10.69 +[2024-12-31 18:00:15,771] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2805.72 | bwd: 4136.23 | bwd_inner: 4066.42 | bwd_allreduce: 69.05 | step: 13.46 + 47%|████▋ | 360/759 [50:10<48:29, 7.29s/it] {'loss': 1.2699, 'learning_rate': 1.1319367099377248e-05, 'epoch': 0.47} + 47%|████▋ | 360/759 [50:10<48:29, 7.29s/it][2024-12-31 18:00:16,382] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 281.30 | bwd_microstep: 314.48 | bwd_inner_microstep: 314.14 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:00:16,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.12 | bwd_microstep: 282.77 | bwd_inner_microstep: 282.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:00:17,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.90 | bwd_microstep: 276.75 | bwd_inner_microstep: 276.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:00:17,794] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.17 | bwd_microstep: 254.34 | bwd_inner_microstep: 254.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.11 +[2024-12-31 18:00:18,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.60 | bwd_microstep: 249.24 | bwd_inner_microstep: 249.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:00:18,668] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.26 | bwd_microstep: 247.14 | bwd_inner_microstep: 247.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:00:19,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.46 | bwd_microstep: 247.99 | bwd_inner_microstep: 247.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:19,540] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 245.43 | bwd_inner_microstep: 245.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.10 +[2024-12-31 18:00:19,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.36 | bwd_microstep: 245.15 | bwd_inner_microstep: 245.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:00:20,396] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 243.51 | bwd_inner_microstep: 243.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:00:20,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.30 | bwd_microstep: 242.53 | bwd_inner_microstep: 242.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:21,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 242.84 | bwd_inner_microstep: 242.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:00:21,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.76 | bwd_microstep: 240.69 | bwd_inner_microstep: 240.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:00:22,120] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.78 | bwd_microstep: 243.76 | bwd_inner_microstep: 243.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:22,548] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.51 | bwd_microstep: 241.20 | bwd_inner_microstep: 241.03 | bwd_allreduce_microstep: 0.02 | step_microstep: 0.26 +[2024-12-31 18:00:23,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.73 | optimizer_gradients: 0.64 | optimizer_step: 3.11 +[2024-12-31 18:00:23,291] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.85 | bwd_microstep: 538.86 | bwd_inner_microstep: 243.42 | bwd_allreduce_microstep: 295.39 | step_microstep: 17.22 +[2024-12-31 18:00:23,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2875.47 | bwd: 4356.78 | bwd_inner: 4060.43 | bwd_allreduce: 295.72 | step: 20.12 + 48%|████▊ | 361/759 [50:18<48:49, 7.36s/it] {'loss': 1.2087, 'learning_rate': 1.1277043669195549e-05, 'epoch': 0.48} + 48%|████▊ | 361/759 [50:18<48:49, 7.36s/it][2024-12-31 18:00:23,949] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 278.80 | bwd_microstep: 357.61 | bwd_inner_microstep: 357.09 | bwd_allreduce_microstep: 0.20 | step_microstep: 0.36 +[2024-12-31 18:00:24,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.69 | bwd_microstep: 291.40 | bwd_inner_microstep: 291.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:24,903] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.68 | bwd_microstep: 256.94 | bwd_inner_microstep: 256.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:00:25,340] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.96 | bwd_microstep: 248.07 | bwd_inner_microstep: 248.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:25,776] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.95 | bwd_microstep: 247.50 | bwd_inner_microstep: 247.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:00:26,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.99 | bwd_microstep: 246.56 | bwd_inner_microstep: 246.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:00:26,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.55 | bwd_microstep: 243.99 | bwd_inner_microstep: 243.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:27,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.49 | bwd_microstep: 245.26 | bwd_inner_microstep: 245.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:27,530] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.10 | bwd_microstep: 242.98 | bwd_inner_microstep: 242.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:00:27,960] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.07 | bwd_microstep: 242.40 | bwd_inner_microstep: 242.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:00:28,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.68 | bwd_microstep: 242.07 | bwd_inner_microstep: 242.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:00:28,813] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.52 | bwd_microstep: 241.00 | bwd_inner_microstep: 240.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:00:29,238] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.45 | bwd_microstep: 240.62 | bwd_inner_microstep: 240.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:00:29,665] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.26 | bwd_microstep: 240.61 | bwd_inner_microstep: 240.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:30,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.96 | bwd_microstep: 255.56 | bwd_inner_microstep: 255.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:00:30,554] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.82 | optimizer_gradients: 0.72 | optimizer_step: 3.29 +[2024-12-31 18:00:30,554] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.74 | bwd_microstep: 255.96 | bwd_inner_microstep: 241.15 | bwd_allreduce_microstep: 14.72 | step_microstep: 11.48 +[2024-12-31 18:00:30,556] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2862.30 | bwd: 4098.65 | bwd_inner: 4082.89 | bwd_allreduce: 15.06 | step: 14.55 + 48%|████▊ | 362/759 [50:25<48:30, 7.33s/it] {'loss': 1.2189, 'learning_rate': 1.1234696971541534e-05, 'epoch': 0.48} + 48%|████▊ | 362/759 [50:25<48:30, 7.33s/it][2024-12-31 18:00:31,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 225.29 | bwd_microstep: 352.34 | bwd_inner_microstep: 351.98 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 18:00:31,749] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 223.33 | bwd_microstep: 357.36 | bwd_inner_microstep: 357.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:32,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.48 | bwd_microstep: 266.09 | bwd_inner_microstep: 266.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.11 +[2024-12-31 18:00:32,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.43 | bwd_microstep: 267.81 | bwd_inner_microstep: 267.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:33,198] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.62 | bwd_microstep: 320.19 | bwd_inner_microstep: 320.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:00:33,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.68 | bwd_microstep: 247.96 | bwd_inner_microstep: 247.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:34,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.85 | bwd_microstep: 247.35 | bwd_inner_microstep: 247.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:00:34,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.65 | bwd_microstep: 257.87 | bwd_inner_microstep: 257.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:00:34,951] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.86 | bwd_microstep: 246.52 | bwd_inner_microstep: 246.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:00:35,384] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.65 | bwd_microstep: 245.60 | bwd_inner_microstep: 245.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:35,825] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.18 | bwd_microstep: 243.70 | bwd_inner_microstep: 243.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:36,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 243.09 | bwd_inner_microstep: 243.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:00:36,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.66 | bwd_microstep: 243.89 | bwd_inner_microstep: 243.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:00:37,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 242.69 | bwd_inner_microstep: 242.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:00:37,541] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.53 | bwd_microstep: 242.61 | bwd_inner_microstep: 242.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:00:38,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.15 | optimizer_gradients: 0.69 | optimizer_step: 3.20 +[2024-12-31 18:00:38,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.15 | bwd_microstep: 266.57 | bwd_inner_microstep: 243.82 | bwd_allreduce_microstep: 22.65 | step_microstep: 12.66 +[2024-12-31 18:00:38,010] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2879.61 | bwd: 4291.79 | bwd_inner: 4268.09 | bwd_allreduce: 22.97 | step: 15.41 + 48%|████▊ | 363/759 [50:32<48:37, 7.37s/it] {'loss': 1.2292, 'learning_rate': 1.1192327777963313e-05, 'epoch': 0.48} + 48%|████▊ | 363/759 [50:32<48:37, 7.37s/it][2024-12-31 18:00:38,545] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.07 | bwd_microstep: 315.87 | bwd_inner_microstep: 315.36 | bwd_allreduce_microstep: 0.20 | step_microstep: 0.27 +[2024-12-31 18:00:39,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.96 | bwd_microstep: 286.83 | bwd_inner_microstep: 286.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:00:39,529] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.85 | bwd_microstep: 284.49 | bwd_inner_microstep: 284.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:39,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.64 | bwd_microstep: 256.09 | bwd_inner_microstep: 256.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:00:40,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.34 | bwd_microstep: 247.69 | bwd_inner_microstep: 247.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:00:40,858] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.16 | bwd_microstep: 248.28 | bwd_inner_microstep: 248.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:41,291] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.70 | bwd_microstep: 244.99 | bwd_inner_microstep: 244.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:00:41,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.08 | bwd_microstep: 248.05 | bwd_inner_microstep: 248.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:42,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.22 | bwd_microstep: 258.11 | bwd_inner_microstep: 258.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:00:42,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.84 | bwd_microstep: 245.03 | bwd_inner_microstep: 245.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:00:43,048] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.68 | bwd_microstep: 243.97 | bwd_inner_microstep: 243.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:00:43,483] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.08 | bwd_microstep: 244.75 | bwd_inner_microstep: 244.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:00:43,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.71 | bwd_microstep: 243.29 | bwd_inner_microstep: 243.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:44,348] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.54 | bwd_microstep: 244.74 | bwd_inner_microstep: 244.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:44,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.16 | bwd_microstep: 241.25 | bwd_inner_microstep: 241.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:00:45,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 28.14 | optimizer_gradients: 10.00 | optimizer_step: 57.00 +[2024-12-31 18:00:45,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.57 | bwd_microstep: 521.58 | bwd_inner_microstep: 244.06 | bwd_allreduce_microstep: 277.48 | step_microstep: 97.67 +[2024-12-31 18:00:45,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2819.02 | bwd: 4375.17 | bwd_inner: 4096.66 | bwd_allreduce: 277.78 | step: 100.98 + 48%|████▊ | 364/759 [50:40<48:57, 7.44s/it] {'loss': 1.2429, 'learning_rate': 1.1149936860418846e-05, 'epoch': 0.48} + 48%|████▊ | 364/759 [50:40<48:57, 7.44s/it][2024-12-31 18:00:46,361] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 274.66 | bwd_microstep: 467.88 | bwd_inner_microstep: 467.53 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:00:46,888] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.71 | bwd_microstep: 307.09 | bwd_inner_microstep: 307.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:47,382] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.66 | bwd_microstep: 286.04 | bwd_inner_microstep: 286.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:47,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.91 | bwd_microstep: 254.86 | bwd_inner_microstep: 254.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:00:48,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.25 | bwd_microstep: 249.79 | bwd_inner_microstep: 249.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:48,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.27 | bwd_microstep: 255.40 | bwd_inner_microstep: 255.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:00:49,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.57 | bwd_microstep: 246.05 | bwd_inner_microstep: 246.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:00:49,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.90 | bwd_microstep: 247.46 | bwd_inner_microstep: 247.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:00:50,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.86 | bwd_microstep: 243.72 | bwd_inner_microstep: 243.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:00:50,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.50 | bwd_microstep: 244.51 | bwd_inner_microstep: 244.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:50,888] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.41 | bwd_microstep: 244.71 | bwd_inner_microstep: 244.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:51,334] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.21 | bwd_microstep: 257.27 | bwd_inner_microstep: 256.93 | bwd_allreduce_microstep: 0.23 | step_microstep: 0.27 +[2024-12-31 18:00:51,766] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.97 | bwd_microstep: 242.99 | bwd_inner_microstep: 242.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:00:52,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.20 | bwd_microstep: 263.04 | bwd_inner_microstep: 262.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:00:52,641] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.62 | bwd_microstep: 242.51 | bwd_inner_microstep: 242.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:00:53,182] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.65 | optimizer_gradients: 0.96 | optimizer_step: 3.14 +[2024-12-31 18:00:53,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.25 | bwd_microstep: 344.33 | bwd_inner_microstep: 243.29 | bwd_allreduce_microstep: 100.99 | step_microstep: 12.11 +[2024-12-31 18:00:53,184] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2907.43 | bwd: 4397.76 | bwd_inner: 4295.60 | bwd_allreduce: 101.46 | step: 14.86 + 48%|████▊ | 365/759 [50:48<49:06, 7.48s/it] {'loss': 1.2296, 'learning_rate': 1.1107524991261913e-05, 'epoch': 0.48} + 48%|████▊ | 365/759 [50:48<49:06, 7.48s/it][2024-12-31 18:00:53,754] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.50 | bwd_microstep: 337.57 | bwd_inner_microstep: 337.21 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 18:00:54,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 232.71 | bwd_microstep: 371.49 | bwd_inner_microstep: 371.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:00:54,827] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.58 | bwd_microstep: 257.07 | bwd_inner_microstep: 257.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:00:55,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.87 | bwd_microstep: 257.58 | bwd_inner_microstep: 257.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:00:55,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.29 | bwd_microstep: 251.74 | bwd_inner_microstep: 251.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:00:56,156] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.25 | bwd_microstep: 261.22 | bwd_inner_microstep: 261.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:00:56,588] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.23 | bwd_microstep: 247.83 | bwd_inner_microstep: 247.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:00:57,022] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.92 | bwd_microstep: 249.35 | bwd_inner_microstep: 249.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:00:57,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.63 | bwd_microstep: 245.40 | bwd_inner_microstep: 245.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:00:57,882] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.60 | bwd_microstep: 245.80 | bwd_inner_microstep: 245.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:00:58,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 244.09 | bwd_inner_microstep: 244.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:00:58,740] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.59 | bwd_microstep: 244.37 | bwd_inner_microstep: 244.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:00:59,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 242.95 | bwd_inner_microstep: 242.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:00:59,582] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.10 | bwd_microstep: 241.18 | bwd_inner_microstep: 241.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:00:59,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.86 | bwd_microstep: 242.29 | bwd_inner_microstep: 242.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:01:00,488] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.89 | optimizer_gradients: 0.71 | optimizer_step: 3.29 +[2024-12-31 18:01:00,488] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.86 | bwd_microstep: 255.12 | bwd_inner_microstep: 241.47 | bwd_allreduce_microstep: 13.55 | step_microstep: 11.29 +[2024-12-31 18:01:00,490] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2900.63 | bwd: 4195.11 | bwd_inner: 4180.71 | bwd_allreduce: 13.84 | step: 12.49 + 48%|████▊ | 366/759 [50:55<48:39, 7.43s/it] {'loss': 1.2658, 'learning_rate': 1.1065092943228024e-05, 'epoch': 0.48} + 48%|████▊ | 366/759 [50:55<48:39, 7.43s/it][2024-12-31 18:01:01,024] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.66 | bwd_microstep: 315.59 | bwd_inner_microstep: 315.22 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:01:01,553] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.20 | bwd_microstep: 310.96 | bwd_inner_microstep: 310.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:01:02,021] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.71 | bwd_microstep: 267.56 | bwd_inner_microstep: 267.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:01:02,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.56 | bwd_microstep: 256.39 | bwd_inner_microstep: 256.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:01:02,919] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 250.80 | bwd_inner_microstep: 250.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:01:03,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.44 | bwd_microstep: 249.86 | bwd_inner_microstep: 249.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:01:03,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.08 | bwd_microstep: 254.87 | bwd_inner_microstep: 254.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:04,238] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.71 | bwd_microstep: 248.09 | bwd_inner_microstep: 248.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:04,670] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.55 | bwd_microstep: 244.68 | bwd_inner_microstep: 244.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:01:05,112] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.67 | bwd_microstep: 252.44 | bwd_inner_microstep: 252.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:05,547] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.34 | bwd_microstep: 245.65 | bwd_inner_microstep: 245.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:01:05,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.55 | bwd_microstep: 243.76 | bwd_inner_microstep: 243.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:06,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.59 | bwd_microstep: 247.58 | bwd_inner_microstep: 247.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:01:06,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 244.87 | bwd_inner_microstep: 244.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:07,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.68 | bwd_microstep: 241.24 | bwd_inner_microstep: 241.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:01:07,812] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.34 | optimizer_gradients: 0.63 | optimizer_step: 3.61 +[2024-12-31 18:01:07,812] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.96 | bwd_microstep: 349.20 | bwd_inner_microstep: 246.57 | bwd_allreduce_microstep: 102.59 | step_microstep: 12.71 +[2024-12-31 18:01:07,814] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2824.88 | bwd: 4223.60 | bwd_inner: 4120.23 | bwd_allreduce: 102.85 | step: 15.49 + 48%|████▊ | 367/759 [51:02<48:19, 7.40s/it] {'loss': 1.2367, 'learning_rate': 1.1022641489420342e-05, 'epoch': 0.48} + 48%|████▊ | 367/759 [51:02<48:19, 7.40s/it][2024-12-31 18:01:08,463] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 258.07 | bwd_microstep: 370.60 | bwd_inner_microstep: 370.26 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:01:08,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.06 | bwd_microstep: 292.63 | bwd_inner_microstep: 292.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:01:09,425] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.03 | bwd_microstep: 268.22 | bwd_inner_microstep: 268.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:01:09,883] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.91 | bwd_microstep: 262.19 | bwd_inner_microstep: 262.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:01:10,323] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 249.40 | bwd_inner_microstep: 249.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:01:10,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.49 | bwd_microstep: 248.32 | bwd_inner_microstep: 248.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:11,200] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.08 | bwd_microstep: 245.95 | bwd_inner_microstep: 245.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:11,637] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.36 | bwd_microstep: 247.57 | bwd_inner_microstep: 247.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:01:12,081] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.26 | bwd_microstep: 245.75 | bwd_inner_microstep: 245.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:01:12,610] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.51 | bwd_microstep: 339.72 | bwd_inner_microstep: 339.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:01:13,042] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.03 | bwd_microstep: 243.81 | bwd_inner_microstep: 243.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:01:13,475] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.22 | bwd_microstep: 245.15 | bwd_inner_microstep: 245.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:13,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.14 | bwd_microstep: 245.37 | bwd_inner_microstep: 245.16 | bwd_allreduce_microstep: 0.09 | step_microstep: 0.19 +[2024-12-31 18:01:14,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.60 | bwd_microstep: 242.95 | bwd_inner_microstep: 242.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:01:14,761] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.74 | bwd_microstep: 241.35 | bwd_inner_microstep: 241.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:15,261] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.07 | optimizer_gradients: 0.55 | optimizer_step: 3.10 +[2024-12-31 18:01:15,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.30 | bwd_microstep: 302.41 | bwd_inner_microstep: 244.36 | bwd_allreduce_microstep: 58.01 | step_microstep: 12.46 +[2024-12-31 18:01:15,263] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2860.45 | bwd: 4291.50 | bwd_inner: 4232.44 | bwd_allreduce: 58.38 | step: 15.17 + 48%|████▊ | 368/759 [51:10<48:17, 7.41s/it] {'loss': 1.2268, 'learning_rate': 1.098017140329561e-05, 'epoch': 0.48} + 48%|████▊ | 368/759 [51:10<48:17, 7.41s/it][2024-12-31 18:01:15,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 219.74 | bwd_microstep: 349.12 | bwd_inner_microstep: 348.74 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.24 +[2024-12-31 18:01:16,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.67 | bwd_microstep: 292.63 | bwd_inner_microstep: 292.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:01:16,794] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.01 | bwd_microstep: 259.67 | bwd_inner_microstep: 259.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:17,246] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.39 | bwd_microstep: 256.24 | bwd_inner_microstep: 256.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:01:17,685] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.57 | bwd_microstep: 248.95 | bwd_inner_microstep: 248.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:01:18,130] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.21 | bwd_microstep: 256.66 | bwd_inner_microstep: 256.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:01:18,566] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.75 | bwd_microstep: 246.63 | bwd_inner_microstep: 246.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:01:19,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.57 | bwd_microstep: 247.16 | bwd_inner_microstep: 247.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 18:01:19,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.03 | bwd_microstep: 259.02 | bwd_inner_microstep: 258.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.28 +[2024-12-31 18:01:19,887] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.18 | bwd_microstep: 244.64 | bwd_inner_microstep: 244.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:20,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.06 | bwd_microstep: 270.87 | bwd_inner_microstep: 270.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:01:20,782] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.01 | bwd_microstep: 243.81 | bwd_inner_microstep: 243.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:21,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.03 | bwd_microstep: 244.34 | bwd_inner_microstep: 244.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:01:21,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.31 | bwd_microstep: 243.51 | bwd_inner_microstep: 243.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:01:22,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.53 | bwd_microstep: 241.40 | bwd_inner_microstep: 241.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:22,566] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 8.20 | optimizer_gradients: 0.62 | optimizer_step: 3.12 +[2024-12-31 18:01:22,567] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.16 | bwd_microstep: 280.43 | bwd_inner_microstep: 240.59 | bwd_allreduce_microstep: 39.79 | step_microstep: 30.20 +[2024-12-31 18:01:22,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2823.66 | bwd: 4185.27 | bwd_inner: 4144.47 | bwd_allreduce: 40.06 | step: 33.23 + 49%|████▊ | 369/759 [51:17<47:58, 7.38s/it] {'loss': 1.2717, 'learning_rate': 1.0937683458650029e-05, 'epoch': 0.49} + 49%|████▊ | 369/759 [51:17<47:58, 7.38s/it][2024-12-31 18:01:23,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.49 | bwd_microstep: 318.97 | bwd_inner_microstep: 318.62 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:01:23,603] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.43 | bwd_microstep: 287.56 | bwd_inner_microstep: 287.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:01:24,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.61 | bwd_microstep: 267.19 | bwd_inner_microstep: 267.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:01:24,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.13 | bwd_microstep: 249.07 | bwd_inner_microstep: 249.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:01:24,947] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.14 | bwd_microstep: 249.00 | bwd_inner_microstep: 248.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:25,384] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.96 | bwd_microstep: 248.57 | bwd_inner_microstep: 248.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:25,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.73 | bwd_microstep: 245.21 | bwd_inner_microstep: 245.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:01:26,252] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.88 | bwd_microstep: 245.86 | bwd_inner_microstep: 245.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:01:26,677] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.21 | bwd_microstep: 244.02 | bwd_inner_microstep: 243.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:27,112] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.54 | bwd_microstep: 247.83 | bwd_inner_microstep: 247.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:27,551] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.74 | bwd_microstep: 249.58 | bwd_inner_microstep: 249.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:01:27,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.91 | bwd_microstep: 242.30 | bwd_inner_microstep: 242.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:28,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.88 | bwd_microstep: 244.43 | bwd_inner_microstep: 244.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:01:28,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.22 | bwd_microstep: 243.11 | bwd_inner_microstep: 243.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:01:29,278] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.38 | bwd_microstep: 246.17 | bwd_inner_microstep: 246.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:29,822] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.63 | optimizer_gradients: 0.99 | optimizer_step: 3.10 +[2024-12-31 18:01:29,823] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.34 | bwd_microstep: 346.10 | bwd_inner_microstep: 243.45 | bwd_allreduce_microstep: 102.60 | step_microstep: 10.82 +[2024-12-31 18:01:29,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2790.00 | bwd: 4175.10 | bwd_inner: 4071.66 | bwd_allreduce: 102.85 | step: 13.68 + 49%|████▊ | 370/759 [51:24<47:36, 7.34s/it] {'loss': 1.2409, 'learning_rate': 1.0895178429605189e-05, 'epoch': 0.49} + 49%|████▊ | 370/759 [51:24<47:36, 7.34s/it][2024-12-31 18:01:30,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 204.47 | bwd_microstep: 314.45 | bwd_inner_microstep: 314.07 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:01:30,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 198.27 | bwd_microstep: 302.60 | bwd_inner_microstep: 302.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:01:31,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 198.97 | bwd_microstep: 269.88 | bwd_inner_microstep: 269.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:01:31,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.32 | bwd_microstep: 257.48 | bwd_inner_microstep: 257.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:32,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.47 | bwd_microstep: 249.12 | bwd_inner_microstep: 249.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:32,698] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.33 | bwd_microstep: 250.34 | bwd_inner_microstep: 250.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:33,134] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.65 | bwd_microstep: 247.42 | bwd_inner_microstep: 247.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:01:33,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 246.29 | bwd_inner_microstep: 246.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:01:34,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.51 | bwd_microstep: 244.61 | bwd_inner_microstep: 244.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:34,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.99 | bwd_microstep: 245.01 | bwd_inner_microstep: 244.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:01:34,866] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 243.11 | bwd_inner_microstep: 243.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:35,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.34 | bwd_microstep: 242.71 | bwd_inner_microstep: 242.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:35,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.51 | bwd_microstep: 244.19 | bwd_inner_microstep: 244.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:01:36,171] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.10 | bwd_microstep: 254.37 | bwd_inner_microstep: 254.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:01:36,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 243.50 | bwd_inner_microstep: 243.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:37,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.20 | optimizer_gradients: 0.61 | optimizer_step: 3.08 +[2024-12-31 18:01:37,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.34 | bwd_microstep: 378.17 | bwd_inner_microstep: 244.57 | bwd_allreduce_microstep: 133.55 | step_microstep: 10.99 +[2024-12-31 18:01:37,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2826.43 | bwd: 4233.30 | bwd_inner: 4098.96 | bwd_allreduce: 133.81 | step: 13.84 + 49%|████▉ | 371/759 [51:32<47:29, 7.35s/it] {'loss': 1.2242, 'learning_rate': 1.0852657090593961e-05, 'epoch': 0.49} + 49%|████▉ | 371/759 [51:32<47:29, 7.35s/it][2024-12-31 18:01:37,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 282.76 | bwd_microstep: 305.11 | bwd_inner_microstep: 304.79 | bwd_allreduce_microstep: 0.12 | step_microstep: 0.20 +[2024-12-31 18:01:38,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 194.95 | bwd_microstep: 293.19 | bwd_inner_microstep: 293.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:38,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.85 | bwd_microstep: 285.57 | bwd_inner_microstep: 285.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:01:39,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.91 | bwd_microstep: 260.80 | bwd_inner_microstep: 260.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:01:39,661] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.86 | bwd_microstep: 248.40 | bwd_inner_microstep: 248.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:01:40,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.77 | bwd_microstep: 248.99 | bwd_inner_microstep: 248.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:01:40,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.11 | bwd_microstep: 246.44 | bwd_inner_microstep: 246.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:01:40,970] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.81 | bwd_microstep: 247.17 | bwd_inner_microstep: 247.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:01:41,405] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.05 | bwd_microstep: 245.51 | bwd_inner_microstep: 245.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:01:41,837] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.94 | bwd_microstep: 244.27 | bwd_inner_microstep: 244.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:01:42,268] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.41 | bwd_microstep: 243.37 | bwd_inner_microstep: 243.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:42,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.24 | bwd_microstep: 243.77 | bwd_inner_microstep: 243.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:01:43,130] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.58 | bwd_microstep: 243.27 | bwd_inner_microstep: 243.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:01:43,561] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.09 | bwd_microstep: 245.15 | bwd_inner_microstep: 245.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:43,990] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.04 | bwd_microstep: 242.58 | bwd_inner_microstep: 242.40 | bwd_allreduce_microstep: 0.03 | step_microstep: 0.30 +[2024-12-31 18:01:44,528] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.03 | optimizer_gradients: 0.60 | optimizer_step: 3.26 +[2024-12-31 18:01:44,530] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.33 | bwd_microstep: 338.00 | bwd_inner_microstep: 254.14 | bwd_allreduce_microstep: 83.81 | step_microstep: 13.10 +[2024-12-31 18:01:44,531] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2878.15 | bwd: 4181.69 | bwd_inner: 4096.91 | bwd_allreduce: 84.13 | step: 16.13 + 49%|████▉ | 372/759 [51:39<47:23, 7.35s/it] {'loss': 1.2232, 'learning_rate': 1.0810120216346368e-05, 'epoch': 0.49} + 49%|████▉ | 372/759 [51:39<47:23, 7.35s/it][2024-12-31 18:01:45,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.81 | bwd_microstep: 339.73 | bwd_inner_microstep: 339.27 | bwd_allreduce_microstep: 0.19 | step_microstep: 0.18 +[2024-12-31 18:01:45,604] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.36 | bwd_microstep: 266.91 | bwd_inner_microstep: 266.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:46,064] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.03 | bwd_microstep: 262.42 | bwd_inner_microstep: 262.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:01:46,513] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.20 | bwd_microstep: 256.52 | bwd_inner_microstep: 256.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:01:46,970] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.87 | bwd_microstep: 249.88 | bwd_inner_microstep: 249.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:01:47,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.86 | bwd_microstep: 249.74 | bwd_inner_microstep: 249.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:01:47,858] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.04 | bwd_microstep: 245.34 | bwd_inner_microstep: 245.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:01:48,283] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.01 | bwd_microstep: 245.37 | bwd_inner_microstep: 245.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:01:48,712] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.72 | bwd_microstep: 243.44 | bwd_inner_microstep: 243.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:01:49,135] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 243.48 | bwd_inner_microstep: 243.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:01:49,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 242.23 | bwd_inner_microstep: 242.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:01:49,979] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.57 | bwd_microstep: 242.59 | bwd_inner_microstep: 242.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:01:50,404] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.88 | bwd_microstep: 243.42 | bwd_inner_microstep: 243.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:01:50,826] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.60 | bwd_microstep: 242.34 | bwd_inner_microstep: 242.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:01:51,263] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 185.27 | bwd_microstep: 242.37 | bwd_inner_microstep: 242.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:01:51,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.99 | optimizer_gradients: 0.62 | optimizer_step: 3.10 +[2024-12-31 18:01:51,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.67 | bwd_microstep: 466.13 | bwd_inner_microstep: 250.85 | bwd_allreduce_microstep: 215.24 | step_microstep: 12.87 +[2024-12-31 18:01:51,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2845.34 | bwd: 4281.99 | bwd_inner: 4065.91 | bwd_allreduce: 215.51 | step: 14.55 + 49%|████▉ | 373/759 [51:46<47:22, 7.36s/it] {'loss': 1.2814, 'learning_rate': 1.0767568581875494e-05, 'epoch': 0.49} + 49%|████▉ | 373/759 [51:46<47:22, 7.36s/it][2024-12-31 18:01:52,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.36 | bwd_microstep: 312.33 | bwd_inner_microstep: 311.97 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:01:52,952] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.41 | bwd_microstep: 288.49 | bwd_inner_microstep: 288.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:01:53,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.99 | bwd_microstep: 283.37 | bwd_inner_microstep: 283.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:53,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.30 | bwd_microstep: 281.91 | bwd_inner_microstep: 281.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:01:54,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.23 | bwd_microstep: 255.41 | bwd_inner_microstep: 255.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:54,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.12 | bwd_microstep: 248.47 | bwd_inner_microstep: 248.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:01:55,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.02 | bwd_microstep: 247.48 | bwd_inner_microstep: 247.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:01:55,698] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.38 | bwd_microstep: 245.38 | bwd_inner_microstep: 245.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:56,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.48 | bwd_microstep: 244.10 | bwd_inner_microstep: 244.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:56,566] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 245.80 | bwd_inner_microstep: 245.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:01:56,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.57 | bwd_microstep: 243.54 | bwd_inner_microstep: 243.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:01:57,441] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.10 | bwd_microstep: 255.53 | bwd_inner_microstep: 255.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:57,873] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.05 | bwd_microstep: 243.33 | bwd_inner_microstep: 243.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:01:58,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 243.85 | bwd_inner_microstep: 243.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:01:58,736] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.96 | bwd_microstep: 243.56 | bwd_inner_microstep: 243.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:01:59,391] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.24 | optimizer_gradients: 0.84 | optimizer_step: 3.15 +[2024-12-31 18:01:59,391] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.93 | bwd_microstep: 459.12 | bwd_inner_microstep: 241.99 | bwd_allreduce_microstep: 217.04 | step_microstep: 12.22 +[2024-12-31 18:01:59,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2833.86 | bwd: 4341.72 | bwd_inner: 4123.84 | bwd_allreduce: 217.32 | step: 15.28 + 49%|████▉ | 374/759 [51:54<47:26, 7.39s/it] {'loss': 1.2448, 'learning_rate': 1.072500296246334e-05, 'epoch': 0.49} + 49%|████▉ | 374/759 [51:54<47:26, 7.39s/it][2024-12-31 18:02:00,031] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 233.51 | bwd_microstep: 384.52 | bwd_inner_microstep: 384.17 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:02:00,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.50 | bwd_microstep: 283.88 | bwd_inner_microstep: 283.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:02:00,973] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.04 | bwd_microstep: 257.15 | bwd_inner_microstep: 257.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:02:01,431] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.15 | bwd_microstep: 262.48 | bwd_inner_microstep: 262.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:02:01,866] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.84 | bwd_microstep: 250.27 | bwd_inner_microstep: 250.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:02:02,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.96 | bwd_microstep: 255.14 | bwd_inner_microstep: 255.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:02:02,772] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.25 | bwd_microstep: 245.99 | bwd_inner_microstep: 245.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:02:03,208] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.41 | bwd_microstep: 245.91 | bwd_inner_microstep: 245.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:03,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 243.65 | bwd_inner_microstep: 243.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:04,073] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.40 | bwd_microstep: 245.46 | bwd_inner_microstep: 245.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:04,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.82 | bwd_microstep: 244.15 | bwd_inner_microstep: 244.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:02:04,931] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.41 | bwd_microstep: 241.84 | bwd_inner_microstep: 241.64 | bwd_allreduce_microstep: 0.04 | step_microstep: 0.26 +[2024-12-31 18:02:05,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 199.42 | bwd_microstep: 241.29 | bwd_inner_microstep: 241.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:02:05,825] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.70 | bwd_microstep: 244.43 | bwd_inner_microstep: 244.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:06,255] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.20 | bwd_microstep: 243.06 | bwd_inner_microstep: 241.96 | bwd_allreduce_microstep: 0.46 | step_microstep: 0.71 +[2024-12-31 18:02:06,718] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.20 | optimizer_gradients: 0.68 | optimizer_step: 3.20 +[2024-12-31 18:02:06,719] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.48 | bwd_microstep: 259.35 | bwd_inner_microstep: 242.46 | bwd_allreduce_microstep: 16.79 | step_microstep: 16.59 +[2024-12-31 18:02:06,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2857.69 | bwd: 4149.02 | bwd_inner: 4129.81 | bwd_allreduce: 17.71 | step: 20.22 + 49%|████▉ | 375/759 [52:01<47:11, 7.37s/it] {'loss': 1.2454, 'learning_rate': 1.0682424133646712e-05, 'epoch': 0.49} + 49%|████▉ | 375/759 [52:01<47:11, 7.37s/it][2024-12-31 18:02:07,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 220.05 | bwd_microstep: 351.92 | bwd_inner_microstep: 351.59 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:02:07,807] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.67 | bwd_microstep: 288.76 | bwd_inner_microstep: 288.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:08,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.24 | bwd_microstep: 266.64 | bwd_inner_microstep: 266.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:08,736] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 184.17 | bwd_microstep: 263.41 | bwd_inner_microstep: 263.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:02:09,178] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 250.10 | bwd_inner_microstep: 250.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:02:09,626] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.95 | bwd_microstep: 257.09 | bwd_inner_microstep: 257.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:10,060] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.75 | bwd_microstep: 245.33 | bwd_inner_microstep: 245.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:10,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.67 | bwd_microstep: 255.26 | bwd_inner_microstep: 255.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:02:10,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.51 | bwd_microstep: 244.49 | bwd_inner_microstep: 244.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:02:11,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.02 | bwd_microstep: 245.51 | bwd_inner_microstep: 245.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:02:11,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.19 | bwd_microstep: 244.60 | bwd_inner_microstep: 244.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:02:12,242] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.06 | bwd_microstep: 256.03 | bwd_inner_microstep: 255.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:02:12,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.71 | bwd_microstep: 243.98 | bwd_inner_microstep: 243.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:02:13,109] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.73 | bwd_microstep: 243.77 | bwd_inner_microstep: 243.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:13,566] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 198.27 | bwd_microstep: 241.75 | bwd_inner_microstep: 241.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:14,119] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 92.14 | optimizer_gradients: 3.99 | optimizer_step: 10.34 +[2024-12-31 18:02:14,120] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.23 | bwd_microstep: 258.73 | bwd_inner_microstep: 244.69 | bwd_allreduce_microstep: 13.93 | step_microstep: 109.95 +[2024-12-31 18:02:14,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2853.12 | bwd: 4157.60 | bwd_inner: 4142.62 | bwd_allreduce: 14.20 | step: 112.86 + 50%|████▉ | 376/759 [52:09<47:07, 7.38s/it] {'loss': 1.2136, 'learning_rate': 1.0639832871203094e-05, 'epoch': 0.5} + 50%|████▉ | 376/759 [52:09<47:07, 7.38s/it][2024-12-31 18:02:14,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 222.45 | bwd_microstep: 356.69 | bwd_inner_microstep: 356.31 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:02:15,229] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.76 | bwd_microstep: 283.74 | bwd_inner_microstep: 283.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:15,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.29 | bwd_microstep: 262.95 | bwd_inner_microstep: 262.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:16,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.25 | bwd_microstep: 256.64 | bwd_inner_microstep: 256.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:02:16,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.71 | bwd_microstep: 256.40 | bwd_inner_microstep: 256.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:17,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.79 | bwd_microstep: 247.79 | bwd_inner_microstep: 247.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:02:17,487] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.78 | bwd_microstep: 248.07 | bwd_inner_microstep: 248.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:02:17,923] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.33 | bwd_microstep: 248.30 | bwd_inner_microstep: 248.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:02:18,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.06 | bwd_microstep: 245.74 | bwd_inner_microstep: 245.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:18,788] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 244.14 | bwd_inner_microstep: 244.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:02:19,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.89 | bwd_microstep: 251.96 | bwd_inner_microstep: 251.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:02:19,659] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.61 | bwd_microstep: 243.31 | bwd_inner_microstep: 243.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:02:20,090] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.37 | bwd_microstep: 243.06 | bwd_inner_microstep: 243.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:02:20,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.61 | bwd_microstep: 244.25 | bwd_inner_microstep: 244.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:02:20,956] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 242.79 | bwd_inner_microstep: 242.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:21,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.59 | optimizer_gradients: 0.73 | optimizer_step: 3.34 +[2024-12-31 18:02:21,412] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.99 | bwd_microstep: 257.88 | bwd_inner_microstep: 244.25 | bwd_allreduce_microstep: 13.52 | step_microstep: 10.77 +[2024-12-31 18:02:21,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2853.14 | bwd: 4133.80 | bwd_inner: 4119.38 | bwd_allreduce: 13.80 | step: 13.78 + 50%|████▉ | 377/759 [52:16<46:49, 7.36s/it] {'loss': 1.222, 'learning_rate': 1.0597229951136498e-05, 'epoch': 0.5} + 50%|████▉ | 377/759 [52:16<46:49, 7.36s/it][2024-12-31 18:02:21,936] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.90 | bwd_microstep: 302.27 | bwd_inner_microstep: 301.92 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:02:22,425] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.11 | bwd_microstep: 282.92 | bwd_inner_microstep: 282.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:02:22,892] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.32 | bwd_microstep: 267.77 | bwd_inner_microstep: 267.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:23,344] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.66 | bwd_microstep: 254.80 | bwd_inner_microstep: 254.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:02:23,799] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.48 | bwd_microstep: 259.42 | bwd_inner_microstep: 259.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:02:24,285] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.24 | bwd_microstep: 291.41 | bwd_inner_microstep: 291.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:02:24,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.29 | bwd_microstep: 247.20 | bwd_inner_microstep: 247.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:02:25,157] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.04 | bwd_microstep: 246.37 | bwd_inner_microstep: 246.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:25,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.72 | bwd_microstep: 246.52 | bwd_inner_microstep: 246.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:02:26,023] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.60 | bwd_microstep: 244.13 | bwd_inner_microstep: 244.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:02:26,445] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.21 | bwd_microstep: 244.10 | bwd_inner_microstep: 244.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:26,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.68 | bwd_microstep: 244.27 | bwd_inner_microstep: 244.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:27,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.21 | bwd_microstep: 246.04 | bwd_inner_microstep: 246.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:02:27,748] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.33 | bwd_microstep: 249.77 | bwd_inner_microstep: 249.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:02:28,187] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.55 | bwd_microstep: 248.56 | bwd_inner_microstep: 248.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:02:28,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.89 | optimizer_gradients: 0.69 | optimizer_step: 3.14 +[2024-12-31 18:02:28,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.57 | bwd_microstep: 515.67 | bwd_inner_microstep: 244.27 | bwd_allreduce_microstep: 271.36 | step_microstep: 10.95 +[2024-12-31 18:02:28,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2810.35 | bwd: 4391.31 | bwd_inner: 4119.11 | bwd_allreduce: 271.60 | step: 13.93 + 50%|████▉ | 378/759 [52:23<46:57, 7.39s/it] {'loss': 1.2472, 'learning_rate': 1.0554616149663355e-05, 'epoch': 0.5} + 50%|████▉ | 378/759 [52:23<46:57, 7.39s/it][2024-12-31 18:02:29,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 229.75 | bwd_microstep: 370.49 | bwd_inner_microstep: 370.13 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:02:30,022] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 198.28 | bwd_microstep: 291.32 | bwd_inner_microstep: 291.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:30,519] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.57 | bwd_microstep: 287.98 | bwd_inner_microstep: 287.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:02:30,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.08 | bwd_microstep: 267.09 | bwd_inner_microstep: 267.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:02:31,453] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.83 | bwd_microstep: 271.39 | bwd_inner_microstep: 271.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:02:31,913] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.49 | bwd_microstep: 262.89 | bwd_inner_microstep: 262.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:02:32,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.39 | bwd_microstep: 261.35 | bwd_inner_microstep: 261.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:32,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.11 | bwd_microstep: 247.84 | bwd_inner_microstep: 247.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:02:33,244] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.82 | bwd_microstep: 251.33 | bwd_inner_microstep: 251.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:02:33,678] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 244.83 | bwd_inner_microstep: 244.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:34,110] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.20 | bwd_microstep: 244.55 | bwd_inner_microstep: 244.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:34,542] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 244.55 | bwd_inner_microstep: 244.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:02:34,974] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.24 | bwd_microstep: 244.09 | bwd_inner_microstep: 244.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:02:35,406] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.02 | bwd_microstep: 243.74 | bwd_inner_microstep: 243.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:35,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 157.51 | bwd_microstep: 225.35 | bwd_inner_microstep: 225.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:02:36,300] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.18 | optimizer_gradients: 0.63 | optimizer_step: 3.11 +[2024-12-31 18:02:36,301] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.42 | bwd_microstep: 286.01 | bwd_inner_microstep: 243.99 | bwd_allreduce_microstep: 41.97 | step_microstep: 22.37 +[2024-12-31 18:02:36,302] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2853.00 | bwd: 4244.87 | bwd_inner: 4202.11 | bwd_allreduce: 42.22 | step: 25.41 + 50%|████▉ | 379/759 [52:31<46:50, 7.40s/it] {'loss': 1.2139, 'learning_rate': 1.0511992243198335e-05, 'epoch': 0.5} + 50%|████▉ | 379/759 [52:31<46:50, 7.40s/it][2024-12-31 18:02:36,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 236.38 | bwd_microstep: 384.64 | bwd_inner_microstep: 384.17 | bwd_allreduce_microstep: 0.18 | step_microstep: 0.26 +[2024-12-31 18:02:37,443] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.02 | bwd_microstep: 288.72 | bwd_inner_microstep: 288.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:02:37,898] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.66 | bwd_microstep: 261.01 | bwd_inner_microstep: 260.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:02:38,340] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.36 | bwd_microstep: 248.70 | bwd_inner_microstep: 248.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:02:38,780] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.84 | bwd_microstep: 249.70 | bwd_inner_microstep: 249.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:02:39,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.02 | bwd_microstep: 257.23 | bwd_inner_microstep: 257.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:39,680] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.84 | bwd_microstep: 259.00 | bwd_inner_microstep: 258.78 | bwd_allreduce_microstep: 0.04 | step_microstep: 0.26 +[2024-12-31 18:02:40,129] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.13 | bwd_microstep: 250.87 | bwd_inner_microstep: 250.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:02:40,560] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.59 | bwd_microstep: 244.21 | bwd_inner_microstep: 244.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:02:41,054] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 231.78 | bwd_microstep: 243.82 | bwd_inner_microstep: 243.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:02:41,486] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.86 | bwd_microstep: 243.92 | bwd_inner_microstep: 243.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 18:02:41,931] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.41 | bwd_microstep: 250.31 | bwd_inner_microstep: 250.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:02:42,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.00 | bwd_microstep: 244.69 | bwd_inner_microstep: 244.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:42,794] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.65 | bwd_microstep: 243.34 | bwd_inner_microstep: 243.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:02:43,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.41 | bwd_microstep: 251.24 | bwd_inner_microstep: 251.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:02:43,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.39 | optimizer_gradients: 1.06 | optimizer_step: 3.49 +[2024-12-31 18:02:43,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.94 | bwd_microstep: 257.94 | bwd_inner_microstep: 244.21 | bwd_allreduce_microstep: 13.60 | step_microstep: 14.62 +[2024-12-31 18:02:43,695] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2905.31 | bwd: 4179.59 | bwd_inner: 4164.57 | bwd_allreduce: 14.07 | step: 17.66 + 50%|█████ | 380/759 [52:38<46:43, 7.40s/it] {'loss': 1.2283, 'learning_rate': 1.0469359008340216e-05, 'epoch': 0.5} + 50%|█████ | 380/759 [52:38<46:43, 7.40s/it][2024-12-31 18:02:44,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.13 | bwd_microstep: 338.80 | bwd_inner_microstep: 338.45 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:02:44,761] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.04 | bwd_microstep: 288.07 | bwd_inner_microstep: 288.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:02:45,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.04 | bwd_microstep: 282.13 | bwd_inner_microstep: 282.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:02:45,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.57 | bwd_microstep: 263.41 | bwd_inner_microstep: 263.17 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.21 +[2024-12-31 18:02:46,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.34 | bwd_microstep: 255.03 | bwd_inner_microstep: 254.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:46,603] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.43 | bwd_microstep: 250.66 | bwd_inner_microstep: 250.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:02:47,043] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.06 | bwd_microstep: 249.71 | bwd_inner_microstep: 249.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:47,486] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.52 | bwd_microstep: 245.15 | bwd_inner_microstep: 245.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:02:47,924] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.37 | bwd_microstep: 249.24 | bwd_inner_microstep: 249.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:48,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.91 | bwd_microstep: 245.97 | bwd_inner_microstep: 245.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:48,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.92 | bwd_microstep: 246.20 | bwd_inner_microstep: 246.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.42 +[2024-12-31 18:02:49,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 243.80 | bwd_inner_microstep: 243.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:49,665] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.62 | bwd_microstep: 243.92 | bwd_inner_microstep: 243.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:50,096] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.00 | bwd_microstep: 243.40 | bwd_inner_microstep: 243.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:02:50,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.21 | bwd_microstep: 247.95 | bwd_inner_microstep: 247.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:51,013] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.24 | optimizer_gradients: 0.64 | optimizer_step: 3.08 +[2024-12-31 18:02:51,013] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.55 | bwd_microstep: 283.56 | bwd_inner_microstep: 244.38 | bwd_allreduce_microstep: 39.13 | step_microstep: 10.97 +[2024-12-31 18:02:51,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2842.19 | bwd: 4177.11 | bwd_inner: 4136.84 | bwd_allreduce: 39.56 | step: 14.24 + 50%|█████ | 381/759 [52:45<46:26, 7.37s/it] {'loss': 1.2242, 'learning_rate': 1.0426717221857756e-05, 'epoch': 0.5} + 50%|█████ | 381/759 [52:45<46:26, 7.37s/it][2024-12-31 18:02:51,632] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 229.15 | bwd_microstep: 373.36 | bwd_inner_microstep: 372.98 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:02:52,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 207.98 | bwd_microstep: 291.46 | bwd_inner_microstep: 291.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:02:52,637] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.79 | bwd_microstep: 283.86 | bwd_inner_microstep: 283.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:02:53,088] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.75 | bwd_microstep: 256.48 | bwd_inner_microstep: 256.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:53,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.77 | bwd_microstep: 255.03 | bwd_inner_microstep: 255.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:02:53,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.03 | bwd_microstep: 248.69 | bwd_inner_microstep: 248.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:54,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.81 | bwd_microstep: 248.24 | bwd_inner_microstep: 248.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:02:54,851] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.66 | bwd_microstep: 248.18 | bwd_inner_microstep: 248.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:02:55,285] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.58 | bwd_microstep: 245.81 | bwd_inner_microstep: 245.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:02:55,719] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.99 | bwd_microstep: 245.17 | bwd_inner_microstep: 245.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:56,153] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.95 | bwd_microstep: 245.07 | bwd_inner_microstep: 245.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:56,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.05 | bwd_microstep: 243.74 | bwd_inner_microstep: 243.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:02:57,015] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 243.73 | bwd_inner_microstep: 243.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:02:57,445] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.23 | bwd_microstep: 243.92 | bwd_inner_microstep: 243.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:02:57,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.67 | bwd_microstep: 240.35 | bwd_inner_microstep: 240.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:58,331] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 7.51 | optimizer_gradients: 0.74 | optimizer_step: 3.31 +[2024-12-31 18:02:58,332] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.43 | bwd_microstep: 256.52 | bwd_inner_microstep: 242.89 | bwd_allreduce_microstep: 13.52 | step_microstep: 14.50 +[2024-12-31 18:02:58,334] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2856.07 | bwd: 4169.77 | bwd_inner: 4155.26 | bwd_allreduce: 13.82 | step: 17.42 + 50%|█████ | 382/759 [52:53<46:13, 7.36s/it] {'loss': 1.2512, 'learning_rate': 1.0384067660675508e-05, 'epoch': 0.5} + 50%|█████ | 382/759 [52:53<46:13, 7.36s/it][2024-12-31 18:02:58,941] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 220.19 | bwd_microstep: 368.71 | bwd_inner_microstep: 368.36 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.24 +[2024-12-31 18:02:59,445] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.54 | bwd_microstep: 292.79 | bwd_inner_microstep: 292.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:02:59,903] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.43 | bwd_microstep: 261.63 | bwd_inner_microstep: 261.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:00,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.60 | bwd_microstep: 256.75 | bwd_inner_microstep: 256.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:00,792] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 248.48 | bwd_inner_microstep: 248.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:01,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.23 | bwd_microstep: 246.70 | bwd_inner_microstep: 246.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:03:01,665] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.23 | bwd_microstep: 248.58 | bwd_inner_microstep: 248.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:02,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.93 | bwd_microstep: 249.17 | bwd_inner_microstep: 249.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:02,538] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.75 | bwd_microstep: 245.09 | bwd_inner_microstep: 245.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:02,972] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.86 | bwd_microstep: 244.92 | bwd_inner_microstep: 244.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:03:03,404] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.17 | bwd_microstep: 243.55 | bwd_inner_microstep: 243.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:03,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.97 | bwd_microstep: 243.33 | bwd_inner_microstep: 243.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:04,277] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 244.38 | bwd_inner_microstep: 244.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:03:04,712] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.75 | bwd_microstep: 246.03 | bwd_inner_microstep: 246.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:05,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.49 | bwd_microstep: 241.21 | bwd_inner_microstep: 241.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:05,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.73 | optimizer_gradients: 0.78 | optimizer_step: 3.37 +[2024-12-31 18:03:05,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.45 | bwd_microstep: 270.93 | bwd_inner_microstep: 256.54 | bwd_allreduce_microstep: 14.18 | step_microstep: 17.19 +[2024-12-31 18:03:05,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2813.55 | bwd: 4152.39 | bwd_inner: 4137.18 | bwd_allreduce: 14.51 | step: 20.28 + 50%|█████ | 383/759 [53:00<45:57, 7.33s/it] {'loss': 1.2419, 'learning_rate': 1.034141110185968e-05, 'epoch': 0.5} + 50%|█████ | 383/759 [53:00<45:57, 7.33s/it][2024-12-31 18:03:06,223] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 227.33 | bwd_microstep: 366.75 | bwd_inner_microstep: 366.41 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:03:06,738] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 198.03 | bwd_microstep: 300.35 | bwd_inner_microstep: 300.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:03:07,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.05 | bwd_microstep: 286.54 | bwd_inner_microstep: 286.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:03:07,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 205.66 | bwd_microstep: 265.93 | bwd_inner_microstep: 265.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.28 +[2024-12-31 18:03:08,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.72 | bwd_microstep: 263.00 | bwd_inner_microstep: 262.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:03:08,638] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.67 | bwd_microstep: 261.39 | bwd_inner_microstep: 261.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:03:09,073] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.05 | bwd_microstep: 247.63 | bwd_inner_microstep: 247.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:03:09,512] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.49 | bwd_microstep: 248.93 | bwd_inner_microstep: 248.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:03:09,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.03 | bwd_microstep: 246.00 | bwd_inner_microstep: 245.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:03:10,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.83 | bwd_microstep: 245.68 | bwd_inner_microstep: 245.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:10,810] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.92 | bwd_microstep: 244.14 | bwd_inner_microstep: 244.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:03:11,245] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 247.21 | bwd_inner_microstep: 247.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:11,699] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.91 | bwd_microstep: 244.18 | bwd_inner_microstep: 244.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:03:12,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 244.01 | bwd_inner_microstep: 243.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:03:12,560] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 242.89 | bwd_inner_microstep: 242.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:13,086] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.14 | optimizer_gradients: 0.61 | optimizer_step: 3.09 +[2024-12-31 18:03:13,086] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.91 | bwd_microstep: 329.27 | bwd_inner_microstep: 244.87 | bwd_allreduce_microstep: 84.35 | step_microstep: 10.67 +[2024-12-31 18:03:13,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2904.10 | bwd: 4283.93 | bwd_inner: 4198.81 | bwd_allreduce: 84.60 | step: 13.75 + 51%|█████ | 384/759 [53:08<46:05, 7.38s/it] {'loss': 1.2473, 'learning_rate': 1.0298748322603982e-05, 'epoch': 0.51} + 51%|█████ | 384/759 [53:08<46:05, 7.38s/it][2024-12-31 18:03:13,847] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 273.83 | bwd_microstep: 471.96 | bwd_inner_microstep: 471.59 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:03:14,450] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 229.80 | bwd_microstep: 356.09 | bwd_inner_microstep: 356.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:03:14,936] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.78 | bwd_microstep: 280.94 | bwd_inner_microstep: 280.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:15,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.21 | bwd_microstep: 259.49 | bwd_inner_microstep: 259.13 | bwd_allreduce_microstep: 0.24 | step_microstep: 0.27 +[2024-12-31 18:03:15,844] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.39 | bwd_microstep: 254.77 | bwd_inner_microstep: 254.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:03:16,276] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.55 | bwd_microstep: 248.55 | bwd_inner_microstep: 248.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:03:16,911] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.49 | bwd_microstep: 450.02 | bwd_inner_microstep: 449.79 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:03:17,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.16 | bwd_microstep: 245.22 | bwd_inner_microstep: 245.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:17,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 246.08 | bwd_inner_microstep: 246.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:18,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.84 | bwd_microstep: 246.29 | bwd_inner_microstep: 246.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:03:18,660] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.32 | bwd_microstep: 243.69 | bwd_inner_microstep: 243.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:19,093] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.45 | bwd_microstep: 243.37 | bwd_inner_microstep: 243.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:03:19,530] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.31 | bwd_microstep: 243.20 | bwd_inner_microstep: 243.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:03:19,962] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 243.33 | bwd_inner_microstep: 243.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:03:20,387] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.61 | bwd_microstep: 241.01 | bwd_inner_microstep: 240.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:03:20,991] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 8.59 | optimizer_gradients: 0.87 | optimizer_step: 9.03 +[2024-12-31 18:03:20,991] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.82 | bwd_microstep: 398.76 | bwd_inner_microstep: 241.16 | bwd_allreduce_microstep: 157.55 | step_microstep: 21.14 +[2024-12-31 18:03:20,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2938.29 | bwd: 4672.98 | bwd_inner: 4513.95 | bwd_allreduce: 158.19 | step: 24.13 + 51%|█████ | 385/759 [53:15<46:57, 7.53s/it] {'loss': 1.2309, 'learning_rate': 1.0256080100215448e-05, 'epoch': 0.51} + 51%|█████ | 385/759 [53:15<46:57, 7.53s/it][2024-12-31 18:03:21,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 237.14 | bwd_microstep: 389.60 | bwd_inner_microstep: 389.25 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:03:22,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 217.00 | bwd_microstep: 348.01 | bwd_inner_microstep: 347.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:03:22,744] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.09 | bwd_microstep: 284.70 | bwd_inner_microstep: 284.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:03:23,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.13 | bwd_microstep: 278.07 | bwd_inner_microstep: 278.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:03:23,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.69 | bwd_microstep: 259.59 | bwd_inner_microstep: 259.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:24,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.47 | bwd_microstep: 250.75 | bwd_inner_microstep: 250.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:03:24,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.78 | bwd_microstep: 250.52 | bwd_inner_microstep: 250.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:03:24,992] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.18 | bwd_microstep: 247.19 | bwd_inner_microstep: 247.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:03:25,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.88 | bwd_microstep: 260.52 | bwd_inner_microstep: 260.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:25,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.19 | bwd_microstep: 244.67 | bwd_inner_microstep: 244.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:03:26,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 242.97 | bwd_inner_microstep: 242.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:03:26,737] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.64 | bwd_microstep: 243.07 | bwd_inner_microstep: 243.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:03:27,169] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 244.17 | bwd_inner_microstep: 244.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:27,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.59 | bwd_microstep: 242.84 | bwd_inner_microstep: 242.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:03:28,031] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.42 | bwd_microstep: 244.11 | bwd_inner_microstep: 244.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:28,500] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.55 | optimizer_gradients: 1.03 | optimizer_step: 3.45 +[2024-12-31 18:03:28,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.26 | bwd_microstep: 271.75 | bwd_inner_microstep: 257.14 | bwd_allreduce_microstep: 14.47 | step_microstep: 29.48 +[2024-12-31 18:03:28,521] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2874.57 | bwd: 4302.61 | bwd_inner: 4287.25 | bwd_allreduce: 14.76 | step: 32.48 + 51%|█████ | 386/759 [53:23<46:50, 7.53s/it] {'loss': 1.2165, 'learning_rate': 1.0213407212100296e-05, 'epoch': 0.51} + 51%|█████ | 386/759 [53:23<46:50, 7.53s/it][2024-12-31 18:03:29,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 227.41 | bwd_microstep: 364.95 | bwd_inner_microstep: 364.60 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.22 +[2024-12-31 18:03:29,661] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.64 | bwd_microstep: 291.60 | bwd_inner_microstep: 291.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 18:03:30,125] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.31 | bwd_microstep: 266.00 | bwd_inner_microstep: 265.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:03:30,575] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.38 | bwd_microstep: 254.99 | bwd_inner_microstep: 254.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:31,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.64 | bwd_microstep: 280.17 | bwd_inner_microstep: 280.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:03:31,489] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.02 | bwd_microstep: 248.86 | bwd_inner_microstep: 248.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:31,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.00 | bwd_microstep: 247.63 | bwd_inner_microstep: 247.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:32,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.22 | bwd_microstep: 243.48 | bwd_inner_microstep: 243.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:32,790] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.72 | bwd_microstep: 245.00 | bwd_inner_microstep: 244.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:03:33,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.05 | bwd_microstep: 253.89 | bwd_inner_microstep: 253.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:03:33,665] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.06 | bwd_microstep: 244.66 | bwd_inner_microstep: 244.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:34,096] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.03 | bwd_microstep: 244.35 | bwd_inner_microstep: 244.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:03:34,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.63 | bwd_microstep: 244.92 | bwd_inner_microstep: 244.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:34,962] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.71 | bwd_microstep: 244.96 | bwd_inner_microstep: 244.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:03:35,389] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.67 | bwd_microstep: 240.92 | bwd_inner_microstep: 240.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:03:35,856] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 2.26 | optimizer_step: 6.94 +[2024-12-31 18:03:35,856] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.83 | bwd_microstep: 255.32 | bwd_inner_microstep: 241.68 | bwd_allreduce_microstep: 13.54 | step_microstep: 25.60 +[2024-12-31 18:03:35,858] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2842.75 | bwd: 4171.76 | bwd_inner: 4157.40 | bwd_allreduce: 13.81 | step: 28.31 + 51%|█████ | 387/759 [53:30<46:20, 7.47s/it] {'loss': 1.2257, 'learning_rate': 1.017073043574975e-05, 'epoch': 0.51} + 51%|█████ | 387/759 [53:30<46:20, 7.47s/it][2024-12-31 18:03:36,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.18 | bwd_microstep: 309.02 | bwd_inner_microstep: 308.65 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.22 +[2024-12-31 18:03:36,906] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.19 | bwd_microstep: 287.15 | bwd_inner_microstep: 287.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:37,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.79 | bwd_microstep: 262.59 | bwd_inner_microstep: 262.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:37,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.78 | bwd_microstep: 256.90 | bwd_inner_microstep: 256.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:38,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.43 | bwd_microstep: 251.71 | bwd_inner_microstep: 251.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:38,694] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.75 | bwd_microstep: 246.10 | bwd_inner_microstep: 246.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:03:39,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.79 | bwd_microstep: 247.05 | bwd_inner_microstep: 247.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:03:39,562] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 244.46 | bwd_inner_microstep: 244.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:39,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.85 | bwd_microstep: 245.05 | bwd_inner_microstep: 245.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:03:40,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.38 | bwd_microstep: 255.94 | bwd_inner_microstep: 255.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:03:40,878] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.65 | bwd_microstep: 243.92 | bwd_inner_microstep: 243.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:03:41,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.02 | bwd_microstep: 243.32 | bwd_inner_microstep: 243.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:41,749] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.44 | bwd_microstep: 244.37 | bwd_inner_microstep: 244.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:03:42,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.29 | bwd_microstep: 241.26 | bwd_inner_microstep: 241.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:42,616] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.48 | bwd_microstep: 246.71 | bwd_inner_microstep: 246.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:03:43,142] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.08 | optimizer_gradients: 0.60 | optimizer_step: 3.09 +[2024-12-31 18:03:43,143] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.43 | bwd_microstep: 340.04 | bwd_inner_microstep: 242.32 | bwd_allreduce_microstep: 97.56 | step_microstep: 11.05 +[2024-12-31 18:03:43,144] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2819.02 | bwd: 4165.87 | bwd_inner: 4067.02 | bwd_allreduce: 97.96 | step: 13.74 + 51%|█████ | 388/759 [53:38<45:51, 7.42s/it] {'loss': 1.2333, 'learning_rate': 1.0128050548725865e-05, 'epoch': 0.51} + 51%|█████ | 388/759 [53:38<45:51, 7.42s/it][2024-12-31 18:03:43,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 212.83 | bwd_microstep: 335.33 | bwd_inner_microstep: 334.99 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:03:44,233] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.59 | bwd_microstep: 308.70 | bwd_inner_microstep: 308.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:44,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 185.76 | bwd_microstep: 268.02 | bwd_inner_microstep: 268.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:03:45,168] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.25 | bwd_microstep: 264.55 | bwd_inner_microstep: 264.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:03:45,606] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.55 | bwd_microstep: 248.61 | bwd_inner_microstep: 248.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:46,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.45 | bwd_microstep: 275.25 | bwd_inner_microstep: 275.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:46,502] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.02 | bwd_microstep: 244.79 | bwd_inner_microstep: 244.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:46,936] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.27 | bwd_microstep: 245.42 | bwd_inner_microstep: 245.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:47,370] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.63 | bwd_microstep: 245.55 | bwd_inner_microstep: 245.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:47,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.11 | bwd_microstep: 246.14 | bwd_inner_microstep: 246.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:03:48,236] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.41 | bwd_microstep: 243.06 | bwd_inner_microstep: 242.87 | bwd_allreduce_microstep: 0.04 | step_microstep: 0.27 +[2024-12-31 18:03:48,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.08 | bwd_microstep: 242.89 | bwd_inner_microstep: 242.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:49,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.40 | bwd_microstep: 244.59 | bwd_inner_microstep: 244.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:03:49,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 275.81 | bwd_inner_microstep: 275.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:03:49,996] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.44 | bwd_microstep: 241.64 | bwd_inner_microstep: 241.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:50,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.96 | optimizer_gradients: 1.16 | optimizer_step: 3.10 +[2024-12-31 18:03:50,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.16 | bwd_microstep: 309.50 | bwd_inner_microstep: 243.10 | bwd_allreduce_microstep: 66.35 | step_microstep: 11.37 +[2024-12-31 18:03:50,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2826.51 | bwd: 4240.01 | bwd_inner: 4172.55 | bwd_allreduce: 66.73 | step: 14.41 + 51%|█████▏ | 389/759 [53:45<45:38, 7.40s/it] {'loss': 1.1892, 'learning_rate': 1.0085368328647395e-05, 'epoch': 0.51} + 51%|█████▏ | 389/759 [53:45<45:38, 7.40s/it][2024-12-31 18:03:51,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 204.65 | bwd_microstep: 317.48 | bwd_inner_microstep: 317.13 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:03:51,541] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.19 | bwd_microstep: 287.33 | bwd_inner_microstep: 287.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:52,010] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.32 | bwd_microstep: 268.93 | bwd_inner_microstep: 268.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:03:52,468] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.32 | bwd_microstep: 262.88 | bwd_inner_microstep: 262.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:03:52,919] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.23 | bwd_microstep: 256.72 | bwd_inner_microstep: 256.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:03:53,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.75 | bwd_microstep: 249.38 | bwd_inner_microstep: 249.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:53,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.86 | bwd_microstep: 252.79 | bwd_inner_microstep: 252.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:03:54,238] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.10 | bwd_microstep: 245.18 | bwd_inner_microstep: 245.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:03:54,674] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.17 | bwd_microstep: 245.02 | bwd_inner_microstep: 244.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:03:55,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.71 | bwd_microstep: 246.71 | bwd_inner_microstep: 246.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:03:55,539] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.57 | bwd_microstep: 243.20 | bwd_inner_microstep: 243.01 | bwd_allreduce_microstep: 0.04 | step_microstep: 0.28 +[2024-12-31 18:03:55,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.62 | bwd_microstep: 244.14 | bwd_inner_microstep: 244.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:03:56,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.19 | bwd_microstep: 243.21 | bwd_inner_microstep: 243.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:03:56,840] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 245.22 | bwd_inner_microstep: 245.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:03:57,271] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.27 | bwd_microstep: 243.36 | bwd_inner_microstep: 243.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:03:57,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.52 | optimizer_gradients: 0.55 | optimizer_step: 3.09 +[2024-12-31 18:03:57,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.78 | bwd_microstep: 404.73 | bwd_inner_microstep: 242.28 | bwd_allreduce_microstep: 162.40 | step_microstep: 11.01 +[2024-12-31 18:03:57,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2812.43 | bwd: 4256.45 | bwd_inner: 4092.94 | bwd_allreduce: 162.76 | step: 14.26 + 51%|█████▏ | 390/759 [53:52<45:26, 7.39s/it] {'loss': 1.221, 'learning_rate': 1.0042684553175575e-05, 'epoch': 0.51} + 51%|█████▏ | 390/759 [53:52<45:26, 7.39s/it][2024-12-31 18:03:58,551] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 257.38 | bwd_microstep: 408.43 | bwd_inner_microstep: 408.08 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:03:59,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.64 | bwd_microstep: 291.02 | bwd_inner_microstep: 291.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:03:59,548] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.39 | bwd_microstep: 286.94 | bwd_inner_microstep: 286.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:04:00,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.87 | bwd_microstep: 261.57 | bwd_inner_microstep: 261.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:04:00,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.09 | bwd_microstep: 257.03 | bwd_inner_microstep: 257.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:04:00,896] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.07 | bwd_microstep: 250.85 | bwd_inner_microstep: 250.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:01,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.62 | bwd_microstep: 249.36 | bwd_inner_microstep: 249.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:01,779] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.02 | bwd_microstep: 244.89 | bwd_inner_microstep: 244.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:04:02,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.38 | bwd_microstep: 245.79 | bwd_inner_microstep: 245.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:04:02,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.45 | bwd_microstep: 247.55 | bwd_inner_microstep: 247.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:04:03,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.63 | bwd_microstep: 245.50 | bwd_inner_microstep: 245.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:04:03,514] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.51 | bwd_microstep: 244.20 | bwd_inner_microstep: 244.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:04:03,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 244.00 | bwd_inner_microstep: 243.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:04:04,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.98 | bwd_microstep: 244.57 | bwd_inner_microstep: 244.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:04:04,810] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.89 | bwd_microstep: 245.01 | bwd_inner_microstep: 244.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:05,398] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.67 | optimizer_gradients: 0.61 | optimizer_step: 3.22 +[2024-12-31 18:04:05,398] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.02 | bwd_microstep: 391.62 | bwd_inner_microstep: 243.64 | bwd_allreduce_microstep: 147.94 | step_microstep: 10.77 +[2024-12-31 18:04:05,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2882.50 | bwd: 4358.40 | bwd_inner: 4209.68 | bwd_allreduce: 148.19 | step: 13.99 + 52%|█████▏ | 391/759 [54:00<45:34, 7.43s/it] {'loss': 1.2375, 'learning_rate': 1e-05, 'epoch': 0.52} + 52%|█████▏ | 391/759 [54:00<45:34, 7.43s/it][2024-12-31 18:04:06,017] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 229.82 | bwd_microstep: 373.68 | bwd_inner_microstep: 373.32 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.19 +[2024-12-31 18:04:06,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.80 | bwd_microstep: 291.60 | bwd_inner_microstep: 291.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:04:06,976] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.60 | bwd_microstep: 267.20 | bwd_inner_microstep: 267.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:04:07,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.99 | bwd_microstep: 268.08 | bwd_inner_microstep: 268.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:07,897] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.30 | bwd_microstep: 254.97 | bwd_inner_microstep: 254.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:04:08,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.79 | bwd_microstep: 249.86 | bwd_inner_microstep: 249.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:04:08,772] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.78 | bwd_microstep: 247.72 | bwd_inner_microstep: 247.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:09,209] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.08 | bwd_microstep: 249.02 | bwd_inner_microstep: 248.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:09,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.28 | bwd_microstep: 245.96 | bwd_inner_microstep: 245.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:04:10,089] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.80 | bwd_microstep: 245.71 | bwd_inner_microstep: 245.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:04:10,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 244.89 | bwd_inner_microstep: 244.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:04:10,953] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.86 | bwd_microstep: 244.35 | bwd_inner_microstep: 244.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:11,389] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.52 | bwd_microstep: 243.70 | bwd_inner_microstep: 243.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:04:11,823] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 244.51 | bwd_inner_microstep: 244.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:04:12,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.14 | bwd_microstep: 242.50 | bwd_inner_microstep: 242.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:04:12,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.40 | optimizer_gradients: 0.74 | optimizer_step: 3.22 +[2024-12-31 18:04:12,711] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 259.03 | bwd_inner_microstep: 245.29 | bwd_allreduce_microstep: 13.64 | step_microstep: 14.85 +[2024-12-31 18:04:12,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2861.96 | bwd: 4172.89 | bwd_inner: 4158.35 | bwd_allreduce: 13.94 | step: 17.59 + 52%|█████▏ | 392/759 [54:07<45:14, 7.40s/it] {'loss': 1.2177, 'learning_rate': 9.957315446824425e-06, 'epoch': 0.52} + 52%|█████▏ | 392/759 [54:07<45:14, 7.40s/it][2024-12-31 18:04:13,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 223.27 | bwd_microstep: 359.57 | bwd_inner_microstep: 359.34 | bwd_allreduce_microstep: 0.09 | step_microstep: 0.13 +[2024-12-31 18:04:13,811] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.75 | bwd_microstep: 293.41 | bwd_inner_microstep: 293.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:04:14,300] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.54 | bwd_microstep: 283.75 | bwd_inner_microstep: 283.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:14,811] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 226.29 | bwd_microstep: 267.34 | bwd_inner_microstep: 267.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:15,279] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.54 | bwd_microstep: 263.76 | bwd_inner_microstep: 263.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:15,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.41 | bwd_microstep: 256.19 | bwd_inner_microstep: 256.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:04:16,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.52 | bwd_microstep: 254.98 | bwd_inner_microstep: 254.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:04:16,624] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.87 | bwd_microstep: 249.19 | bwd_inner_microstep: 249.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:04:17,062] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.58 | bwd_microstep: 248.53 | bwd_inner_microstep: 248.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:04:17,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.43 | bwd_microstep: 247.67 | bwd_inner_microstep: 247.43 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.22 +[2024-12-31 18:04:17,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 243.37 | bwd_inner_microstep: 243.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:04:18,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.76 | bwd_microstep: 244.16 | bwd_inner_microstep: 244.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:18,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.45 | bwd_microstep: 244.02 | bwd_inner_microstep: 243.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:19,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.94 | bwd_microstep: 243.56 | bwd_inner_microstep: 243.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:19,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.71 | bwd_microstep: 245.20 | bwd_inner_microstep: 245.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:04:20,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.60 | optimizer_gradients: 0.77 | optimizer_step: 3.45 +[2024-12-31 18:04:20,106] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 257.87 | bwd_inner_microstep: 244.19 | bwd_allreduce_microstep: 13.57 | step_microstep: 11.35 +[2024-12-31 18:04:20,110] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2910.19 | bwd: 4202.67 | bwd_inner: 4188.10 | bwd_allreduce: 13.94 | step: 14.19 + 52%|█████▏ | 393/759 [54:15<45:07, 7.40s/it] {'loss': 1.2274, 'learning_rate': 9.91463167135261e-06, 'epoch': 0.52} + 52%|█████▏ | 393/759 [54:15<45:07, 7.40s/it][2024-12-31 18:04:20,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 224.82 | bwd_microstep: 313.24 | bwd_inner_microstep: 312.86 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.22 +[2024-12-31 18:04:21,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 194.09 | bwd_microstep: 292.56 | bwd_inner_microstep: 292.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:04:21,716] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.51 | bwd_microstep: 283.10 | bwd_inner_microstep: 283.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:04:22,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.69 | bwd_microstep: 274.42 | bwd_inner_microstep: 274.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:04:22,643] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.12 | bwd_microstep: 261.27 | bwd_inner_microstep: 261.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:04:23,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.76 | bwd_microstep: 255.30 | bwd_inner_microstep: 255.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:04:23,533] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.62 | bwd_microstep: 254.52 | bwd_inner_microstep: 254.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:04:23,970] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.94 | bwd_microstep: 247.23 | bwd_inner_microstep: 247.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:24,403] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.89 | bwd_microstep: 244.78 | bwd_inner_microstep: 244.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:04:24,842] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.71 | bwd_microstep: 245.33 | bwd_inner_microstep: 245.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:25,583] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 270.61 | bwd_microstep: 452.62 | bwd_inner_microstep: 452.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:04:26,016] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 244.59 | bwd_inner_microstep: 244.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:26,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.70 | bwd_microstep: 251.19 | bwd_inner_microstep: 251.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:26,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.04 | bwd_microstep: 242.45 | bwd_inner_microstep: 242.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:27,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.18 | bwd_microstep: 244.07 | bwd_inner_microstep: 244.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:27,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.66 | optimizer_gradients: 0.72 | optimizer_step: 3.33 +[2024-12-31 18:04:27,770] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.24 | bwd_microstep: 256.62 | bwd_inner_microstep: 242.86 | bwd_allreduce_microstep: 13.61 | step_microstep: 11.25 +[2024-12-31 18:04:27,772] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2956.48 | bwd: 4363.46 | bwd_inner: 4348.81 | bwd_allreduce: 13.93 | step: 14.18 + 52%|█████▏ | 394/759 [54:22<45:28, 7.48s/it] {'loss': 1.2043, 'learning_rate': 9.871949451274137e-06, 'epoch': 0.52} + 52%|█████▏ | 394/759 [54:22<45:28, 7.48s/it][2024-12-31 18:04:28,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 199.51 | bwd_microstep: 304.20 | bwd_inner_microstep: 303.86 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:04:28,890] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 223.64 | bwd_microstep: 360.49 | bwd_inner_microstep: 360.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:04:29,351] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.52 | bwd_microstep: 263.32 | bwd_inner_microstep: 263.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:04:29,810] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.31 | bwd_microstep: 262.65 | bwd_inner_microstep: 262.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:30,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.85 | bwd_microstep: 266.70 | bwd_inner_microstep: 266.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:04:30,707] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.13 | bwd_microstep: 245.56 | bwd_inner_microstep: 245.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:04:31,143] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.88 | bwd_microstep: 247.81 | bwd_inner_microstep: 247.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:31,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.81 | bwd_microstep: 248.37 | bwd_inner_microstep: 248.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:32,020] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.36 | bwd_microstep: 245.80 | bwd_inner_microstep: 245.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:32,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.50 | bwd_microstep: 248.18 | bwd_inner_microstep: 248.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:04:32,918] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.86 | bwd_microstep: 253.22 | bwd_inner_microstep: 253.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:04:33,350] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.01 | bwd_microstep: 244.66 | bwd_inner_microstep: 244.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:04:33,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 268.30 | bwd_inner_microstep: 268.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:04:34,238] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.72 | bwd_microstep: 244.59 | bwd_inner_microstep: 244.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:34,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.03 | bwd_microstep: 246.37 | bwd_inner_microstep: 246.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:35,207] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.99 | optimizer_gradients: 0.56 | optimizer_step: 3.10 +[2024-12-31 18:04:35,207] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.75 | bwd_microstep: 336.70 | bwd_inner_microstep: 243.44 | bwd_allreduce_microstep: 93.21 | step_microstep: 11.05 +[2024-12-31 18:04:35,208] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2843.46 | bwd: 4286.95 | bwd_inner: 4192.98 | bwd_allreduce: 93.46 | step: 14.18 + 52%|█████▏ | 395/759 [54:30<45:16, 7.46s/it] {'loss': 1.2204, 'learning_rate': 9.829269564250254e-06, 'epoch': 0.52} + 52%|█████▏ | 395/759 [54:30<45:16, 7.46s/it][2024-12-31 18:04:35,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 217.97 | bwd_microstep: 349.32 | bwd_inner_microstep: 348.95 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:04:36,283] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.35 | bwd_microstep: 284.86 | bwd_inner_microstep: 284.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:04:36,740] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.62 | bwd_microstep: 256.86 | bwd_inner_microstep: 256.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:04:37,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.83 | bwd_microstep: 255.32 | bwd_inner_microstep: 255.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.29 +[2024-12-31 18:04:37,629] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.29 | bwd_microstep: 247.52 | bwd_inner_microstep: 247.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:04:38,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 245.14 | bwd_inner_microstep: 245.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.11 +[2024-12-31 18:04:38,502] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.67 | bwd_microstep: 250.76 | bwd_inner_microstep: 250.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:04:38,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.58 | bwd_microstep: 246.98 | bwd_inner_microstep: 246.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:39,398] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.72 | bwd_microstep: 243.94 | bwd_inner_microstep: 243.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:39,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.90 | bwd_microstep: 244.39 | bwd_inner_microstep: 244.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:40,265] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.53 | bwd_microstep: 244.19 | bwd_inner_microstep: 244.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:04:40,697] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 242.22 | bwd_inner_microstep: 242.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:41,138] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.34 | bwd_microstep: 245.03 | bwd_inner_microstep: 245.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:04:41,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 244.79 | bwd_inner_microstep: 244.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:04:41,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.74 | bwd_microstep: 240.97 | bwd_inner_microstep: 240.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:42,445] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.70 | optimizer_gradients: 0.76 | optimizer_step: 3.31 +[2024-12-31 18:04:42,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.09 | bwd_microstep: 254.98 | bwd_inner_microstep: 241.36 | bwd_allreduce_microstep: 13.53 | step_microstep: 11.51 +[2024-12-31 18:04:42,449] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2843.40 | bwd: 4097.39 | bwd_inner: 4082.94 | bwd_allreduce: 13.80 | step: 14.43 + 52%|█████▏ | 396/759 [54:37<44:45, 7.40s/it] {'loss': 1.2835, 'learning_rate': 9.786592787899707e-06, 'epoch': 0.52} + 52%|█████▏ | 396/759 [54:37<44:45, 7.40s/it][2024-12-31 18:04:43,012] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.11 | bwd_microstep: 335.74 | bwd_inner_microstep: 335.39 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:04:43,534] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 216.13 | bwd_microstep: 288.94 | bwd_inner_microstep: 288.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:04:44,013] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.78 | bwd_microstep: 278.15 | bwd_inner_microstep: 278.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:04:44,481] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.33 | bwd_microstep: 268.48 | bwd_inner_microstep: 268.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 18:04:44,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.91 | bwd_microstep: 249.85 | bwd_inner_microstep: 249.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:04:45,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.90 | bwd_microstep: 249.27 | bwd_inner_microstep: 249.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:45,798] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.12 | bwd_microstep: 249.07 | bwd_inner_microstep: 249.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:04:46,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.09 | bwd_microstep: 283.98 | bwd_inner_microstep: 283.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:04:46,707] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.36 | bwd_microstep: 245.96 | bwd_inner_microstep: 245.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:04:47,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.46 | bwd_microstep: 246.30 | bwd_inner_microstep: 246.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:47,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 244.56 | bwd_inner_microstep: 244.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:04:48,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.70 | bwd_microstep: 244.04 | bwd_inner_microstep: 244.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:48,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.58 | bwd_microstep: 243.11 | bwd_inner_microstep: 243.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:48,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.15 | bwd_microstep: 243.09 | bwd_inner_microstep: 243.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:49,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 157.86 | bwd_microstep: 258.52 | bwd_inner_microstep: 258.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:04:49,909] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.60 | optimizer_gradients: 0.62 | optimizer_step: 3.11 +[2024-12-31 18:04:49,910] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.10 | bwd_microstep: 410.13 | bwd_inner_microstep: 297.17 | bwd_allreduce_microstep: 112.91 | step_microstep: 11.40 +[2024-12-31 18:04:49,911] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2830.94 | bwd: 4339.24 | bwd_inner: 4225.57 | bwd_allreduce: 113.16 | step: 14.41 + 52%|█████▏ | 397/759 [54:44<44:44, 7.42s/it] {'loss': 1.2021, 'learning_rate': 9.743919899784555e-06, 'epoch': 0.52} + 52%|█████▏ | 397/759 [54:44<44:44, 7.42s/it][2024-12-31 18:04:50,502] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 221.88 | bwd_microstep: 353.38 | bwd_inner_microstep: 353.04 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:04:51,019] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.79 | bwd_microstep: 301.12 | bwd_inner_microstep: 301.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:04:51,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.34 | bwd_microstep: 290.63 | bwd_inner_microstep: 290.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:04:51,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.16 | bwd_microstep: 268.49 | bwd_inner_microstep: 268.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:52,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.75 | bwd_microstep: 279.35 | bwd_inner_microstep: 279.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:52,919] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.81 | bwd_microstep: 259.61 | bwd_inner_microstep: 259.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:04:53,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.92 | bwd_microstep: 256.30 | bwd_inner_microstep: 256.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:04:53,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.36 | bwd_microstep: 247.34 | bwd_inner_microstep: 247.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:54,238] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.37 | bwd_microstep: 247.27 | bwd_inner_microstep: 247.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:04:54,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.18 | bwd_microstep: 257.04 | bwd_inner_microstep: 257.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 18:04:55,114] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.56 | bwd_microstep: 243.46 | bwd_inner_microstep: 243.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:55,546] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.70 | bwd_microstep: 243.33 | bwd_inner_microstep: 243.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:55,977] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.58 | bwd_microstep: 243.50 | bwd_inner_microstep: 243.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:56,418] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.25 | bwd_microstep: 243.03 | bwd_inner_microstep: 242.75 | bwd_allreduce_microstep: 0.12 | step_microstep: 0.29 +[2024-12-31 18:04:56,854] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.92 | bwd_microstep: 243.11 | bwd_inner_microstep: 243.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:04:57,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.55 | optimizer_gradients: 0.89 | optimizer_step: 3.40 +[2024-12-31 18:04:57,310] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.92 | bwd_microstep: 257.05 | bwd_inner_microstep: 243.34 | bwd_allreduce_microstep: 13.59 | step_microstep: 11.99 +[2024-12-31 18:04:57,312] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2863.91 | bwd: 4234.17 | bwd_inner: 4219.38 | bwd_allreduce: 14.05 | step: 14.95 + 52%|█████▏ | 398/759 [54:52<44:35, 7.41s/it] {'loss': 1.2067, 'learning_rate': 9.701251677396021e-06, 'epoch': 0.52} + 52%|█████▏ | 398/759 [54:52<44:35, 7.41s/it][2024-12-31 18:04:57,933] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 230.64 | bwd_microstep: 370.46 | bwd_inner_microstep: 370.11 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:04:58,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 252.06 | bwd_microstep: 416.46 | bwd_inner_microstep: 416.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:04:59,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.19 | bwd_microstep: 287.01 | bwd_inner_microstep: 286.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:04:59,864] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 273.39 | bwd_microstep: 466.22 | bwd_inner_microstep: 466.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:05:00,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.75 | bwd_microstep: 254.78 | bwd_inner_microstep: 254.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:05:00,749] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.96 | bwd_microstep: 248.52 | bwd_inner_microstep: 248.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:05:01,184] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 250.15 | bwd_inner_microstep: 250.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:05:01,625] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.24 | bwd_microstep: 252.10 | bwd_inner_microstep: 252.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:05:02,074] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.01 | bwd_microstep: 260.38 | bwd_inner_microstep: 260.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:05:02,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.48 | bwd_microstep: 244.59 | bwd_inner_microstep: 244.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:05:02,933] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.19 | bwd_microstep: 244.65 | bwd_inner_microstep: 244.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:05:03,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.88 | bwd_microstep: 245.65 | bwd_inner_microstep: 245.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:05:03,798] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.06 | bwd_microstep: 243.04 | bwd_inner_microstep: 243.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:05:04,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.88 | bwd_microstep: 243.25 | bwd_inner_microstep: 243.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:05:04,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.87 | bwd_microstep: 240.81 | bwd_inner_microstep: 240.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:05:05,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.58 | optimizer_gradients: 0.70 | optimizer_step: 3.26 +[2024-12-31 18:05:05,109] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.06 | bwd_microstep: 256.36 | bwd_inner_microstep: 242.67 | bwd_allreduce_microstep: 13.57 | step_microstep: 10.97 +[2024-12-31 18:05:05,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2986.59 | bwd: 4524.55 | bwd_inner: 4510.06 | bwd_allreduce: 13.85 | step: 13.75 + 53%|█████▎ | 399/759 [55:00<45:10, 7.53s/it] {'loss': 1.2337, 'learning_rate': 9.658588898140322e-06, 'epoch': 0.53} + 53%|█████▎ | 399/759 [55:00<45:10, 7.53s/it][2024-12-31 18:05:05,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 234.41 | bwd_microstep: 382.14 | bwd_inner_microstep: 381.79 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:05:06,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.90 | bwd_microstep: 297.05 | bwd_inner_microstep: 297.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:05:06,723] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.73 | bwd_microstep: 269.57 | bwd_inner_microstep: 269.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:05:07,176] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.25 | bwd_microstep: 261.19 | bwd_inner_microstep: 261.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:05:07,605] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.43 | bwd_microstep: 249.80 | bwd_inner_microstep: 249.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:05:08,036] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.96 | bwd_microstep: 250.04 | bwd_inner_microstep: 250.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:05:08,474] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.31 | bwd_microstep: 256.01 | bwd_inner_microstep: 255.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:05:08,905] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.90 | bwd_microstep: 244.92 | bwd_inner_microstep: 244.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:05:09,340] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 247.32 | bwd_inner_microstep: 247.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:05:09,772] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.69 | bwd_microstep: 244.84 | bwd_inner_microstep: 244.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:05:10,206] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.84 | bwd_microstep: 246.43 | bwd_inner_microstep: 246.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:05:10,641] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.06 | bwd_microstep: 244.66 | bwd_inner_microstep: 244.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:05:11,072] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.54 | bwd_microstep: 242.97 | bwd_inner_microstep: 242.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:05:11,502] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.21 | bwd_microstep: 243.63 | bwd_inner_microstep: 243.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:05:11,961] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.94 | bwd_microstep: 270.49 | bwd_inner_microstep: 270.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:05:12,416] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.71 | optimizer_gradients: 0.77 | optimizer_step: 3.32 +[2024-12-31 18:05:12,416] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 257.92 | bwd_inner_microstep: 243.69 | bwd_allreduce_microstep: 14.14 | step_microstep: 11.72 +[2024-12-31 18:05:12,418] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2841.34 | bwd: 4209.04 | bwd_inner: 4194.08 | bwd_allreduce: 14.40 | step: 14.08 + 53%|█████▎ | 400/759 [55:07<44:38, 7.46s/it] {'loss': 1.2177, 'learning_rate': 9.615932339324497e-06, 'epoch': 0.53} + 53%|█████▎ | 400/759 [55:07<44:38, 7.46s/it][INFO|trainer.py:2936] 2024-12-31 18:05:13,633 >> Saving model checkpoint to work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-400 +[INFO|configuration_utils.py:473] 2024-12-31 18:05:13,656 >> Configuration saved in work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-400/config.json +[INFO|configuration_utils.py:594] 2024-12-31 18:05:13,659 >> Configuration saved in work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-400/generation_config.json +[INFO|modeling_utils.py:2493] 2024-12-31 18:07:25,322 >> Model weights saved in work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-400/model.safetensors +[INFO|tokenization_utils_base.py:2433] 2024-12-31 18:07:25,952 >> tokenizer config file saved in work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-400/tokenizer_config.json +[INFO|tokenization_utils_base.py:2442] 2024-12-31 18:07:26,170 >> Special tokens file saved in work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-400/special_tokens_map.json +[INFO|tokenization_utils_base.py:2493] 2024-12-31 18:07:26,226 >> added tokens file saved in work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-400/added_tokens.json +[2024-12-31 18:07:46,241] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step400 is about to be saved! +[2024-12-31 18:07:46,288] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-400/global_step400/mp_rank_00_model_states.pt +[2024-12-31 18:07:46,288] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-400/global_step400/mp_rank_00_model_states.pt... +[2024-12-31 18:07:51,113] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-400/global_step400/mp_rank_00_model_states.pt. +[2024-12-31 18:07:51,125] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-400/global_step400/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2024-12-31 18:07:52,599] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-400/global_step400/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2024-12-31 18:07:52,666] [INFO] [engine.py:3488:_save_zero_checkpoint] zero checkpoint saved work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-400/global_step400/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2024-12-31 18:07:52,666] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step400 is ready now! +[INFO|trainer.py:3028] 2024-12-31 18:07:52,799 >> Deleting older checkpoint [work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/checkpoint-200] due to args.save_total_limit +[2024-12-31 18:07:53,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 288.68 | bwd_microstep: 343.71 | bwd_inner_microstep: 343.35 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.21 +[2024-12-31 18:07:54,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.49 | bwd_microstep: 291.45 | bwd_inner_microstep: 291.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:07:54,584] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.76 | bwd_microstep: 281.76 | bwd_inner_microstep: 281.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:07:55,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.93 | bwd_microstep: 261.28 | bwd_inner_microstep: 261.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:07:55,482] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.29 | bwd_microstep: 248.60 | bwd_inner_microstep: 248.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:07:55,916] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.98 | bwd_microstep: 247.04 | bwd_inner_microstep: 247.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 18:07:56,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.61 | bwd_microstep: 244.33 | bwd_inner_microstep: 244.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:07:56,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.48 | bwd_microstep: 243.26 | bwd_inner_microstep: 243.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.09 +[2024-12-31 18:08:35,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 196.01 | bwd_microstep: 244.79 | bwd_inner_microstep: 244.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:08:57,389] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 200.20 | bwd_microstep: 251.01 | bwd_inner_microstep: 250.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:08:58,009] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.20 | bwd_microstep: 244.99 | bwd_inner_microstep: 244.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:08:58,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.66 | bwd_microstep: 243.18 | bwd_inner_microstep: 243.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:09:08,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 204.16 | bwd_microstep: 248.18 | bwd_inner_microstep: 248.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:09:08,726] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.19 | bwd_microstep: 244.16 | bwd_inner_microstep: 244.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:09,158] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.66 | bwd_microstep: 243.58 | bwd_inner_microstep: 243.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.42 +[2024-12-31 18:09:09,611] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.91 | optimizer_step: 3.69 +[2024-12-31 18:09:09,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.40 | bwd_microstep: 255.51 | bwd_inner_microstep: 241.59 | bwd_allreduce_microstep: 13.78 | step_microstep: 12.62 +[2024-12-31 18:09:09,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2981.09 | bwd: 4137.02 | bwd_inner: 4122.18 | bwd_allreduce: 14.08 | step: 15.64 + 53%|█████▎ | 401/759 [59:04<7:35:44, 76.38s/it] {'loss': 1.2336, 'learning_rate': 9.573282778142246e-06, 'epoch': 0.53} + 53%|█████▎ | 401/759 [59:04<7:35:44, 76.38s/it][2024-12-31 18:09:10,137] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 198.06 | bwd_microstep: 298.52 | bwd_inner_microstep: 298.16 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:09:10,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 228.90 | bwd_microstep: 374.31 | bwd_inner_microstep: 374.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:09:11,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.78 | bwd_microstep: 262.85 | bwd_inner_microstep: 262.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:09:11,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.42 | bwd_microstep: 256.60 | bwd_inner_microstep: 256.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:09:12,106] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.89 | bwd_microstep: 245.90 | bwd_inner_microstep: 245.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:09:12,541] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 246.55 | bwd_inner_microstep: 246.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:12,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.83 | bwd_microstep: 246.33 | bwd_inner_microstep: 246.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:13,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.48 | bwd_microstep: 245.70 | bwd_inner_microstep: 245.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:09:13,841] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 243.61 | bwd_inner_microstep: 243.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:09:14,275] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.62 | bwd_microstep: 244.84 | bwd_inner_microstep: 244.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:14,707] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.36 | bwd_microstep: 246.58 | bwd_inner_microstep: 246.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 18:09:15,137] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.86 | bwd_microstep: 243.21 | bwd_inner_microstep: 243.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:15,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.99 | bwd_microstep: 243.54 | bwd_inner_microstep: 243.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:09:15,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.58 | bwd_microstep: 242.38 | bwd_inner_microstep: 242.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:16,425] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.85 | bwd_microstep: 243.75 | bwd_inner_microstep: 243.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:09:16,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.66 | optimizer_gradients: 0.84 | optimizer_step: 3.30 +[2024-12-31 18:09:16,866] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.47 | bwd_microstep: 254.80 | bwd_inner_microstep: 241.09 | bwd_allreduce_microstep: 13.58 | step_microstep: 10.99 +[2024-12-31 18:09:16,868] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2817.24 | bwd: 4139.63 | bwd_inner: 4125.08 | bwd_allreduce: 13.87 | step: 13.92 + 53%|█████▎ | 402/759 [59:11<5:31:04, 55.64s/it] {'loss': 1.227, 'learning_rate': 9.530640991659785e-06, 'epoch': 0.53} + 53%|█████▎ | 402/759 [59:11<5:31:04, 55.64s/it][2024-12-31 18:09:17,548] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 249.48 | bwd_microstep: 412.94 | bwd_inner_microstep: 412.58 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:09:18,056] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.55 | bwd_microstep: 288.37 | bwd_inner_microstep: 288.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:18,523] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.63 | bwd_microstep: 267.62 | bwd_inner_microstep: 267.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:09:18,980] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.33 | bwd_microstep: 261.28 | bwd_inner_microstep: 261.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:19,418] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.87 | bwd_microstep: 249.50 | bwd_inner_microstep: 249.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:09:19,857] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.94 | bwd_microstep: 248.42 | bwd_inner_microstep: 248.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:20,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.05 | bwd_microstep: 248.70 | bwd_inner_microstep: 248.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:20,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.19 | bwd_microstep: 246.42 | bwd_inner_microstep: 246.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:09:21,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.42 | bwd_microstep: 246.55 | bwd_inner_microstep: 246.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:09:21,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.47 | bwd_microstep: 244.29 | bwd_inner_microstep: 244.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:09:22,012] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.10 | bwd_microstep: 245.54 | bwd_inner_microstep: 245.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:09:22,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.51 | bwd_microstep: 245.50 | bwd_inner_microstep: 245.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:09:22,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.26 | bwd_microstep: 241.90 | bwd_inner_microstep: 241.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:09:23,276] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.26 | bwd_microstep: 243.73 | bwd_inner_microstep: 243.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:09:23,699] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.39 | bwd_microstep: 243.74 | bwd_inner_microstep: 243.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:09:24,279] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.26 | optimizer_gradients: 0.66 | optimizer_step: 3.13 +[2024-12-31 18:09:24,279] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.91 | bwd_microstep: 384.37 | bwd_inner_microstep: 241.93 | bwd_allreduce_microstep: 142.39 | step_microstep: 11.31 +[2024-12-31 18:09:24,280] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2850.90 | bwd: 4318.91 | bwd_inner: 4175.74 | bwd_allreduce: 142.64 | step: 13.59 + 53%|█████▎ | 403/759 [59:19<4:04:17, 41.17s/it] {'loss': 1.2225, 'learning_rate': 9.488007756801672e-06, 'epoch': 0.53} + 53%|█████▎ | 403/759 [59:19<4:04:17, 41.17s/it][2024-12-31 18:09:24,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 251.84 | bwd_microstep: 411.36 | bwd_inner_microstep: 411.02 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:09:25,450] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.16 | bwd_microstep: 285.41 | bwd_inner_microstep: 285.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:25,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.81 | bwd_microstep: 269.29 | bwd_inner_microstep: 269.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:09:26,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.39 | bwd_microstep: 266.67 | bwd_inner_microstep: 266.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:09:26,844] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.07 | bwd_microstep: 262.18 | bwd_inner_microstep: 262.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:09:27,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.06 | bwd_microstep: 255.77 | bwd_inner_microstep: 255.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:09:27,747] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 184.25 | bwd_microstep: 255.75 | bwd_inner_microstep: 255.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:09:28,191] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.32 | bwd_microstep: 253.08 | bwd_inner_microstep: 253.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:28,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 247.87 | bwd_inner_microstep: 247.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:29,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.34 | bwd_microstep: 246.17 | bwd_inner_microstep: 246.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:09:29,493] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 243.88 | bwd_inner_microstep: 243.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:09:29,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.89 | bwd_microstep: 243.17 | bwd_inner_microstep: 243.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:09:30,355] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 244.21 | bwd_inner_microstep: 244.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:09:30,777] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.35 | bwd_microstep: 243.54 | bwd_inner_microstep: 243.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:09:31,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.91 | bwd_microstep: 241.33 | bwd_inner_microstep: 241.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:31,795] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.17 | optimizer_gradients: 0.59 | optimizer_step: 3.10 +[2024-12-31 18:09:31,795] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.84 | bwd_microstep: 395.18 | bwd_inner_microstep: 242.34 | bwd_allreduce_microstep: 152.79 | step_microstep: 11.04 +[2024-12-31 18:09:31,796] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2871.13 | bwd: 4364.93 | bwd_inner: 4211.36 | bwd_allreduce: 153.04 | step: 13.81 + 53%|█████▎ | 404/759 [59:26<3:03:52, 31.08s/it] {'loss': 1.2252, 'learning_rate': 9.445383850336648e-06, 'epoch': 0.53} + 53%|█████▎ | 404/759 [59:26<3:03:52, 31.08s/it][2024-12-31 18:09:32,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 228.52 | bwd_microstep: 375.94 | bwd_inner_microstep: 375.59 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:09:32,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.81 | bwd_microstep: 296.84 | bwd_inner_microstep: 296.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:09:33,419] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.47 | bwd_microstep: 284.49 | bwd_inner_microstep: 284.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:09:33,878] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.60 | bwd_microstep: 264.35 | bwd_inner_microstep: 264.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:34,330] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.48 | bwd_microstep: 256.63 | bwd_inner_microstep: 256.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:09:34,767] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.68 | bwd_microstep: 248.23 | bwd_inner_microstep: 248.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:09:35,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.03 | bwd_microstep: 247.16 | bwd_inner_microstep: 247.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:09:35,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.50 | bwd_microstep: 247.58 | bwd_inner_microstep: 247.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:09:36,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.09 | bwd_microstep: 248.17 | bwd_inner_microstep: 248.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:09:36,512] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 245.67 | bwd_inner_microstep: 245.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:09:36,947] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.36 | bwd_microstep: 245.54 | bwd_inner_microstep: 245.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:37,374] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.64 | bwd_microstep: 241.55 | bwd_inner_microstep: 241.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:09:37,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.39 | bwd_microstep: 242.50 | bwd_inner_microstep: 242.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:38,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.89 | bwd_microstep: 241.99 | bwd_inner_microstep: 241.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:38,657] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.90 | bwd_microstep: 240.88 | bwd_inner_microstep: 240.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:39,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.29 | optimizer_gradients: 0.60 | optimizer_step: 3.09 +[2024-12-31 18:09:39,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.14 | bwd_microstep: 404.11 | bwd_inner_microstep: 245.10 | bwd_allreduce_microstep: 158.96 | step_microstep: 10.88 +[2024-12-31 18:09:39,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2842.08 | bwd: 4331.72 | bwd_inner: 4171.89 | bwd_allreduce: 159.23 | step: 14.06 + 53%|█████▎ | 405/759 [59:34<2:21:33, 23.99s/it] {'loss': 1.1976, 'learning_rate': 9.402770048863502e-06, 'epoch': 0.53} + 53%|█████▎ | 405/759 [59:34<2:21:33, 23.99s/it][2024-12-31 18:09:39,858] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 223.65 | bwd_microstep: 360.89 | bwd_inner_microstep: 360.46 | bwd_allreduce_microstep: 0.17 | step_microstep: 0.23 +[2024-12-31 18:09:40,406] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 211.11 | bwd_microstep: 318.51 | bwd_inner_microstep: 318.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:09:40,866] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.94 | bwd_microstep: 263.16 | bwd_inner_microstep: 263.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:41,325] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.67 | bwd_microstep: 257.12 | bwd_inner_microstep: 257.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:09:41,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.03 | bwd_microstep: 249.35 | bwd_inner_microstep: 249.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:09:42,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.01 | bwd_microstep: 252.88 | bwd_inner_microstep: 252.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:09:42,648] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.99 | bwd_microstep: 245.43 | bwd_inner_microstep: 245.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:09:43,081] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.29 | bwd_microstep: 244.94 | bwd_inner_microstep: 244.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:43,514] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.97 | bwd_microstep: 245.16 | bwd_inner_microstep: 245.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:09:43,948] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.85 | bwd_microstep: 244.83 | bwd_inner_microstep: 244.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:09:44,383] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.41 | bwd_microstep: 248.33 | bwd_inner_microstep: 248.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:09:44,815] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.85 | bwd_microstep: 243.84 | bwd_inner_microstep: 243.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:45,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.54 | bwd_microstep: 247.24 | bwd_inner_microstep: 246.81 | bwd_allreduce_microstep: 0.25 | step_microstep: 0.28 +[2024-12-31 18:09:45,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.34 | bwd_microstep: 243.11 | bwd_inner_microstep: 243.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:46,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.25 | bwd_microstep: 241.17 | bwd_inner_microstep: 241.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:46,562] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.75 | optimizer_gradients: 0.72 | optimizer_step: 3.37 +[2024-12-31 18:09:46,563] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 257.71 | bwd_inner_microstep: 244.03 | bwd_allreduce_microstep: 13.59 | step_microstep: 11.27 +[2024-12-31 18:09:46,565] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2842.06 | bwd: 4163.84 | bwd_inner: 4148.84 | bwd_allreduce: 14.12 | step: 14.48 + 53%|█████▎ | 406/759 [59:41<1:51:42, 18.99s/it] {'loss': 1.2518, 'learning_rate': 9.360167128796913e-06, 'epoch': 0.53} + 53%|█████▎ | 406/759 [59:41<1:51:42, 18.99s/it][2024-12-31 18:09:47,199] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 234.66 | bwd_microstep: 385.44 | bwd_inner_microstep: 384.97 | bwd_allreduce_microstep: 0.18 | step_microstep: 0.23 +[2024-12-31 18:09:47,724] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 206.74 | bwd_microstep: 300.08 | bwd_inner_microstep: 300.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:48,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.44 | bwd_microstep: 281.46 | bwd_inner_microstep: 281.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:09:48,678] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.59 | bwd_microstep: 267.33 | bwd_inner_microstep: 267.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:49,136] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.15 | bwd_microstep: 261.83 | bwd_inner_microstep: 261.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:49,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.97 | bwd_microstep: 248.96 | bwd_inner_microstep: 248.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:09:50,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.32 | bwd_microstep: 247.96 | bwd_inner_microstep: 247.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:50,450] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.02 | bwd_microstep: 247.58 | bwd_inner_microstep: 247.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:09:50,888] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.24 | bwd_microstep: 249.52 | bwd_inner_microstep: 249.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:51,323] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.91 | bwd_microstep: 246.36 | bwd_inner_microstep: 246.05 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:09:51,768] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.59 | bwd_microstep: 244.82 | bwd_inner_microstep: 244.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:52,200] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.08 | bwd_microstep: 244.04 | bwd_inner_microstep: 244.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:09:52,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.92 | bwd_microstep: 244.91 | bwd_inner_microstep: 244.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:09:53,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 243.99 | bwd_inner_microstep: 243.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:09:53,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.70 | bwd_microstep: 246.35 | bwd_inner_microstep: 246.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:09:54,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.85 | optimizer_gradients: 0.61 | optimizer_step: 3.08 +[2024-12-31 18:09:54,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.36 | bwd_microstep: 539.32 | bwd_inner_microstep: 246.24 | bwd_allreduce_microstep: 293.03 | step_microstep: 10.61 +[2024-12-31 18:09:54,237] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2885.39 | bwd: 4500.05 | bwd_inner: 4205.77 | bwd_allreduce: 293.50 | step: 13.61 + 54%|█████▎ | 407/759 [59:49<1:31:28, 15.59s/it] {'loss': 1.2183, 'learning_rate': 9.317575866353293e-06, 'epoch': 0.54} + 54%|█████▎ | 407/759 [59:49<1:31:28, 15.59s/it][2024-12-31 18:09:54,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 200.65 | bwd_microstep: 310.54 | bwd_inner_microstep: 310.17 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 18:09:55,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.95 | bwd_microstep: 288.98 | bwd_inner_microstep: 288.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:09:55,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.81 | bwd_microstep: 265.07 | bwd_inner_microstep: 265.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:56,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.86 | bwd_microstep: 258.52 | bwd_inner_microstep: 258.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:56,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.22 | bwd_microstep: 249.08 | bwd_inner_microstep: 249.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:09:57,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.62 | bwd_microstep: 246.79 | bwd_inner_microstep: 246.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:09:57,491] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.94 | bwd_microstep: 261.79 | bwd_inner_microstep: 261.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:09:57,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 245.42 | bwd_inner_microstep: 245.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:09:58,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 244.83 | bwd_inner_microstep: 244.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:09:58,790] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.03 | bwd_microstep: 244.42 | bwd_inner_microstep: 244.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:09:59,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.74 | bwd_microstep: 245.27 | bwd_inner_microstep: 245.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:09:59,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.36 | bwd_microstep: 243.28 | bwd_inner_microstep: 243.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:10:00,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.64 | bwd_microstep: 244.29 | bwd_inner_microstep: 244.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:00,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.31 | bwd_microstep: 248.84 | bwd_inner_microstep: 248.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:00,959] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.04 | bwd_microstep: 248.45 | bwd_inner_microstep: 248.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:10:01,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.62 | optimizer_gradients: 0.88 | optimizer_step: 3.15 +[2024-12-31 18:10:01,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.25 | bwd_microstep: 353.79 | bwd_inner_microstep: 244.58 | bwd_allreduce_microstep: 109.16 | step_microstep: 10.56 +[2024-12-31 18:10:01,519] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2789.97 | bwd: 4199.47 | bwd_inner: 4089.46 | bwd_allreduce: 109.43 | step: 13.62 + 54%|█████▍ | 408/759 [59:56<1:16:37, 13.10s/it] {'loss': 1.2364, 'learning_rate': 9.274997037536663e-06, 'epoch': 0.54} + 54%|█████▍ | 408/759 [59:56<1:16:37, 13.10s/it][2024-12-31 18:10:02,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.18 | bwd_microstep: 335.86 | bwd_inner_microstep: 335.43 | bwd_allreduce_microstep: 0.17 | step_microstep: 0.22 +[2024-12-31 18:10:02,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.18 | bwd_microstep: 286.45 | bwd_inner_microstep: 286.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:03,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.83 | bwd_microstep: 267.94 | bwd_inner_microstep: 267.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:10:03,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.18 | bwd_microstep: 256.81 | bwd_inner_microstep: 256.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:03,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.71 | bwd_microstep: 248.32 | bwd_inner_microstep: 248.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:10:04,384] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.63 | bwd_microstep: 248.76 | bwd_inner_microstep: 248.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:10:04,812] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.83 | bwd_microstep: 247.93 | bwd_inner_microstep: 247.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:10:05,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.62 | bwd_microstep: 248.55 | bwd_inner_microstep: 248.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:05,685] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.43 | bwd_microstep: 247.78 | bwd_inner_microstep: 247.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:06,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.84 | bwd_microstep: 245.83 | bwd_inner_microstep: 245.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:10:06,556] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.81 | bwd_microstep: 244.92 | bwd_inner_microstep: 244.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:06,991] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.56 | bwd_microstep: 245.03 | bwd_inner_microstep: 244.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:07,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.17 | bwd_microstep: 244.38 | bwd_inner_microstep: 244.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:07,861] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.96 | bwd_microstep: 243.08 | bwd_inner_microstep: 243.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:08,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.66 | bwd_microstep: 241.99 | bwd_inner_microstep: 241.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:08,761] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 8.09 | optimizer_gradients: 0.66 | optimizer_step: 3.15 +[2024-12-31 18:10:08,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.30 | bwd_microstep: 268.76 | bwd_inner_microstep: 248.54 | bwd_allreduce_microstep: 20.13 | step_microstep: 14.32 +[2024-12-31 18:10:08,763] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2836.32 | bwd: 4122.61 | bwd_inner: 4101.30 | bwd_allreduce: 20.46 | step: 17.19 + 54%|█████▍ | 409/759 [1:00:03<1:06:09, 11.34s/it] {'loss': 1.2687, 'learning_rate': 9.232431418124507e-06, 'epoch': 0.54} + 54%|█████▍ | 409/759 [1:00:03<1:06:09, 11.34s/it][2024-12-31 18:10:09,378] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 228.36 | bwd_microstep: 367.56 | bwd_inner_microstep: 367.16 | bwd_allreduce_microstep: 0.17 | step_microstep: 0.20 +[2024-12-31 18:10:09,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 204.94 | bwd_microstep: 293.25 | bwd_inner_microstep: 293.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:10,390] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.31 | bwd_microstep: 288.69 | bwd_inner_microstep: 288.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:10,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.08 | bwd_microstep: 264.87 | bwd_inner_microstep: 264.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:10:11,318] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.39 | bwd_microstep: 262.11 | bwd_inner_microstep: 262.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:11,761] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.39 | bwd_microstep: 252.14 | bwd_inner_microstep: 251.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.29 +[2024-12-31 18:10:12,209] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.32 | bwd_microstep: 255.93 | bwd_inner_microstep: 255.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:12,653] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.69 | bwd_microstep: 246.24 | bwd_inner_microstep: 246.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:13,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.75 | bwd_microstep: 246.12 | bwd_inner_microstep: 246.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:13,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.22 | bwd_microstep: 247.21 | bwd_inner_microstep: 247.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:13,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.20 | bwd_microstep: 244.47 | bwd_inner_microstep: 244.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:10:14,390] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.45 | bwd_microstep: 244.31 | bwd_inner_microstep: 244.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:10:14,820] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.28 | bwd_microstep: 243.25 | bwd_inner_microstep: 243.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:15,252] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.24 | bwd_microstep: 243.44 | bwd_inner_microstep: 243.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:10:15,685] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.97 | bwd_microstep: 243.78 | bwd_inner_microstep: 243.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:16,142] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.54 | optimizer_gradients: 0.70 | optimizer_step: 3.29 +[2024-12-31 18:10:16,143] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.63 | bwd_microstep: 259.59 | bwd_inner_microstep: 245.71 | bwd_allreduce_microstep: 13.75 | step_microstep: 10.87 +[2024-12-31 18:10:16,145] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2873.65 | bwd: 4203.09 | bwd_inner: 4188.23 | bwd_allreduce: 14.11 | step: 14.01 + 54%|█████▍ | 410/759 [1:00:11<59:04, 10.15s/it] {'loss': 1.2287, 'learning_rate': 9.189879783653633e-06, 'epoch': 0.54} + 54%|█████▍ | 410/759 [1:00:11<59:04, 10.15s/it][2024-12-31 18:10:16,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 218.50 | bwd_microstep: 343.80 | bwd_inner_microstep: 343.45 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:10:17,239] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.98 | bwd_microstep: 302.96 | bwd_inner_microstep: 302.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:17,727] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.78 | bwd_microstep: 281.54 | bwd_inner_microstep: 281.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:10:18,184] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.08 | bwd_microstep: 261.46 | bwd_inner_microstep: 261.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:18,634] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.96 | bwd_microstep: 256.03 | bwd_inner_microstep: 256.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:19,083] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.87 | bwd_microstep: 255.04 | bwd_inner_microstep: 255.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:19,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.25 | bwd_microstep: 248.50 | bwd_inner_microstep: 248.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:10:19,959] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.32 | bwd_microstep: 248.70 | bwd_inner_microstep: 248.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:20,397] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.90 | bwd_microstep: 247.66 | bwd_inner_microstep: 247.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:20,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.62 | bwd_microstep: 245.55 | bwd_inner_microstep: 245.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:21,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.44 | bwd_microstep: 244.73 | bwd_inner_microstep: 244.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:10:21,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.87 | bwd_microstep: 244.46 | bwd_inner_microstep: 244.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:22,123] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.41 | bwd_microstep: 241.36 | bwd_inner_microstep: 241.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:22,562] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 252.08 | bwd_inner_microstep: 252.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:22,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.30 | bwd_microstep: 240.45 | bwd_inner_microstep: 240.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:10:23,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.30 | optimizer_gradients: 0.60 | optimizer_step: 3.09 +[2024-12-31 18:10:23,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.26 | bwd_microstep: 414.95 | bwd_inner_microstep: 241.39 | bwd_allreduce_microstep: 173.51 | step_microstep: 11.22 +[2024-12-31 18:10:23,600] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2837.63 | bwd: 4329.37 | bwd_inner: 4154.88 | bwd_allreduce: 173.83 | step: 14.28 + 54%|█████▍ | 411/759 [1:00:18<54:11, 9.34s/it] {'loss': 1.2357, 'learning_rate': 9.14734290940604e-06, 'epoch': 0.54} + 54%|█████▍ | 411/759 [1:00:18<54:11, 9.34s/it][2024-12-31 18:10:24,199] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 226.07 | bwd_microstep: 358.62 | bwd_inner_microstep: 358.27 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:10:24,698] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.89 | bwd_microstep: 288.30 | bwd_inner_microstep: 288.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:10:25,206] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 200.52 | bwd_microstep: 290.57 | bwd_inner_microstep: 290.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:10:25,663] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.59 | bwd_microstep: 261.95 | bwd_inner_microstep: 261.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:10:26,122] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.57 | bwd_microstep: 262.77 | bwd_inner_microstep: 262.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:26,561] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 249.71 | bwd_inner_microstep: 249.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:27,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.55 | bwd_microstep: 249.65 | bwd_inner_microstep: 249.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.28 +[2024-12-31 18:10:27,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.71 | bwd_microstep: 245.52 | bwd_inner_microstep: 245.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:10:27,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.06 | bwd_microstep: 243.96 | bwd_inner_microstep: 243.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:10:28,299] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.91 | bwd_microstep: 245.75 | bwd_inner_microstep: 245.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:10:28,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.08 | bwd_microstep: 245.25 | bwd_inner_microstep: 245.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:29,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.25 | bwd_microstep: 243.13 | bwd_inner_microstep: 243.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:29,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.33 | bwd_microstep: 243.21 | bwd_inner_microstep: 243.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:10:30,019] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.41 | bwd_microstep: 242.42 | bwd_inner_microstep: 242.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:30,453] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.11 | bwd_microstep: 241.18 | bwd_inner_microstep: 241.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:31,075] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.11 | optimizer_gradients: 0.59 | optimizer_step: 3.09 +[2024-12-31 18:10:31,075] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.09 | bwd_microstep: 427.43 | bwd_inner_microstep: 241.44 | bwd_allreduce_microstep: 185.95 | step_microstep: 10.94 +[2024-12-31 18:10:31,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2857.84 | bwd: 4339.48 | bwd_inner: 4152.74 | bwd_allreduce: 186.19 | step: 14.03 + 54%|█████▍ | 412/759 [1:00:26<50:48, 8.78s/it] {'loss': 1.2428, 'learning_rate': 9.104821570394811e-06, 'epoch': 0.54} + 54%|█████▍ | 412/759 [1:00:26<50:48, 8.78s/it][2024-12-31 18:10:31,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 223.35 | bwd_microstep: 358.14 | bwd_inner_microstep: 357.76 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.21 +[2024-12-31 18:10:32,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 290.42 | bwd_microstep: 504.17 | bwd_inner_microstep: 504.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:10:32,952] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.07 | bwd_microstep: 262.38 | bwd_inner_microstep: 262.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:10:33,420] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.33 | bwd_microstep: 268.12 | bwd_inner_microstep: 268.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:33,861] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.52 | bwd_microstep: 250.84 | bwd_inner_microstep: 250.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:10:34,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.12 | bwd_microstep: 247.56 | bwd_inner_microstep: 247.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:34,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.85 | bwd_microstep: 246.50 | bwd_inner_microstep: 246.31 | bwd_allreduce_microstep: 0.05 | step_microstep: 0.20 +[2024-12-31 18:10:35,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.56 | bwd_microstep: 245.27 | bwd_inner_microstep: 245.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:35,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.65 | bwd_microstep: 245.40 | bwd_inner_microstep: 245.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:36,033] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.43 | bwd_microstep: 244.98 | bwd_inner_microstep: 244.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:36,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.65 | bwd_microstep: 244.05 | bwd_inner_microstep: 244.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:36,895] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.70 | bwd_microstep: 243.83 | bwd_inner_microstep: 243.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:37,336] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.64 | bwd_microstep: 253.09 | bwd_inner_microstep: 253.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:37,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.51 | bwd_microstep: 243.32 | bwd_inner_microstep: 243.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:38,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.93 | bwd_microstep: 241.10 | bwd_inner_microstep: 241.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:38,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.58 | optimizer_gradients: 0.72 | optimizer_step: 3.30 +[2024-12-31 18:10:38,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 166.90 | bwd_microstep: 255.17 | bwd_inner_microstep: 241.56 | bwd_allreduce_microstep: 13.52 | step_microstep: 10.61 +[2024-12-31 18:10:38,643] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2912.01 | bwd: 4354.05 | bwd_inner: 4339.45 | bwd_allreduce: 13.94 | step: 13.60 + 54%|█████▍ | 413/759 [1:00:33<48:33, 8.42s/it] {'loss': 1.2327, 'learning_rate': 9.062316541349978e-06, 'epoch': 0.54} + 54%|█████▍ | 413/759 [1:00:33<48:33, 8.42s/it][2024-12-31 18:10:39,236] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 224.40 | bwd_microstep: 355.92 | bwd_inner_microstep: 355.59 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:10:39,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 198.54 | bwd_microstep: 298.45 | bwd_inner_microstep: 298.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:10:40,238] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.00 | bwd_microstep: 281.55 | bwd_inner_microstep: 281.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:10:40,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.85 | bwd_microstep: 269.19 | bwd_inner_microstep: 269.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.29 +[2024-12-31 18:10:41,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.98 | bwd_microstep: 263.52 | bwd_inner_microstep: 263.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:10:41,603] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.91 | bwd_microstep: 251.33 | bwd_inner_microstep: 251.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:42,039] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.18 | bwd_microstep: 247.80 | bwd_inner_microstep: 247.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:42,480] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.41 | bwd_microstep: 251.74 | bwd_inner_microstep: 251.31 | bwd_allreduce_microstep: 0.30 | step_microstep: 0.28 +[2024-12-31 18:10:42,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.25 | bwd_microstep: 248.35 | bwd_inner_microstep: 248.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:43,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.80 | bwd_microstep: 245.92 | bwd_inner_microstep: 245.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:43,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.05 | bwd_microstep: 244.69 | bwd_inner_microstep: 244.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:44,219] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.98 | bwd_microstep: 244.07 | bwd_inner_microstep: 244.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:44,656] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.05 | bwd_microstep: 250.76 | bwd_inner_microstep: 250.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:45,093] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.01 | bwd_microstep: 248.68 | bwd_inner_microstep: 248.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:10:45,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 243.68 | bwd_inner_microstep: 243.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:46,239] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.56 | optimizer_gradients: 0.66 | optimizer_step: 3.27 +[2024-12-31 18:10:46,239] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.46 | bwd_microstep: 518.93 | bwd_inner_microstep: 244.09 | bwd_allreduce_microstep: 274.79 | step_microstep: 10.49 +[2024-12-31 18:10:46,241] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2846.26 | bwd: 4464.78 | bwd_inner: 4188.53 | bwd_allreduce: 275.44 | step: 13.65 + 55%|█████▍ | 414/759 [1:00:41<46:59, 8.17s/it] {'loss': 1.2187, 'learning_rate': 9.019828596704394e-06, 'epoch': 0.55} + 55%|█████▍ | 414/759 [1:00:41<46:59, 8.17s/it][2024-12-31 18:10:46,887] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 238.53 | bwd_microstep: 394.32 | bwd_inner_microstep: 393.97 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 18:10:47,493] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 226.18 | bwd_microstep: 362.33 | bwd_inner_microstep: 362.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:10:47,953] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.66 | bwd_microstep: 262.71 | bwd_inner_microstep: 262.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:48,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.10 | bwd_microstep: 251.44 | bwd_inner_microstep: 251.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:10:48,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.09 | bwd_microstep: 250.40 | bwd_inner_microstep: 250.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:49,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.02 | bwd_microstep: 249.70 | bwd_inner_microstep: 249.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:49,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.94 | bwd_microstep: 244.97 | bwd_inner_microstep: 244.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:10:50,134] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.44 | bwd_microstep: 245.51 | bwd_inner_microstep: 245.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:50,567] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 245.81 | bwd_inner_microstep: 245.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:51,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.53 | bwd_microstep: 245.22 | bwd_inner_microstep: 245.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:51,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 251.03 | bwd_inner_microstep: 251.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:51,866] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.39 | bwd_microstep: 241.53 | bwd_inner_microstep: 241.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:10:52,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.78 | bwd_microstep: 243.93 | bwd_inner_microstep: 243.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:10:52,724] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.57 | bwd_microstep: 243.82 | bwd_inner_microstep: 243.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:10:53,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.71 | bwd_microstep: 243.26 | bwd_inner_microstep: 243.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:53,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.27 | optimizer_gradients: 0.60 | optimizer_step: 3.09 +[2024-12-31 18:10:53,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 299.13 | bwd_inner_microstep: 243.99 | bwd_allreduce_microstep: 55.09 | step_microstep: 11.07 +[2024-12-31 18:10:53,653] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2857.55 | bwd: 4275.18 | bwd_inner: 4219.28 | bwd_allreduce: 55.35 | step: 14.00 + 55%|█████▍ | 415/759 [1:00:48<45:32, 7.94s/it] {'loss': 1.2402, 'learning_rate': 8.977358510579658e-06, 'epoch': 0.55} + 55%|█████▍ | 415/759 [1:00:48<45:32, 7.94s/it][2024-12-31 18:10:54,189] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 204.35 | bwd_microstep: 315.72 | bwd_inner_microstep: 315.35 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 18:10:54,807] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 232.24 | bwd_microstep: 368.32 | bwd_inner_microstep: 368.11 | bwd_allreduce_microstep: 0.10 | step_microstep: 0.24 +[2024-12-31 18:10:55,300] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.56 | bwd_microstep: 285.33 | bwd_inner_microstep: 285.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:55,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.16 | bwd_microstep: 261.89 | bwd_inner_microstep: 261.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:56,198] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 250.18 | bwd_inner_microstep: 250.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:56,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 210.84 | bwd_microstep: 246.69 | bwd_inner_microstep: 246.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:57,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.48 | bwd_microstep: 249.38 | bwd_inner_microstep: 249.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:57,550] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.56 | bwd_microstep: 244.26 | bwd_inner_microstep: 244.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:10:57,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.20 | bwd_microstep: 246.36 | bwd_inner_microstep: 246.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:58,416] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.37 | bwd_microstep: 244.21 | bwd_inner_microstep: 244.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:10:58,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.62 | bwd_microstep: 243.64 | bwd_inner_microstep: 243.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:10:59,276] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.66 | bwd_microstep: 243.11 | bwd_inner_microstep: 243.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:10:59,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.89 | bwd_microstep: 243.15 | bwd_inner_microstep: 243.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:11:00,139] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.96 | bwd_microstep: 244.00 | bwd_inner_microstep: 243.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:11:00,588] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 186.94 | bwd_microstep: 244.22 | bwd_inner_microstep: 244.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:11:01,044] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.56 | optimizer_gradients: 1.24 | optimizer_step: 3.59 +[2024-12-31 18:11:01,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.40 | bwd_microstep: 258.55 | bwd_inner_microstep: 244.49 | bwd_allreduce_microstep: 13.96 | step_microstep: 12.66 +[2024-12-31 18:11:01,047] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2912.86 | bwd: 4189.11 | bwd_inner: 4174.07 | bwd_allreduce: 14.38 | step: 15.77 + 55%|█████▍ | 416/759 [1:00:56<44:28, 7.78s/it] {'loss': 1.2443, 'learning_rate': 8.93490705677198e-06, 'epoch': 0.55} + 55%|█████▍ | 416/759 [1:00:56<44:28, 7.78s/it][2024-12-31 18:11:01,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.45 | bwd_microstep: 313.32 | bwd_inner_microstep: 312.96 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:11:02,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.30 | bwd_microstep: 284.93 | bwd_inner_microstep: 284.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:02,586] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.46 | bwd_microstep: 308.15 | bwd_inner_microstep: 308.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:11:03,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.91 | bwd_microstep: 258.33 | bwd_inner_microstep: 258.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:03,491] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.47 | bwd_microstep: 256.72 | bwd_inner_microstep: 256.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:11:03,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.82 | bwd_microstep: 247.65 | bwd_inner_microstep: 247.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:11:04,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.98 | bwd_microstep: 246.60 | bwd_inner_microstep: 246.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:11:04,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.40 | bwd_microstep: 246.12 | bwd_inner_microstep: 246.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:11:05,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.92 | bwd_microstep: 244.69 | bwd_inner_microstep: 244.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:11:05,658] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.99 | bwd_microstep: 245.12 | bwd_inner_microstep: 245.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:06,093] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 245.91 | bwd_inner_microstep: 245.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:11:06,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 244.22 | bwd_inner_microstep: 244.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:11:06,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.46 | bwd_microstep: 244.96 | bwd_inner_microstep: 244.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:11:07,390] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.21 | bwd_microstep: 243.89 | bwd_inner_microstep: 243.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:07,823] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.85 | bwd_microstep: 245.04 | bwd_inner_microstep: 245.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:11:08,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.31 | optimizer_gradients: 0.58 | optimizer_step: 3.22 +[2024-12-31 18:11:08,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.31 | bwd_microstep: 367.25 | bwd_inner_microstep: 243.71 | bwd_allreduce_microstep: 123.50 | step_microstep: 11.87 +[2024-12-31 18:11:08,389] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2813.12 | bwd: 4242.99 | bwd_inner: 4118.62 | bwd_allreduce: 123.76 | step: 14.84 + 55%|█████▍ | 417/759 [1:01:03<43:35, 7.65s/it] {'loss': 1.2737, 'learning_rate': 8.89247500873809e-06, 'epoch': 0.55} + 55%|█████▍ | 417/759 [1:01:03<43:35, 7.65s/it][2024-12-31 18:11:08,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.12 | bwd_microstep: 350.54 | bwd_inner_microstep: 350.17 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:11:09,463] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.83 | bwd_microstep: 286.63 | bwd_inner_microstep: 286.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:09,916] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.71 | bwd_microstep: 257.40 | bwd_inner_microstep: 257.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.28 +[2024-12-31 18:11:10,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.50 | bwd_microstep: 256.77 | bwd_inner_microstep: 256.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:11:10,796] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.40 | bwd_microstep: 249.28 | bwd_inner_microstep: 249.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:11:11,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.63 | bwd_microstep: 246.90 | bwd_inner_microstep: 246.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:11:11,657] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.96 | bwd_microstep: 243.09 | bwd_inner_microstep: 243.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:11:12,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.02 | bwd_microstep: 244.49 | bwd_inner_microstep: 244.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:11:12,521] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.05 | bwd_microstep: 245.86 | bwd_inner_microstep: 245.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:11:12,953] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 244.50 | bwd_inner_microstep: 244.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:11:13,384] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 243.33 | bwd_inner_microstep: 243.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:13,814] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.92 | bwd_microstep: 242.12 | bwd_inner_microstep: 242.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:11:14,240] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.53 | bwd_microstep: 242.95 | bwd_inner_microstep: 242.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:11:14,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.89 | bwd_microstep: 241.28 | bwd_inner_microstep: 241.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:15,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.74 | bwd_microstep: 242.22 | bwd_inner_microstep: 242.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:11:15,611] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.55 | optimizer_gradients: 0.96 | optimizer_step: 3.30 +[2024-12-31 18:11:15,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 312.22 | bwd_inner_microstep: 241.67 | bwd_allreduce_microstep: 70.50 | step_microstep: 11.14 +[2024-12-31 18:11:15,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2808.39 | bwd: 4149.71 | bwd_inner: 4078.24 | bwd_allreduce: 70.80 | step: 13.49 + 55%|█████▌ | 418/759 [1:01:10<42:44, 7.52s/it] {'loss': 1.2073, 'learning_rate': 8.850063139581156e-06, 'epoch': 0.55} + 55%|█████▌ | 418/759 [1:01:10<42:44, 7.52s/it][2024-12-31 18:11:16,180] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.68 | bwd_microstep: 337.78 | bwd_inner_microstep: 337.42 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:11:16,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.39 | bwd_microstep: 295.97 | bwd_inner_microstep: 295.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:11:17,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.89 | bwd_microstep: 290.03 | bwd_inner_microstep: 290.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:11:17,680] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.89 | bwd_microstep: 284.41 | bwd_inner_microstep: 284.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:11:18,139] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.47 | bwd_microstep: 262.90 | bwd_inner_microstep: 262.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:11:18,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.24 | bwd_microstep: 261.28 | bwd_inner_microstep: 261.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:11:19,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.72 | bwd_microstep: 248.60 | bwd_inner_microstep: 248.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:11:19,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.96 | bwd_microstep: 248.39 | bwd_inner_microstep: 248.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:11:19,921] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.96 | bwd_microstep: 252.59 | bwd_inner_microstep: 252.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:11:20,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.56 | bwd_microstep: 246.95 | bwd_inner_microstep: 246.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:11:20,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.53 | bwd_microstep: 246.40 | bwd_inner_microstep: 246.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:21,225] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.45 | bwd_microstep: 243.81 | bwd_inner_microstep: 243.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:21,657] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.71 | bwd_microstep: 243.92 | bwd_inner_microstep: 243.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:22,083] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.32 | bwd_microstep: 240.69 | bwd_inner_microstep: 240.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:11:22,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.25 | bwd_microstep: 241.25 | bwd_inner_microstep: 241.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:11:22,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.95 | optimizer_gradients: 0.71 | optimizer_step: 3.23 +[2024-12-31 18:11:22,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.23 | bwd_microstep: 257.26 | bwd_inner_microstep: 243.54 | bwd_allreduce_microstep: 13.61 | step_microstep: 11.28 +[2024-12-31 18:11:22,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2862.67 | bwd: 4202.33 | bwd_inner: 4187.85 | bwd_allreduce: 13.89 | step: 14.12 + 55%|█████▌ | 419/759 [1:01:17<42:20, 7.47s/it] {'loss': 1.2182, 'learning_rate': 8.807672222036692e-06, 'epoch': 0.55} + 55%|█████▌ | 419/759 [1:01:17<42:20, 7.47s/it][2024-12-31 18:11:23,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.31 | bwd_microstep: 338.57 | bwd_inner_microstep: 338.22 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.21 +[2024-12-31 18:11:24,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.56 | bwd_microstep: 291.19 | bwd_inner_microstep: 291.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:11:24,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.62 | bwd_microstep: 282.52 | bwd_inner_microstep: 282.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:24,977] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.58 | bwd_microstep: 256.43 | bwd_inner_microstep: 256.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:11:25,427] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.45 | bwd_microstep: 254.57 | bwd_inner_microstep: 254.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:11:25,862] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.62 | bwd_microstep: 246.89 | bwd_inner_microstep: 246.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:26,299] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.21 | bwd_microstep: 246.43 | bwd_inner_microstep: 246.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:26,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.02 | bwd_microstep: 245.16 | bwd_inner_microstep: 245.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:11:27,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.03 | bwd_microstep: 243.47 | bwd_inner_microstep: 243.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:11:27,600] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.21 | bwd_microstep: 249.27 | bwd_inner_microstep: 249.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:11:28,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 243.14 | bwd_inner_microstep: 243.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:11:28,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.43 | bwd_microstep: 243.22 | bwd_inner_microstep: 243.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:11:28,894] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.83 | bwd_microstep: 243.77 | bwd_inner_microstep: 243.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:29,328] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.71 | bwd_microstep: 244.57 | bwd_inner_microstep: 244.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:29,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.33 | bwd_microstep: 241.86 | bwd_inner_microstep: 241.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:30,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.05 | optimizer_gradients: 0.59 | optimizer_step: 3.10 +[2024-12-31 18:11:30,265] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.28 | bwd_microstep: 314.43 | bwd_inner_microstep: 241.29 | bwd_allreduce_microstep: 73.10 | step_microstep: 10.43 +[2024-12-31 18:11:30,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2820.73 | bwd: 4185.56 | bwd_inner: 4111.68 | bwd_allreduce: 73.35 | step: 13.46 + 55%|█████▌ | 420/759 [1:01:25<41:54, 7.42s/it] {'loss': 1.2306, 'learning_rate': 8.765303028458468e-06, 'epoch': 0.55} + 55%|█████▌ | 420/759 [1:01:25<41:54, 7.42s/it][2024-12-31 18:11:30,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 229.47 | bwd_microstep: 371.21 | bwd_inner_microstep: 370.75 | bwd_allreduce_microstep: 0.17 | step_microstep: 0.25 +[2024-12-31 18:11:31,384] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.95 | bwd_microstep: 289.45 | bwd_inner_microstep: 289.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:31,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.68 | bwd_microstep: 282.52 | bwd_inner_microstep: 282.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:11:32,336] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.50 | bwd_microstep: 262.05 | bwd_inner_microstep: 262.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:32,788] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.53 | bwd_microstep: 256.12 | bwd_inner_microstep: 256.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:33,240] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.80 | bwd_microstep: 259.99 | bwd_inner_microstep: 259.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:33,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.78 | bwd_microstep: 246.94 | bwd_inner_microstep: 246.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:34,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.30 | bwd_microstep: 245.87 | bwd_inner_microstep: 245.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:11:34,544] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.34 | bwd_microstep: 244.87 | bwd_inner_microstep: 244.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:11:34,976] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.56 | bwd_microstep: 244.24 | bwd_inner_microstep: 244.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:11:35,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.37 | bwd_microstep: 243.13 | bwd_inner_microstep: 243.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:11:35,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.17 | bwd_microstep: 241.70 | bwd_inner_microstep: 241.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:11:36,263] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.27 | bwd_microstep: 242.26 | bwd_inner_microstep: 242.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:11:36,699] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.97 | bwd_microstep: 249.80 | bwd_inner_microstep: 249.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:37,129] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 242.41 | bwd_inner_microstep: 242.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:37,584] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.92 | optimizer_gradients: 0.73 | optimizer_step: 3.22 +[2024-12-31 18:11:37,584] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.91 | bwd_microstep: 257.73 | bwd_inner_microstep: 244.03 | bwd_allreduce_microstep: 13.58 | step_microstep: 11.37 +[2024-12-31 18:11:37,587] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2841.95 | bwd: 4180.37 | bwd_inner: 4165.81 | bwd_allreduce: 13.92 | step: 14.54 + 55%|█████▌ | 421/759 [1:01:32<41:37, 7.39s/it] {'loss': 1.2609, 'learning_rate': 8.722956330804456e-06, 'epoch': 0.55} + 55%|█████▌ | 421/759 [1:01:32<41:37, 7.39s/it][2024-12-31 18:11:38,269] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 251.07 | bwd_microstep: 416.70 | bwd_inner_microstep: 416.32 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:11:38,772] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.66 | bwd_microstep: 291.58 | bwd_inner_microstep: 291.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:39,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.37 | bwd_microstep: 280.25 | bwd_inner_microstep: 280.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:39,714] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.15 | bwd_microstep: 261.64 | bwd_inner_microstep: 261.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:11:40,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.19 | bwd_microstep: 254.04 | bwd_inner_microstep: 254.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:11:40,603] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.27 | bwd_microstep: 249.28 | bwd_inner_microstep: 249.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:41,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.96 | bwd_microstep: 247.51 | bwd_inner_microstep: 247.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:11:41,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.64 | bwd_microstep: 245.26 | bwd_inner_microstep: 245.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:11:41,911] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.13 | bwd_microstep: 244.00 | bwd_inner_microstep: 243.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:11:42,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.73 | bwd_microstep: 245.79 | bwd_inner_microstep: 245.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:11:42,782] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.00 | bwd_microstep: 244.05 | bwd_inner_microstep: 244.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:11:43,218] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.58 | bwd_microstep: 245.06 | bwd_inner_microstep: 245.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:11:43,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.95 | bwd_microstep: 245.05 | bwd_inner_microstep: 245.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:11:44,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.00 | bwd_microstep: 242.89 | bwd_inner_microstep: 242.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:11:44,512] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.32 | bwd_microstep: 241.50 | bwd_inner_microstep: 241.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:11:44,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.58 | optimizer_gradients: 0.71 | optimizer_step: 3.25 +[2024-12-31 18:11:44,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 256.86 | bwd_inner_microstep: 243.21 | bwd_allreduce_microstep: 13.55 | step_microstep: 10.72 +[2024-12-31 18:11:44,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2862.07 | bwd: 4211.78 | bwd_inner: 4196.96 | bwd_allreduce: 13.87 | step: 13.88 + 56%|█████▌ | 422/759 [1:01:39<41:29, 7.39s/it] {'loss': 1.1832, 'learning_rate': 8.680632900622752e-06, 'epoch': 0.56} + 56%|█████▌ | 422/759 [1:01:39<41:29, 7.39s/it][2024-12-31 18:11:45,636] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 246.86 | bwd_microstep: 407.16 | bwd_inner_microstep: 406.79 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 18:11:46,147] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 195.94 | bwd_microstep: 297.34 | bwd_inner_microstep: 297.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:46,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.04 | bwd_microstep: 281.04 | bwd_inner_microstep: 281.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:47,118] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.30 | bwd_microstep: 280.56 | bwd_inner_microstep: 280.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:11:47,567] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.62 | bwd_microstep: 249.92 | bwd_inner_microstep: 249.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:11:48,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.32 | bwd_microstep: 249.01 | bwd_inner_microstep: 248.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:11:48,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.00 | bwd_microstep: 248.76 | bwd_inner_microstep: 248.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:11:48,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.88 | bwd_microstep: 245.54 | bwd_inner_microstep: 245.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:11:49,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 244.82 | bwd_inner_microstep: 244.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:49,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 243.83 | bwd_inner_microstep: 243.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:11:50,180] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.79 | bwd_microstep: 249.24 | bwd_inner_microstep: 249.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:50,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.51 | bwd_microstep: 244.95 | bwd_inner_microstep: 244.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:51,047] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 244.73 | bwd_inner_microstep: 244.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:11:51,483] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.12 | bwd_microstep: 241.98 | bwd_inner_microstep: 241.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:51,908] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.22 | bwd_microstep: 241.01 | bwd_inner_microstep: 240.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.09 +[2024-12-31 18:11:52,445] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.40 | optimizer_gradients: 0.56 | optimizer_step: 3.10 +[2024-12-31 18:11:52,445] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.02 | bwd_microstep: 345.76 | bwd_inner_microstep: 244.07 | bwd_allreduce_microstep: 101.65 | step_microstep: 11.00 +[2024-12-31 18:11:52,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2873.06 | bwd: 4315.72 | bwd_inner: 4213.27 | bwd_allreduce: 101.90 | step: 13.97 + 56%|█████▌ | 423/759 [1:01:47<41:31, 7.41s/it] {'loss': 1.2261, 'learning_rate': 8.638333509037537e-06, 'epoch': 0.56} + 56%|█████▌ | 423/759 [1:01:47<41:31, 7.41s/it][2024-12-31 18:11:52,982] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.88 | bwd_microstep: 318.39 | bwd_inner_microstep: 318.08 | bwd_allreduce_microstep: 0.12 | step_microstep: 0.19 +[2024-12-31 18:11:53,483] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.23 | bwd_microstep: 291.19 | bwd_inner_microstep: 291.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:11:53,949] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.54 | bwd_microstep: 267.65 | bwd_inner_microstep: 267.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:11:54,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.49 | bwd_microstep: 262.88 | bwd_inner_microstep: 262.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:54,859] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.85 | bwd_microstep: 256.54 | bwd_inner_microstep: 256.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +local variable 'images' referenced before assignment nuscene_cap_194k +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +local variable 'images' referenced before assignment nuscene_cap_194k +Traceback (most recent call last): + File "/fusion-algo-lidar-secret-nas/interns/zzc/project/InternVL25/internvl_chat/internvl/train/nuscene_dataset.py", line 212, in multi_modal_multi_image_get_item + image = Image.open(new_cam_path[key]).convert('RGB') + File "/fusion-algo-lidar-secret-nas/interns/zzc/myenv/internvl25_crh/lib/python3.9/site-packages/PIL/Image.py", line 3466, in open + filename = os.path.realpath(os.fspath(fp)) + File "/fusion-algo-lidar-secret-nas/interns/zzc/myenv/internvl25_crh/lib/python3.9/posixpath.py", line 393, in realpath + return abspath(path) + File "/fusion-algo-lidar-secret-nas/interns/zzc/myenv/internvl25_crh/lib/python3.9/posixpath.py", line 380, in abspath + cwd = os.getcwd() +FileNotFoundError: [Errno 2] No such file or directory +Traceback (most recent call last): + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/fusion-algo-lidar-secret-nas/interns/zzc/project/InternVL25/internvl_chat/internvl/train/nuscene_dataset.py", line 272, in __getitem__ + ret = self.multi_modal_multi_image_get_item(data_item) + File "/fusion-algo-lidar-secret-nas/interns/zzc/project/InternVL25/internvl_chat/internvl/train/nuscene_dataset.py", line 214, in multi_modal_multi_image_get_item + print(f'Failed to load image: {images}, the dataset is: {self.ds_name}') +UnboundLocalError: local variable 'images' referenced before assignment + File "/fusion-algo-lidar-secret-nas/interns/zzc/project/InternVL25/internvl_chat/internvl/train/nuscene_dataset.py", line 212, in multi_modal_multi_image_get_item + image = Image.open(new_cam_path[key]).convert('RGB') + File "/fusion-algo-lidar-secret-nas/interns/zzc/myenv/internvl25_crh/lib/python3.9/site-packages/PIL/Image.py", line 3466, in open + filename = os.path.realpath(os.fspath(fp)) + File "/fusion-algo-lidar-secret-nas/interns/zzc/myenv/internvl25_crh/lib/python3.9/posixpath.py", line 393, in realpath + return abspath(path) + File "/fusion-algo-lidar-secret-nas/interns/zzc/myenv/internvl25_crh/lib/python3.9/posixpath.py", line 380, in abspath + cwd = os.getcwd() +FileNotFoundError: [Errno 2] No such file or directory + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/fusion-algo-lidar-secret-nas/interns/zzc/project/InternVL25/internvl_chat/internvl/train/nuscene_dataset.py", line 272, in __getitem__ + ret = self.multi_modal_multi_image_get_item(data_item) + File "/fusion-algo-lidar-secret-nas/interns/zzc/project/InternVL25/internvl_chat/internvl/train/nuscene_dataset.py", line 214, in multi_modal_multi_image_get_item + print(f'Failed to load image: {images}, the dataset is: {self.ds_name}') +UnboundLocalError: local variable 'images' referenced before assignment +[2024-12-31 18:11:55,299] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.18 | bwd_microstep: 253.35 | bwd_inner_microstep: 253.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:55,735] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.03 | bwd_microstep: 248.09 | bwd_inner_microstep: 248.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:56,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.06 | bwd_microstep: 245.44 | bwd_inner_microstep: 245.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:11:56,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.20 | bwd_microstep: 244.94 | bwd_inner_microstep: 244.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:11:57,027] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.05 | bwd_microstep: 243.50 | bwd_inner_microstep: 243.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:11:57,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.42 | bwd_microstep: 243.87 | bwd_inner_microstep: 243.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:57,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 244.47 | bwd_inner_microstep: 244.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:58,324] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.73 | bwd_microstep: 243.55 | bwd_inner_microstep: 243.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:11:58,750] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 244.65 | bwd_inner_microstep: 244.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:11:59,182] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.69 | bwd_microstep: 243.64 | bwd_inner_microstep: 243.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:11:59,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.00 | optimizer_gradients: 0.55 | optimizer_step: 3.08 +[2024-12-31 18:11:59,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.62 | bwd_microstep: 299.90 | bwd_inner_microstep: 245.66 | bwd_allreduce_microstep: 54.19 | step_microstep: 10.40 +[2024-12-31 18:11:59,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2809.65 | bwd: 4152.16 | bwd_inner: 4097.13 | bwd_allreduce: 54.43 | step: 13.02 + 56%|█████▌ | 424/759 [1:01:54<41:05, 7.36s/it] {'loss': 1.2133, 'learning_rate': 8.59605892673499e-06, 'epoch': 0.56} + 56%|█████▌ | 424/759 [1:01:54<41:05, 7.36s/it][2024-12-31 18:12:00,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.43 | bwd_microstep: 342.09 | bwd_inner_microstep: 341.74 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.24 +[2024-12-31 18:12:00,748] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.06 | bwd_microstep: 286.99 | bwd_inner_microstep: 286.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:01,217] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.52 | bwd_microstep: 268.09 | bwd_inner_microstep: 268.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:12:01,675] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.30 | bwd_microstep: 262.54 | bwd_inner_microstep: 262.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:12:02,125] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.47 | bwd_microstep: 255.50 | bwd_inner_microstep: 255.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:12:02,575] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.38 | bwd_microstep: 256.06 | bwd_inner_microstep: 256.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:12:03,010] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.91 | bwd_microstep: 246.67 | bwd_inner_microstep: 246.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:12:03,449] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.30 | bwd_microstep: 248.78 | bwd_inner_microstep: 248.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.10 +[2024-12-31 18:12:03,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.07 | bwd_microstep: 247.71 | bwd_inner_microstep: 247.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:04,319] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.73 | bwd_microstep: 245.04 | bwd_inner_microstep: 245.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:12:04,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.82 | bwd_microstep: 245.00 | bwd_inner_microstep: 244.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:12:05,194] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.14 | bwd_microstep: 250.08 | bwd_inner_microstep: 250.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:12:05,631] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.66 | bwd_microstep: 249.48 | bwd_inner_microstep: 249.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:12:06,062] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.04 | bwd_microstep: 243.40 | bwd_inner_microstep: 243.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:12:06,494] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.49 | bwd_microstep: 243.94 | bwd_inner_microstep: 243.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:12:06,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.04 | optimizer_gradients: 0.67 | optimizer_step: 3.16 +[2024-12-31 18:12:06,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.72 | bwd_microstep: 271.44 | bwd_inner_microstep: 244.71 | bwd_allreduce_microstep: 26.67 | step_microstep: 12.17 +[2024-12-31 18:12:06,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2833.45 | bwd: 4162.93 | bwd_inner: 4135.39 | bwd_allreduce: 26.93 | step: 15.03 + 56%|█████▌ | 425/759 [1:02:01<40:50, 7.34s/it] {'loss': 1.2271, 'learning_rate': 8.55380992394929e-06, 'epoch': 0.56} + 56%|█████▌ | 425/759 [1:02:01<40:50, 7.34s/it][2024-12-31 18:12:07,494] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.76 | bwd_microstep: 310.67 | bwd_inner_microstep: 310.29 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.21 +[2024-12-31 18:12:07,984] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.86 | bwd_microstep: 284.10 | bwd_inner_microstep: 284.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:12:08,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.61 | bwd_microstep: 267.90 | bwd_inner_microstep: 267.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:08,913] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.70 | bwd_microstep: 264.41 | bwd_inner_microstep: 264.08 | bwd_allreduce_microstep: 0.22 | step_microstep: 0.27 +[2024-12-31 18:12:09,366] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.69 | bwd_microstep: 256.96 | bwd_inner_microstep: 256.78 | bwd_allreduce_microstep: 0.08 | step_microstep: 0.20 +[2024-12-31 18:12:09,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.14 | bwd_microstep: 246.41 | bwd_inner_microstep: 246.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:10,241] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.06 | bwd_microstep: 249.77 | bwd_inner_microstep: 249.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:10,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.15 | bwd_microstep: 248.22 | bwd_inner_microstep: 248.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:12:11,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.18 | bwd_microstep: 245.70 | bwd_inner_microstep: 245.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:12:11,544] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.90 | bwd_microstep: 244.25 | bwd_inner_microstep: 244.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:12:11,976] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.83 | bwd_microstep: 243.55 | bwd_inner_microstep: 243.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:12,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.79 | bwd_microstep: 243.77 | bwd_inner_microstep: 243.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:12,837] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.66 | bwd_microstep: 242.97 | bwd_inner_microstep: 242.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:13,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.71 | bwd_microstep: 243.19 | bwd_inner_microstep: 243.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:12:13,703] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.07 | bwd_microstep: 242.47 | bwd_inner_microstep: 242.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:14,171] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.90 | optimizer_gradients: 0.66 | optimizer_step: 3.24 +[2024-12-31 18:12:14,172] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.86 | bwd_microstep: 272.34 | bwd_inner_microstep: 244.76 | bwd_allreduce_microstep: 27.52 | step_microstep: 10.93 +[2024-12-31 18:12:14,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2804.35 | bwd: 4106.81 | bwd_inner: 4077.94 | bwd_allreduce: 28.15 | step: 14.01 + 56%|█████▌ | 426/759 [1:02:09<40:30, 7.30s/it] {'loss': 1.2241, 'learning_rate': 8.511587270448556e-06, 'epoch': 0.56} + 56%|█████▌ | 426/759 [1:02:09<40:30, 7.30s/it][2024-12-31 18:12:14,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 222.15 | bwd_microstep: 353.76 | bwd_inner_microstep: 353.39 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:12:15,271] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 199.75 | bwd_microstep: 288.72 | bwd_inner_microstep: 288.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:15,760] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.46 | bwd_microstep: 281.92 | bwd_inner_microstep: 281.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:16,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.49 | bwd_microstep: 258.29 | bwd_inner_microstep: 258.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:12:16,667] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.30 | bwd_microstep: 257.05 | bwd_inner_microstep: 257.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:17,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.21 | bwd_microstep: 255.53 | bwd_inner_microstep: 255.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:12:17,553] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.45 | bwd_microstep: 248.25 | bwd_inner_microstep: 248.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:12:17,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.86 | bwd_microstep: 247.45 | bwd_inner_microstep: 247.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:18,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.38 | bwd_microstep: 251.99 | bwd_inner_microstep: 251.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:12:18,864] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.36 | bwd_microstep: 244.99 | bwd_inner_microstep: 244.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:19,299] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.54 | bwd_microstep: 245.55 | bwd_inner_microstep: 245.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 18:12:19,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.76 | bwd_microstep: 244.77 | bwd_inner_microstep: 244.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:12:20,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.90 | bwd_microstep: 245.17 | bwd_inner_microstep: 245.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:12:20,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.32 | bwd_microstep: 243.92 | bwd_inner_microstep: 243.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:21,031] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.40 | bwd_microstep: 244.71 | bwd_inner_microstep: 244.35 | bwd_allreduce_microstep: 0.24 | step_microstep: 0.26 +[2024-12-31 18:12:21,490] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.59 | optimizer_gradients: 0.63 | optimizer_step: 3.45 +[2024-12-31 18:12:21,490] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.49 | bwd_microstep: 258.88 | bwd_inner_microstep: 245.09 | bwd_allreduce_microstep: 13.70 | step_microstep: 10.83 +[2024-12-31 18:12:21,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2846.20 | bwd: 4171.18 | bwd_inner: 4156.08 | bwd_allreduce: 14.20 | step: 13.96 + 56%|█████▋ | 427/759 [1:02:16<40:25, 7.31s/it] {'loss': 1.2091, 'learning_rate': 8.469391735520824e-06, 'epoch': 0.56} + 56%|█████▋ | 427/759 [1:02:16<40:25, 7.31s/it][2024-12-31 18:12:22,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 199.68 | bwd_microstep: 306.88 | bwd_inner_microstep: 306.53 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:12:22,556] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 204.78 | bwd_microstep: 320.29 | bwd_inner_microstep: 320.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:12:23,024] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.76 | bwd_microstep: 268.07 | bwd_inner_microstep: 268.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:23,475] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.53 | bwd_microstep: 257.24 | bwd_inner_microstep: 257.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:12:23,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.37 | bwd_microstep: 257.04 | bwd_inner_microstep: 257.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.28 +[2024-12-31 18:12:24,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.39 | bwd_microstep: 249.38 | bwd_inner_microstep: 249.17 | bwd_allreduce_microstep: 0.05 | step_microstep: 0.24 +[2024-12-31 18:12:24,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.66 | bwd_microstep: 246.94 | bwd_inner_microstep: 246.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:25,242] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.70 | bwd_microstep: 249.69 | bwd_inner_microstep: 249.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:12:25,678] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.76 | bwd_microstep: 246.52 | bwd_inner_microstep: 246.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:12:26,112] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.20 | bwd_microstep: 244.27 | bwd_inner_microstep: 244.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:26,544] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.41 | bwd_microstep: 243.45 | bwd_inner_microstep: 243.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:26,974] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.87 | bwd_microstep: 243.01 | bwd_inner_microstep: 242.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.28 +[2024-12-31 18:12:27,404] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.37 | bwd_microstep: 242.48 | bwd_inner_microstep: 242.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:12:27,864] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.98 | bwd_microstep: 267.84 | bwd_inner_microstep: 267.64 | bwd_allreduce_microstep: 0.09 | step_microstep: 0.27 +[2024-12-31 18:12:28,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.37 | bwd_microstep: 242.92 | bwd_inner_microstep: 242.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:28,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.61 | optimizer_gradients: 0.87 | optimizer_step: 3.59 +[2024-12-31 18:12:28,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.54 | bwd_microstep: 342.59 | bwd_inner_microstep: 328.59 | bwd_allreduce_microstep: 13.91 | step_microstep: 11.91 +[2024-12-31 18:12:28,835] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2816.77 | bwd: 4228.85 | bwd_inner: 4213.58 | bwd_allreduce: 14.41 | step: 15.18 + 56%|█████▋ | 428/759 [1:02:23<40:21, 7.32s/it] {'loss': 1.247, 'learning_rate': 8.42722408796004e-06, 'epoch': 0.56} + 56%|█████▋ | 428/759 [1:02:23<40:21, 7.32s/it][2024-12-31 18:12:29,500] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 245.12 | bwd_microstep: 403.63 | bwd_inner_microstep: 403.24 | bwd_allreduce_microstep: 0.16 | step_microstep: 0.22 +[2024-12-31 18:12:30,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.83 | bwd_microstep: 291.11 | bwd_inner_microstep: 291.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:30,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.40 | bwd_microstep: 266.78 | bwd_inner_microstep: 266.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:12:30,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.32 | bwd_microstep: 263.41 | bwd_inner_microstep: 263.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:12:31,378] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.99 | bwd_microstep: 256.87 | bwd_inner_microstep: 256.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.09 +[2024-12-31 18:12:31,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.56 | bwd_microstep: 248.39 | bwd_inner_microstep: 248.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:32,246] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.88 | bwd_microstep: 249.07 | bwd_inner_microstep: 249.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:12:32,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.84 | bwd_microstep: 246.30 | bwd_inner_microstep: 246.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:12:33,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 256.24 | bwd_microstep: 244.61 | bwd_inner_microstep: 244.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:12:33,630] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.37 | bwd_microstep: 247.21 | bwd_inner_microstep: 247.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:12:34,057] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 244.61 | bwd_inner_microstep: 244.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:12:34,490] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 246.17 | bwd_inner_microstep: 246.02 | bwd_allreduce_microstep: 0.08 | step_microstep: 0.17 +[2024-12-31 18:12:34,919] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.97 | bwd_microstep: 243.91 | bwd_inner_microstep: 243.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:12:35,350] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.59 | bwd_microstep: 242.93 | bwd_inner_microstep: 242.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:12:35,780] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 242.95 | bwd_inner_microstep: 242.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:36,245] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.57 | optimizer_gradients: 4.92 | optimizer_step: 6.29 +[2024-12-31 18:12:36,246] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 158.14 | bwd_microstep: 269.36 | bwd_inner_microstep: 226.26 | bwd_allreduce_microstep: 43.05 | step_microstep: 20.49 +[2024-12-31 18:12:36,247] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2931.18 | bwd: 4207.47 | bwd_inner: 4163.28 | bwd_allreduce: 43.43 | step: 22.97 + 57%|█████▋ | 429/759 [1:02:31<40:23, 7.34s/it] {'loss': 1.2601, 'learning_rate': 8.385085096052053e-06, 'epoch': 0.57} + 57%|█████▋ | 429/759 [1:02:31<40:23, 7.34s/it][2024-12-31 18:12:36,827] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 218.93 | bwd_microstep: 348.10 | bwd_inner_microstep: 347.75 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:12:37,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.69 | bwd_microstep: 282.80 | bwd_inner_microstep: 282.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:37,773] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.32 | bwd_microstep: 262.29 | bwd_inner_microstep: 262.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:38,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.30 | bwd_microstep: 256.63 | bwd_inner_microstep: 256.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:12:38,662] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.28 | bwd_microstep: 248.69 | bwd_inner_microstep: 248.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:12:39,095] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.84 | bwd_microstep: 245.70 | bwd_inner_microstep: 245.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:39,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.78 | bwd_microstep: 248.60 | bwd_inner_microstep: 248.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:39,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.21 | bwd_microstep: 246.15 | bwd_inner_microstep: 246.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:12:40,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.74 | bwd_microstep: 246.58 | bwd_inner_microstep: 246.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:12:40,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.94 | bwd_microstep: 244.14 | bwd_inner_microstep: 244.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:41,263] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 243.75 | bwd_inner_microstep: 243.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:12:41,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.97 | bwd_microstep: 243.41 | bwd_inner_microstep: 243.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:12:42,123] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.99 | bwd_microstep: 243.33 | bwd_inner_microstep: 243.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:42,550] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.64 | bwd_microstep: 241.48 | bwd_inner_microstep: 241.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:42,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.97 | bwd_microstep: 241.44 | bwd_inner_microstep: 241.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:43,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.86 | optimizer_step: 3.40 +[2024-12-31 18:12:43,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.45 | bwd_microstep: 255.52 | bwd_inner_microstep: 241.84 | bwd_allreduce_microstep: 13.57 | step_microstep: 11.19 +[2024-12-31 18:12:43,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2811.18 | bwd: 4098.74 | bwd_inner: 4084.21 | bwd_allreduce: 13.87 | step: 14.07 + 57%|█████▋ | 430/759 [1:02:38<40:01, 7.30s/it] {'loss': 1.2767, 'learning_rate': 8.342975527560601e-06, 'epoch': 0.57} + 57%|█████▋ | 430/759 [1:02:38<40:01, 7.30s/it][2024-12-31 18:12:44,093] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 242.18 | bwd_microstep: 397.58 | bwd_inner_microstep: 397.23 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:12:44,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.96 | bwd_microstep: 294.07 | bwd_inner_microstep: 294.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:12:45,086] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.50 | bwd_microstep: 282.40 | bwd_inner_microstep: 282.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:12:45,552] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.37 | bwd_microstep: 268.83 | bwd_inner_microstep: 268.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:12:46,016] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.78 | bwd_microstep: 268.58 | bwd_inner_microstep: 268.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:12:46,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.87 | bwd_microstep: 248.84 | bwd_inner_microstep: 248.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:12:46,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.44 | bwd_microstep: 249.02 | bwd_inner_microstep: 249.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:12:47,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.16 | bwd_microstep: 247.72 | bwd_inner_microstep: 247.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:12:47,750] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.79 | bwd_microstep: 245.38 | bwd_inner_microstep: 245.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:12:48,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 246.02 | bwd_inner_microstep: 245.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:48,617] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.00 | bwd_microstep: 245.17 | bwd_inner_microstep: 245.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:12:49,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.96 | bwd_microstep: 244.32 | bwd_inner_microstep: 244.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:12:49,493] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.91 | bwd_microstep: 256.41 | bwd_inner_microstep: 256.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:12:49,924] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.13 | bwd_microstep: 243.94 | bwd_inner_microstep: 243.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:50,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.99 | bwd_microstep: 246.09 | bwd_inner_microstep: 246.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:12:50,921] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.73 | optimizer_gradients: 0.80 | optimizer_step: 3.16 +[2024-12-31 18:12:50,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 158.39 | bwd_microstep: 378.06 | bwd_inner_microstep: 226.93 | bwd_allreduce_microstep: 151.08 | step_microstep: 10.92 +[2024-12-31 18:12:50,923] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2847.96 | bwd: 4362.62 | bwd_inner: 4210.51 | bwd_allreduce: 151.38 | step: 13.68 + 57%|█████▋ | 431/759 [1:02:45<40:12, 7.35s/it] {'loss': 1.2175, 'learning_rate': 8.300896149713334e-06, 'epoch': 0.57} + 57%|█████▋ | 431/759 [1:02:45<40:12, 7.35s/it][2024-12-31 18:12:51,491] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.60 | bwd_microstep: 340.32 | bwd_inner_microstep: 339.79 | bwd_allreduce_microstep: 0.24 | step_microstep: 0.27 +[2024-12-31 18:12:52,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.18 | bwd_microstep: 293.87 | bwd_inner_microstep: 293.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:12:52,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.48 | bwd_microstep: 293.92 | bwd_inner_microstep: 293.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:12:52,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.96 | bwd_microstep: 250.91 | bwd_inner_microstep: 250.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:12:53,397] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.01 | bwd_microstep: 256.90 | bwd_inner_microstep: 256.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:12:53,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.27 | bwd_microstep: 247.60 | bwd_inner_microstep: 247.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:12:54,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.01 | bwd_microstep: 247.54 | bwd_inner_microstep: 247.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:54,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.55 | bwd_microstep: 246.30 | bwd_inner_microstep: 246.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:12:55,136] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 244.50 | bwd_inner_microstep: 244.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:12:55,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 245.50 | bwd_inner_microstep: 245.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:56,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.45 | bwd_microstep: 244.76 | bwd_inner_microstep: 244.52 | bwd_allreduce_microstep: 0.07 | step_microstep: 0.31 +[2024-12-31 18:12:56,440] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 243.07 | bwd_inner_microstep: 243.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:56,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.69 | bwd_microstep: 243.24 | bwd_inner_microstep: 243.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:12:57,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 246.19 | bwd_inner_microstep: 246.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:12:57,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.62 | bwd_microstep: 243.52 | bwd_inner_microstep: 243.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:12:58,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.67 | optimizer_gradients: 0.68 | optimizer_step: 3.13 +[2024-12-31 18:12:58,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.79 | bwd_microstep: 440.74 | bwd_inner_microstep: 243.74 | bwd_allreduce_microstep: 196.95 | step_microstep: 10.50 +[2024-12-31 18:12:58,370] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2819.65 | bwd: 4329.08 | bwd_inner: 4130.74 | bwd_allreduce: 197.47 | step: 13.68 + 57%|█████▋ | 432/759 [1:02:53<40:13, 7.38s/it] {'loss': 1.2192, 'learning_rate': 8.258847729187845e-06, 'epoch': 0.57} + 57%|█████▋ | 432/759 [1:02:53<40:13, 7.38s/it][2024-12-31 18:12:59,172] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 287.44 | bwd_microstep: 500.22 | bwd_inner_microstep: 499.80 | bwd_allreduce_microstep: 0.20 | step_microstep: 0.19 +[2024-12-31 18:12:59,660] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.46 | bwd_microstep: 282.24 | bwd_inner_microstep: 282.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:00,110] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.75 | bwd_microstep: 256.29 | bwd_inner_microstep: 256.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:00,550] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.66 | bwd_microstep: 249.37 | bwd_inner_microstep: 249.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:13:00,990] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.22 | bwd_microstep: 249.62 | bwd_inner_microstep: 249.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:13:01,428] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.46 | bwd_microstep: 248.62 | bwd_inner_microstep: 248.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:01,864] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.22 | bwd_microstep: 247.18 | bwd_inner_microstep: 247.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:13:02,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.10 | bwd_microstep: 244.75 | bwd_inner_microstep: 244.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:02,738] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.74 | bwd_microstep: 251.00 | bwd_inner_microstep: 250.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:13:03,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.44 | bwd_microstep: 245.79 | bwd_inner_microstep: 245.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:03,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 244.72 | bwd_inner_microstep: 244.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:04,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.08 | bwd_microstep: 244.02 | bwd_inner_microstep: 243.63 | bwd_allreduce_microstep: 0.25 | step_microstep: 0.30 +[2024-12-31 18:13:04,473] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.19 | bwd_microstep: 242.95 | bwd_inner_microstep: 242.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:04,905] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 244.00 | bwd_inner_microstep: 243.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:13:05,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 166.82 | bwd_microstep: 242.98 | bwd_inner_microstep: 242.82 | bwd_allreduce_microstep: 0.04 | step_microstep: 0.25 +[2024-12-31 18:13:05,783] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.76 | optimizer_gradients: 0.62 | optimizer_step: 3.39 +[2024-12-31 18:13:05,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.40 | bwd_microstep: 255.46 | bwd_inner_microstep: 241.67 | bwd_allreduce_microstep: 13.67 | step_microstep: 11.11 +[2024-12-31 18:13:05,786] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2867.52 | bwd: 4249.41 | bwd_inner: 4234.12 | bwd_allreduce: 14.34 | step: 14.39 + 57%|█████▋ | 433/759 [1:03:00<40:10, 7.39s/it] {'loss': 1.2512, 'learning_rate': 8.216831032097689e-06, 'epoch': 0.57} + 57%|█████▋ | 433/759 [1:03:00<40:10, 7.39s/it][2024-12-31 18:13:06,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 225.37 | bwd_microstep: 354.07 | bwd_inner_microstep: 353.71 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:13:06,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.27 | bwd_microstep: 286.89 | bwd_inner_microstep: 286.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:07,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.26 | bwd_microstep: 263.89 | bwd_inner_microstep: 263.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:13:07,795] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.17 | bwd_microstep: 261.58 | bwd_inner_microstep: 261.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:08,238] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.77 | bwd_microstep: 252.90 | bwd_inner_microstep: 252.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:13:08,677] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.46 | bwd_microstep: 250.11 | bwd_inner_microstep: 250.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:09,114] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.66 | bwd_microstep: 247.81 | bwd_inner_microstep: 247.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:09,548] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.28 | bwd_microstep: 246.49 | bwd_inner_microstep: 246.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:13:09,982] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.07 | bwd_microstep: 245.51 | bwd_inner_microstep: 245.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:13:10,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.89 | bwd_microstep: 249.65 | bwd_inner_microstep: 249.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:13:10,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.57 | bwd_microstep: 245.75 | bwd_inner_microstep: 245.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:11,280] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 244.02 | bwd_inner_microstep: 243.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:11,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.19 | bwd_microstep: 240.90 | bwd_inner_microstep: 240.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:12,139] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.96 | bwd_microstep: 244.78 | bwd_inner_microstep: 244.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:12,569] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.11 | bwd_microstep: 244.56 | bwd_inner_microstep: 244.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:13:13,331] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.91 | optimizer_gradients: 0.56 | optimizer_step: 3.11 +[2024-12-31 18:13:13,331] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.69 | bwd_microstep: 563.35 | bwd_inner_microstep: 244.71 | bwd_allreduce_microstep: 318.60 | step_microstep: 10.63 +[2024-12-31 18:13:13,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2823.50 | bwd: 4442.39 | bwd_inner: 4122.91 | bwd_allreduce: 318.85 | step: 13.46 + 57%|█████▋ | 434/759 [1:03:08<40:17, 7.44s/it] {'loss': 1.2384, 'learning_rate': 8.174846823978412e-06, 'epoch': 0.57} + 57%|█████▋ | 434/759 [1:03:08<40:17, 7.44s/it][2024-12-31 18:13:13,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.65 | bwd_microstep: 340.83 | bwd_inner_microstep: 340.47 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:13:14,431] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.32 | bwd_microstep: 309.34 | bwd_inner_microstep: 309.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:14,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.27 | bwd_microstep: 282.10 | bwd_inner_microstep: 282.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:15,378] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.30 | bwd_microstep: 262.45 | bwd_inner_microstep: 262.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:13:15,811] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.40 | bwd_microstep: 250.12 | bwd_inner_microstep: 250.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:16,245] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.88 | bwd_microstep: 245.77 | bwd_inner_microstep: 245.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:16,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.94 | bwd_microstep: 249.18 | bwd_inner_microstep: 249.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:17,122] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.36 | bwd_microstep: 248.88 | bwd_inner_microstep: 248.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:13:17,556] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.59 | bwd_microstep: 246.27 | bwd_inner_microstep: 246.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:17,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.46 | bwd_microstep: 244.66 | bwd_inner_microstep: 244.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:13:18,422] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.78 | bwd_microstep: 245.37 | bwd_inner_microstep: 245.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:13:18,851] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.92 | bwd_microstep: 243.11 | bwd_inner_microstep: 243.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:19,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.69 | bwd_microstep: 243.18 | bwd_inner_microstep: 243.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:19,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.75 | bwd_microstep: 241.13 | bwd_inner_microstep: 241.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:13:20,136] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.29 | bwd_microstep: 241.13 | bwd_inner_microstep: 241.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:20,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.38 | optimizer_gradients: 0.62 | optimizer_step: 3.09 +[2024-12-31 18:13:20,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.40 | bwd_microstep: 496.76 | bwd_inner_microstep: 241.77 | bwd_allreduce_microstep: 254.94 | step_microstep: 11.15 +[2024-12-31 18:13:20,830] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2824.38 | bwd: 4390.34 | bwd_inner: 4134.62 | bwd_allreduce: 255.18 | step: 14.08 + 57%|█████▋ | 435/759 [1:03:15<40:15, 7.46s/it] {'loss': 1.2098, 'learning_rate': 8.132895869773638e-06, 'epoch': 0.57} + 57%|█████▋ | 435/759 [1:03:15<40:15, 7.46s/it][2024-12-31 18:13:21,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.15 | bwd_microstep: 304.22 | bwd_inner_microstep: 303.86 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:13:21,844] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.24 | bwd_microstep: 289.31 | bwd_inner_microstep: 289.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:13:22,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.64 | bwd_microstep: 269.37 | bwd_inner_microstep: 269.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:22,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.13 | bwd_microstep: 265.01 | bwd_inner_microstep: 264.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:23,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.61 | bwd_microstep: 261.94 | bwd_inner_microstep: 261.59 | bwd_allreduce_microstep: 0.24 | step_microstep: 0.29 +[2024-12-31 18:13:23,695] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.17 | bwd_microstep: 264.84 | bwd_inner_microstep: 264.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:13:24,130] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.61 | bwd_microstep: 247.08 | bwd_inner_microstep: 247.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:13:24,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.60 | bwd_microstep: 247.57 | bwd_inner_microstep: 247.36 | bwd_allreduce_microstep: 0.05 | step_microstep: 0.23 +[2024-12-31 18:13:25,013] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 247.34 | bwd_inner_microstep: 247.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:13:25,440] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.32 | bwd_microstep: 246.89 | bwd_inner_microstep: 246.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:25,873] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.63 | bwd_microstep: 245.22 | bwd_inner_microstep: 245.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:26,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.56 | bwd_microstep: 243.78 | bwd_inner_microstep: 243.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:13:26,737] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.32 | bwd_microstep: 242.94 | bwd_inner_microstep: 242.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:13:27,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.05 | bwd_microstep: 241.70 | bwd_inner_microstep: 241.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:27,586] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.00 | bwd_microstep: 243.13 | bwd_inner_microstep: 243.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:28,120] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.55 | optimizer_gradients: 0.95 | optimizer_step: 3.23 +[2024-12-31 18:13:28,120] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.48 | bwd_microstep: 337.86 | bwd_inner_microstep: 243.62 | bwd_allreduce_microstep: 94.18 | step_microstep: 10.96 +[2024-12-31 18:13:28,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2809.25 | bwd: 4198.33 | bwd_inner: 4102.78 | bwd_allreduce: 94.78 | step: 13.84 + 57%|█████▋ | 436/759 [1:03:23<39:52, 7.41s/it] {'loss': 1.2061, 'learning_rate': 8.0909789338211e-06, 'epoch': 0.57} + 57%|█████▋ | 436/759 [1:03:23<39:52, 7.41s/it][2024-12-31 18:13:28,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 227.15 | bwd_microstep: 346.17 | bwd_inner_microstep: 345.82 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:13:29,204] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.33 | bwd_microstep: 287.26 | bwd_inner_microstep: 287.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:13:29,670] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.76 | bwd_microstep: 265.75 | bwd_inner_microstep: 265.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:30,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.42 | bwd_microstep: 257.58 | bwd_inner_microstep: 257.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:13:30,562] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.77 | bwd_microstep: 250.55 | bwd_inner_microstep: 250.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:31,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.54 | bwd_microstep: 247.51 | bwd_inner_microstep: 247.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:13:31,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.45 | bwd_microstep: 245.64 | bwd_inner_microstep: 245.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:31,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 246.91 | bwd_inner_microstep: 246.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:32,312] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.42 | bwd_microstep: 246.72 | bwd_inner_microstep: 246.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.28 +[2024-12-31 18:13:32,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 243.44 | bwd_inner_microstep: 243.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:33,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 243.38 | bwd_inner_microstep: 243.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:33,605] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.82 | bwd_microstep: 244.09 | bwd_inner_microstep: 244.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:13:34,028] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.49 | bwd_microstep: 243.10 | bwd_inner_microstep: 243.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:13:34,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 243.30 | bwd_inner_microstep: 243.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:34,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 245.93 | bwd_inner_microstep: 245.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:35,605] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.04 | optimizer_gradients: 0.60 | optimizer_step: 3.09 +[2024-12-31 18:13:35,605] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.09 | bwd_microstep: 525.34 | bwd_inner_microstep: 242.29 | bwd_allreduce_microstep: 283.01 | step_microstep: 10.53 +[2024-12-31 18:13:35,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2828.44 | bwd: 4382.86 | bwd_inner: 4098.76 | bwd_allreduce: 283.32 | step: 13.24 + 58%|█████▊ | 437/759 [1:03:30<39:52, 7.43s/it] {'loss': 1.2013, 'learning_rate': 8.04909677983872e-06, 'epoch': 0.58} + 58%|█████▊ | 437/759 [1:03:30<39:52, 7.43s/it][2024-12-31 18:13:36,182] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 218.29 | bwd_microstep: 341.61 | bwd_inner_microstep: 341.49 | bwd_allreduce_microstep: 0.05 | step_microstep: 0.09 +[2024-12-31 18:13:36,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 221.38 | bwd_microstep: 354.28 | bwd_inner_microstep: 354.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:13:37,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.63 | bwd_microstep: 269.61 | bwd_inner_microstep: 269.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:13:37,688] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.47 | bwd_microstep: 258.86 | bwd_inner_microstep: 258.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:38,126] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.43 | bwd_microstep: 250.02 | bwd_inner_microstep: 249.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:13:38,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.54 | bwd_microstep: 254.95 | bwd_inner_microstep: 254.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:39,319] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 270.65 | bwd_microstep: 455.78 | bwd_inner_microstep: 455.42 | bwd_allreduce_microstep: 0.25 | step_microstep: 0.30 +[2024-12-31 18:13:39,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.46 | bwd_microstep: 246.76 | bwd_inner_microstep: 246.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:40,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.67 | bwd_microstep: 245.36 | bwd_inner_microstep: 245.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:13:40,627] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.29 | bwd_microstep: 243.83 | bwd_inner_microstep: 243.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:41,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 243.47 | bwd_inner_microstep: 243.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:13:41,488] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.27 | bwd_microstep: 246.80 | bwd_inner_microstep: 246.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:41,918] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.10 | bwd_microstep: 242.51 | bwd_inner_microstep: 242.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:13:42,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.13 | bwd_microstep: 243.19 | bwd_inner_microstep: 243.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:13:42,780] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.90 | bwd_microstep: 243.57 | bwd_inner_microstep: 243.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:43,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.68 | optimizer_gradients: 0.72 | optimizer_step: 3.39 +[2024-12-31 18:13:43,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.51 | bwd_microstep: 255.46 | bwd_inner_microstep: 241.75 | bwd_allreduce_microstep: 13.61 | step_microstep: 11.15 +[2024-12-31 18:13:43,233] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2953.41 | bwd: 4396.26 | bwd_inner: 4381.43 | bwd_allreduce: 14.09 | step: 14.07 + 58%|█████▊ | 438/759 [1:03:38<40:04, 7.49s/it] {'loss': 1.1785, 'learning_rate': 8.00725017091071e-06, 'epoch': 0.58} + 58%|█████▊ | 438/759 [1:03:38<40:04, 7.49s/it][2024-12-31 18:13:43,766] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.39 | bwd_microstep: 308.55 | bwd_inner_microstep: 308.42 | bwd_allreduce_microstep: 0.05 | step_microstep: 0.08 +[2024-12-31 18:13:44,350] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 223.02 | bwd_microstep: 352.60 | bwd_inner_microstep: 352.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:44,823] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 185.99 | bwd_microstep: 269.27 | bwd_inner_microstep: 269.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:45,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.15 | bwd_microstep: 258.55 | bwd_inner_microstep: 258.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:13:45,734] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.24 | bwd_microstep: 248.04 | bwd_inner_microstep: 248.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:13:46,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.36 | bwd_microstep: 247.28 | bwd_inner_microstep: 247.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:13:46,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.33 | bwd_microstep: 254.37 | bwd_inner_microstep: 254.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:13:47,059] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.56 | bwd_microstep: 254.64 | bwd_inner_microstep: 254.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:47,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.82 | bwd_microstep: 244.95 | bwd_inner_microstep: 244.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:47,934] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.74 | bwd_microstep: 254.89 | bwd_inner_microstep: 254.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:13:48,362] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 244.59 | bwd_inner_microstep: 244.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:48,785] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.46 | bwd_microstep: 241.10 | bwd_inner_microstep: 241.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:49,216] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.73 | bwd_microstep: 243.28 | bwd_inner_microstep: 243.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:13:49,648] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 243.25 | bwd_inner_microstep: 243.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:50,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.33 | bwd_microstep: 243.51 | bwd_inner_microstep: 243.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:13:50,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.78 | optimizer_gradients: 0.81 | optimizer_step: 3.13 +[2024-12-31 18:13:50,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.41 | bwd_microstep: 314.91 | bwd_inner_microstep: 241.70 | bwd_allreduce_microstep: 73.17 | step_microstep: 10.81 +[2024-12-31 18:13:50,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2848.88 | bwd: 4223.92 | bwd_inner: 4149.94 | bwd_allreduce: 73.37 | step: 13.51 + 58%|█████▊ | 439/759 [1:03:45<39:43, 7.45s/it] {'loss': 1.2051, 'learning_rate': 7.965439869473664e-06, 'epoch': 0.58} + 58%|█████▊ | 439/759 [1:03:45<39:43, 7.45s/it][2024-12-31 18:13:51,194] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 225.48 | bwd_microstep: 358.80 | bwd_inner_microstep: 358.41 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.19 +[2024-12-31 18:13:51,712] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 199.33 | bwd_microstep: 302.89 | bwd_inner_microstep: 302.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:52,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.47 | bwd_microstep: 289.86 | bwd_inner_microstep: 289.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:52,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.84 | bwd_microstep: 282.27 | bwd_inner_microstep: 282.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:53,160] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.51 | bwd_microstep: 262.76 | bwd_inner_microstep: 262.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:13:53,611] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.21 | bwd_microstep: 256.97 | bwd_inner_microstep: 256.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:54,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.52 | bwd_microstep: 249.19 | bwd_inner_microstep: 249.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:13:54,488] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.08 | bwd_microstep: 249.56 | bwd_inner_microstep: 249.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:54,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.55 | bwd_microstep: 244.63 | bwd_inner_microstep: 244.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:55,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.03 | bwd_microstep: 244.30 | bwd_inner_microstep: 244.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:13:55,786] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.24 | bwd_microstep: 244.98 | bwd_inner_microstep: 244.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:56,218] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.51 | bwd_microstep: 243.95 | bwd_inner_microstep: 243.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:56,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.10 | bwd_microstep: 244.49 | bwd_inner_microstep: 244.02 | bwd_allreduce_microstep: 0.25 | step_microstep: 0.26 +[2024-12-31 18:13:57,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 284.50 | bwd_microstep: 242.76 | bwd_inner_microstep: 242.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:13:57,641] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.44 | bwd_microstep: 257.20 | bwd_inner_microstep: 257.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:13:58,180] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.56 | optimizer_gradients: 0.62 | optimizer_step: 3.84 +[2024-12-31 18:13:58,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.08 | bwd_microstep: 343.24 | bwd_inner_microstep: 240.87 | bwd_allreduce_microstep: 102.32 | step_microstep: 11.15 +[2024-12-31 18:13:58,182] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2979.30 | bwd: 4318.05 | bwd_inner: 4214.33 | bwd_allreduce: 102.85 | step: 14.33 + 58%|█████▊ | 440/759 [1:03:53<39:49, 7.49s/it] {'loss': 1.2502, 'learning_rate': 7.923666637302643e-06, 'epoch': 0.58} + 58%|█████▊ | 440/759 [1:03:53<39:49, 7.49s/it][2024-12-31 18:13:58,714] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.05 | bwd_microstep: 313.70 | bwd_inner_microstep: 313.34 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.21 +[2024-12-31 18:13:59,216] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.10 | bwd_microstep: 292.02 | bwd_inner_microstep: 291.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:13:59,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.75 | bwd_microstep: 267.88 | bwd_inner_microstep: 267.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:14:00,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.81 | bwd_microstep: 266.71 | bwd_inner_microstep: 266.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:14:00,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.03 | bwd_microstep: 258.19 | bwd_inner_microstep: 258.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:14:01,044] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.12 | bwd_microstep: 250.05 | bwd_inner_microstep: 250.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:01,481] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.08 | bwd_microstep: 247.82 | bwd_inner_microstep: 247.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:01,931] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.20 | bwd_microstep: 261.47 | bwd_inner_microstep: 261.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:02,366] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 244.65 | bwd_inner_microstep: 244.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:14:02,799] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 244.23 | bwd_inner_microstep: 244.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:03,300] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.71 | bwd_microstep: 313.27 | bwd_inner_microstep: 313.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:14:03,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.10 | bwd_microstep: 243.96 | bwd_inner_microstep: 243.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:14:04,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 243.19 | bwd_inner_microstep: 243.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:04,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 245.99 | bwd_inner_microstep: 245.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:14:05,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 227.78 | bwd_microstep: 240.95 | bwd_inner_microstep: 240.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:14:05,603] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.22 | optimizer_gradients: 8.85 | optimizer_step: 7.78 +[2024-12-31 18:14:05,603] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.20 | bwd_microstep: 309.51 | bwd_inner_microstep: 244.77 | bwd_allreduce_microstep: 64.69 | step_microstep: 25.24 +[2024-12-31 18:14:05,605] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2867.74 | bwd: 4243.68 | bwd_inner: 4178.17 | bwd_allreduce: 64.94 | step: 28.28 + 58%|█████▊ | 441/759 [1:04:00<39:35, 7.47s/it] {'loss': 1.1977, 'learning_rate': 7.881931235497324e-06, 'epoch': 0.58} + 58%|█████▊ | 441/759 [1:04:00<39:35, 7.47s/it][2024-12-31 18:14:06,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 220.43 | bwd_microstep: 350.65 | bwd_inner_microstep: 350.31 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:14:06,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.28 | bwd_microstep: 292.09 | bwd_inner_microstep: 292.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:14:07,191] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.82 | bwd_microstep: 286.75 | bwd_inner_microstep: 286.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:14:07,660] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.62 | bwd_microstep: 267.88 | bwd_inner_microstep: 267.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:14:08,119] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.25 | bwd_microstep: 264.71 | bwd_inner_microstep: 264.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:14:08,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.41 | bwd_microstep: 255.67 | bwd_inner_microstep: 255.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:09,026] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.49 | bwd_microstep: 255.52 | bwd_inner_microstep: 255.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:09,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.69 | bwd_microstep: 248.17 | bwd_inner_microstep: 248.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:14:09,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.51 | bwd_microstep: 246.89 | bwd_inner_microstep: 246.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:14:10,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.92 | bwd_microstep: 247.42 | bwd_inner_microstep: 247.03 | bwd_allreduce_microstep: 0.21 | step_microstep: 0.34 +[2024-12-31 18:14:10,771] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.73 | bwd_microstep: 244.20 | bwd_inner_microstep: 244.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:14:11,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.49 | bwd_microstep: 243.29 | bwd_inner_microstep: 243.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:11,635] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.82 | bwd_microstep: 245.60 | bwd_inner_microstep: 245.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:14:12,067] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.21 | bwd_microstep: 244.91 | bwd_inner_microstep: 244.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:14:12,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.00 | bwd_microstep: 243.06 | bwd_inner_microstep: 243.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:14:12,956] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.58 | optimizer_gradients: 1.03 | optimizer_step: 3.43 +[2024-12-31 18:14:12,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.20 | bwd_microstep: 255.75 | bwd_inner_microstep: 241.97 | bwd_allreduce_microstep: 13.72 | step_microstep: 11.28 +[2024-12-31 18:14:12,959] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2857.22 | bwd: 4192.68 | bwd_inner: 4177.75 | bwd_allreduce: 14.22 | step: 14.34 + 58%|█████▊ | 442/759 [1:04:07<39:17, 7.44s/it] {'loss': 1.1519, 'learning_rate': 7.84023442446813e-06, 'epoch': 0.58} + 58%|█████▊ | 442/759 [1:04:07<39:17, 7.44s/it][2024-12-31 18:14:13,529] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.01 | bwd_microstep: 339.82 | bwd_inner_microstep: 339.46 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:14:14,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.15 | bwd_microstep: 288.07 | bwd_inner_microstep: 288.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:14:14,488] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.14 | bwd_microstep: 266.34 | bwd_inner_microstep: 266.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:14:14,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.64 | bwd_microstep: 255.42 | bwd_inner_microstep: 255.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:14:15,396] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.54 | bwd_microstep: 255.56 | bwd_inner_microstep: 255.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:14:15,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.69 | bwd_microstep: 247.10 | bwd_inner_microstep: 247.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:14:16,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.65 | bwd_microstep: 247.60 | bwd_inner_microstep: 247.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:16,714] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.20 | bwd_microstep: 258.74 | bwd_inner_microstep: 258.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:17,147] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.88 | bwd_microstep: 244.85 | bwd_inner_microstep: 244.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:17,587] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.63 | bwd_microstep: 253.59 | bwd_inner_microstep: 253.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.31 +[2024-12-31 18:14:18,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.22 | bwd_microstep: 246.13 | bwd_inner_microstep: 246.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:18,453] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.08 | bwd_microstep: 242.93 | bwd_inner_microstep: 242.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:14:18,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.39 | bwd_microstep: 244.28 | bwd_inner_microstep: 244.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:14:19,323] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.62 | bwd_microstep: 249.94 | bwd_inner_microstep: 249.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:14:19,754] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.30 | bwd_microstep: 247.98 | bwd_inner_microstep: 247.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:14:20,245] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.40 | optimizer_gradients: 0.61 | optimizer_step: 3.09 +[2024-12-31 18:14:20,246] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.36 | bwd_microstep: 293.60 | bwd_inner_microstep: 246.13 | bwd_allreduce_microstep: 47.41 | step_microstep: 11.44 +[2024-12-31 18:14:20,247] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2816.89 | bwd: 4182.05 | bwd_inner: 4133.78 | bwd_allreduce: 47.66 | step: 14.28 + 58%|█████▊ | 443/759 [1:04:15<38:55, 7.39s/it] {'loss': 1.2102, 'learning_rate': 7.798576963922347e-06, 'epoch': 0.58} + 58%|█████▊ | 443/759 [1:04:15<38:55, 7.39s/it][2024-12-31 18:14:20,844] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 222.63 | bwd_microstep: 354.14 | bwd_inner_microstep: 353.73 | bwd_allreduce_microstep: 0.17 | step_microstep: 0.20 +[2024-12-31 18:14:21,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.65 | bwd_microstep: 293.89 | bwd_inner_microstep: 293.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:14:21,849] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.20 | bwd_microstep: 286.86 | bwd_inner_microstep: 286.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:14:22,291] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.68 | bwd_microstep: 250.02 | bwd_inner_microstep: 249.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:14:22,737] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.83 | bwd_microstep: 247.59 | bwd_inner_microstep: 247.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:23,176] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.43 | bwd_microstep: 249.79 | bwd_inner_microstep: 249.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:14:23,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.91 | bwd_microstep: 245.73 | bwd_inner_microstep: 245.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:14:24,046] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 245.75 | bwd_inner_microstep: 245.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:14:24,481] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.31 | bwd_microstep: 246.37 | bwd_inner_microstep: 246.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:24,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 245.53 | bwd_inner_microstep: 245.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:25,355] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.60 | bwd_microstep: 244.89 | bwd_inner_microstep: 244.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:14:25,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.39 | bwd_microstep: 243.82 | bwd_inner_microstep: 243.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:14:26,219] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.00 | bwd_microstep: 244.66 | bwd_inner_microstep: 244.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:14:26,648] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.35 | bwd_microstep: 243.19 | bwd_inner_microstep: 243.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:27,083] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 246.07 | bwd_inner_microstep: 246.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:14:27,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 1.06 | optimizer_step: 3.38 +[2024-12-31 18:14:27,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.68 | bwd_microstep: 256.09 | bwd_inner_microstep: 242.31 | bwd_allreduce_microstep: 13.67 | step_microstep: 11.57 +[2024-12-31 18:14:27,538] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2840.50 | bwd: 4144.54 | bwd_inner: 4129.81 | bwd_allreduce: 14.03 | step: 14.68 + 58%|█████▊ | 444/759 [1:04:22<38:39, 7.36s/it] {'loss': 1.2243, 'learning_rate': 7.75695961285032e-06, 'epoch': 0.58} + 58%|█████▊ | 444/759 [1:04:22<38:39, 7.36s/it][2024-12-31 18:14:28,136] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 224.81 | bwd_microstep: 358.80 | bwd_inner_microstep: 358.45 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:14:28,863] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 265.32 | bwd_microstep: 443.96 | bwd_inner_microstep: 443.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:14:29,323] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.52 | bwd_microstep: 263.35 | bwd_inner_microstep: 263.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:29,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.62 | bwd_microstep: 248.29 | bwd_inner_microstep: 248.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:30,198] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.60 | bwd_microstep: 248.91 | bwd_inner_microstep: 248.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:14:30,634] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.91 | bwd_microstep: 247.63 | bwd_inner_microstep: 247.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:14:31,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.16 | bwd_microstep: 247.45 | bwd_inner_microstep: 247.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:14:31,506] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.64 | bwd_microstep: 247.43 | bwd_inner_microstep: 247.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:14:31,937] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.06 | bwd_microstep: 244.50 | bwd_inner_microstep: 244.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:14:32,371] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.95 | bwd_microstep: 243.61 | bwd_inner_microstep: 243.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:14:32,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.89 | bwd_microstep: 243.70 | bwd_inner_microstep: 243.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:14:33,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 243.05 | bwd_inner_microstep: 243.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:33,662] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.92 | bwd_microstep: 241.20 | bwd_inner_microstep: 241.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:34,092] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.45 | bwd_microstep: 243.58 | bwd_inner_microstep: 243.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:34,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.26 | bwd_microstep: 240.86 | bwd_inner_microstep: 240.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:34,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.58 | optimizer_gradients: 0.94 | optimizer_step: 3.25 +[2024-12-31 18:14:34,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.56 | bwd_microstep: 255.06 | bwd_inner_microstep: 241.32 | bwd_allreduce_microstep: 13.59 | step_microstep: 11.52 +[2024-12-31 18:14:34,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2882.16 | bwd: 4261.50 | bwd_inner: 4246.97 | bwd_allreduce: 13.88 | step: 14.53 + 59%|█████▊ | 445/759 [1:04:29<38:38, 7.38s/it] {'loss': 1.2375, 'learning_rate': 7.71538312951161e-06, 'epoch': 0.59} + 59%|█████▊ | 445/759 [1:04:29<38:38, 7.38s/it][2024-12-31 18:14:35,566] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 224.32 | bwd_microstep: 358.02 | bwd_inner_microstep: 357.67 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:14:36,068] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.78 | bwd_microstep: 293.01 | bwd_inner_microstep: 292.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:14:36,556] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.41 | bwd_microstep: 282.73 | bwd_inner_microstep: 282.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:37,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.62 | bwd_microstep: 254.65 | bwd_inner_microstep: 254.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:37,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.15 | bwd_microstep: 249.13 | bwd_inner_microstep: 249.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:14:37,867] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.27 | bwd_microstep: 247.76 | bwd_inner_microstep: 247.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:14:38,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.95 | bwd_microstep: 247.40 | bwd_inner_microstep: 247.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:14:38,738] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.13 | bwd_microstep: 244.42 | bwd_inner_microstep: 244.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:14:39,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 246.30 | bwd_inner_microstep: 246.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:39,606] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.91 | bwd_microstep: 244.89 | bwd_inner_microstep: 244.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:40,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.13 | bwd_microstep: 243.98 | bwd_inner_microstep: 243.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:14:40,469] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 244.17 | bwd_inner_microstep: 244.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:14:40,901] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.28 | bwd_microstep: 244.25 | bwd_inner_microstep: 244.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:14:41,301] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 157.99 | bwd_microstep: 225.05 | bwd_inner_microstep: 225.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:14:41,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 244.29 | bwd_inner_microstep: 244.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:14:42,189] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.92 | optimizer_gradients: 0.70 | optimizer_step: 3.24 +[2024-12-31 18:14:42,189] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.16 | bwd_microstep: 259.10 | bwd_inner_microstep: 245.50 | bwd_allreduce_microstep: 13.51 | step_microstep: 11.36 +[2024-12-31 18:14:42,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2814.41 | bwd: 4129.23 | bwd_inner: 4114.87 | bwd_allreduce: 13.79 | step: 14.31 + 59%|█████▉ | 446/759 [1:04:37<38:15, 7.33s/it] {'loss': 1.2186, 'learning_rate': 7.673848271421166e-06, 'epoch': 0.59} + 59%|█████▉ | 446/759 [1:04:37<38:15, 7.33s/it][2024-12-31 18:14:42,723] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.35 | bwd_microstep: 313.75 | bwd_inner_microstep: 313.39 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:14:43,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.02 | bwd_microstep: 292.56 | bwd_inner_microstep: 292.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:43,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.49 | bwd_microstep: 269.32 | bwd_inner_microstep: 269.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:14:44,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.59 | bwd_microstep: 269.04 | bwd_inner_microstep: 269.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:14:44,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.18 | bwd_microstep: 255.08 | bwd_inner_microstep: 255.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:45,048] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.38 | bwd_microstep: 247.53 | bwd_inner_microstep: 247.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:14:45,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.05 | bwd_microstep: 254.24 | bwd_inner_microstep: 254.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:45,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.74 | bwd_microstep: 249.98 | bwd_inner_microstep: 249.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:14:46,366] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.84 | bwd_microstep: 246.16 | bwd_inner_microstep: 246.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:14:46,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.27 | bwd_microstep: 245.51 | bwd_inner_microstep: 245.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:14:47,244] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 255.09 | bwd_inner_microstep: 255.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:47,686] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.91 | bwd_microstep: 254.41 | bwd_inner_microstep: 253.95 | bwd_allreduce_microstep: 0.29 | step_microstep: 0.26 +[2024-12-31 18:14:48,119] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 243.68 | bwd_inner_microstep: 243.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:14:48,544] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.55 | bwd_microstep: 243.37 | bwd_inner_microstep: 243.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:48,971] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.87 | bwd_microstep: 241.37 | bwd_inner_microstep: 241.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:49,433] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.72 | optimizer_step: 3.38 +[2024-12-31 18:14:49,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.23 | bwd_microstep: 266.19 | bwd_inner_microstep: 243.70 | bwd_allreduce_microstep: 22.40 | step_microstep: 10.94 +[2024-12-31 18:14:49,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2811.03 | bwd: 4147.45 | bwd_inner: 4123.61 | bwd_allreduce: 23.00 | step: 13.90 + 59%|█████▉ | 447/759 [1:04:44<37:59, 7.31s/it] {'loss': 1.2363, 'learning_rate': 7.632355795335533e-06, 'epoch': 0.59} + 59%|█████▉ | 447/759 [1:04:44<37:59, 7.31s/it][2024-12-31 18:14:50,019] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 220.70 | bwd_microstep: 349.22 | bwd_inner_microstep: 348.87 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:14:50,531] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.47 | bwd_microstep: 297.08 | bwd_inner_microstep: 297.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:14:50,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.00 | bwd_microstep: 257.38 | bwd_inner_microstep: 257.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:14:51,441] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.74 | bwd_microstep: 261.35 | bwd_inner_microstep: 261.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:51,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.85 | bwd_microstep: 256.63 | bwd_inner_microstep: 256.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:14:52,324] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.75 | bwd_microstep: 249.19 | bwd_inner_microstep: 249.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:14:52,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.25 | bwd_microstep: 245.12 | bwd_inner_microstep: 245.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:14:53,187] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.20 | bwd_microstep: 244.12 | bwd_inner_microstep: 244.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:53,619] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 244.75 | bwd_inner_microstep: 244.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:54,050] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.82 | bwd_microstep: 244.64 | bwd_inner_microstep: 244.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:54,481] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.97 | bwd_microstep: 243.39 | bwd_inner_microstep: 243.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:54,913] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.22 | bwd_microstep: 244.58 | bwd_inner_microstep: 244.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:14:55,343] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.67 | bwd_microstep: 243.58 | bwd_inner_microstep: 243.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:55,771] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.58 | bwd_microstep: 242.39 | bwd_inner_microstep: 242.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:56,200] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.32 | bwd_microstep: 242.66 | bwd_inner_microstep: 242.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:56,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.58 | optimizer_gradients: 1.10 | optimizer_step: 3.49 +[2024-12-31 18:14:56,653] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.62 | bwd_microstep: 256.37 | bwd_inner_microstep: 242.18 | bwd_allreduce_microstep: 14.10 | step_microstep: 12.14 +[2024-12-31 18:14:56,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2821.73 | bwd: 4122.51 | bwd_inner: 4107.59 | bwd_allreduce: 14.36 | step: 14.93 + 59%|█████▉ | 448/759 [1:04:51<37:44, 7.28s/it] {'loss': 1.2394, 'learning_rate': 7.590906457239073e-06, 'epoch': 0.59} + 59%|█████▉ | 448/759 [1:04:51<37:44, 7.28s/it][2024-12-31 18:14:57,227] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.54 | bwd_microstep: 341.35 | bwd_inner_microstep: 340.99 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:14:57,841] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 227.83 | bwd_microstep: 368.69 | bwd_inner_microstep: 368.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:14:58,302] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.02 | bwd_microstep: 263.74 | bwd_inner_microstep: 263.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:14:58,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.01 | bwd_microstep: 250.22 | bwd_inner_microstep: 250.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.10 +[2024-12-31 18:14:59,189] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.60 | bwd_microstep: 256.93 | bwd_inner_microstep: 256.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:14:59,625] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.78 | bwd_microstep: 247.92 | bwd_inner_microstep: 247.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:15:00,059] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.15 | bwd_microstep: 247.63 | bwd_inner_microstep: 247.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:15:00,494] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 246.07 | bwd_inner_microstep: 246.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:15:00,935] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 252.33 | bwd_inner_microstep: 252.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:15:01,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.60 | bwd_microstep: 245.31 | bwd_inner_microstep: 245.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:15:01,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.76 | bwd_microstep: 244.28 | bwd_inner_microstep: 244.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:02,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.50 | bwd_microstep: 243.87 | bwd_inner_microstep: 243.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:15:02,670] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 245.93 | bwd_inner_microstep: 245.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:15:03,096] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.74 | bwd_microstep: 240.66 | bwd_inner_microstep: 240.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:15:03,523] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.95 | bwd_microstep: 241.46 | bwd_inner_microstep: 241.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:03,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.22 | optimizer_gradients: 0.57 | optimizer_step: 3.09 +[2024-12-31 18:15:03,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.80 | bwd_microstep: 280.82 | bwd_inner_microstep: 241.71 | bwd_allreduce_microstep: 39.06 | step_microstep: 11.14 +[2024-12-31 18:15:04,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2844.82 | bwd: 4217.25 | bwd_inner: 4177.42 | bwd_allreduce: 39.31 | step: 14.28 + 59%|█████▉ | 449/759 [1:04:58<37:42, 7.30s/it] {'loss': 1.214, 'learning_rate': 7.549501012330184e-06, 'epoch': 0.59} + 59%|█████▉ | 449/759 [1:04:58<37:42, 7.30s/it][2024-12-31 18:15:04,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 220.40 | bwd_microstep: 335.14 | bwd_inner_microstep: 334.76 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:15:05,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.76 | bwd_microstep: 312.51 | bwd_inner_microstep: 312.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:05,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.04 | bwd_microstep: 281.71 | bwd_inner_microstep: 281.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:15:06,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.06 | bwd_microstep: 281.78 | bwd_inner_microstep: 281.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:15:06,541] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.44 | bwd_microstep: 264.01 | bwd_inner_microstep: 263.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:15:06,991] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.66 | bwd_microstep: 256.37 | bwd_inner_microstep: 256.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:15:07,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.34 | bwd_microstep: 248.51 | bwd_inner_microstep: 248.03 | bwd_allreduce_microstep: 0.19 | step_microstep: 0.32 +[2024-12-31 18:15:07,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 246.41 | bwd_inner_microstep: 246.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:15:08,302] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.47 | bwd_microstep: 247.33 | bwd_inner_microstep: 247.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:15:08,725] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.20 | bwd_microstep: 244.38 | bwd_inner_microstep: 244.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:09,156] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.56 | bwd_microstep: 243.66 | bwd_inner_microstep: 243.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:15:09,588] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.90 | bwd_microstep: 243.05 | bwd_inner_microstep: 243.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:15:10,021] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.77 | bwd_microstep: 244.73 | bwd_inner_microstep: 244.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:10,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.49 | bwd_microstep: 244.28 | bwd_inner_microstep: 244.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:15:10,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.42 | bwd_microstep: 240.83 | bwd_inner_microstep: 240.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:11,331] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.59 | optimizer_gradients: 0.73 | optimizer_step: 3.27 +[2024-12-31 18:15:11,331] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 257.89 | bwd_inner_microstep: 244.26 | bwd_allreduce_microstep: 13.54 | step_microstep: 10.53 +[2024-12-31 18:15:11,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2857.70 | bwd: 4192.72 | bwd_inner: 4177.85 | bwd_allreduce: 14.02 | step: 13.51 + 59%|█████▉ | 450/759 [1:05:06<37:38, 7.31s/it] {'loss': 1.2012, 'learning_rate': 7.508140215007526e-06, 'epoch': 0.59} + 59%|█████▉ | 450/759 [1:05:06<37:38, 7.31s/it][2024-12-31 18:15:11,936] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 222.82 | bwd_microstep: 360.76 | bwd_inner_microstep: 360.42 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.22 +[2024-12-31 18:15:12,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 196.74 | bwd_microstep: 297.35 | bwd_inner_microstep: 297.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:12,937] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.21 | bwd_microstep: 283.94 | bwd_inner_microstep: 283.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:15:13,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.65 | bwd_microstep: 254.97 | bwd_inner_microstep: 254.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:13,843] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.97 | bwd_microstep: 255.61 | bwd_inner_microstep: 255.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:14,280] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.87 | bwd_microstep: 248.87 | bwd_inner_microstep: 248.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:15:14,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.17 | bwd_microstep: 247.16 | bwd_inner_microstep: 247.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:15:15,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.35 | bwd_microstep: 248.77 | bwd_inner_microstep: 248.33 | bwd_allreduce_microstep: 0.26 | step_microstep: 0.27 +[2024-12-31 18:15:15,582] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.57 | bwd_microstep: 244.41 | bwd_inner_microstep: 244.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:15:16,013] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.32 | bwd_microstep: 243.02 | bwd_inner_microstep: 242.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:16,445] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.94 | bwd_microstep: 243.87 | bwd_inner_microstep: 243.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:16,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.68 | bwd_microstep: 243.20 | bwd_inner_microstep: 243.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:17,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.48 | bwd_microstep: 243.27 | bwd_inner_microstep: 243.11 | bwd_allreduce_microstep: 0.06 | step_microstep: 0.27 +[2024-12-31 18:15:17,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.29 | bwd_microstep: 243.77 | bwd_inner_microstep: 243.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:18,175] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.18 | bwd_microstep: 242.99 | bwd_inner_microstep: 242.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:15:18,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.78 | optimizer_gradients: 0.60 | optimizer_step: 3.09 +[2024-12-31 18:15:18,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.06 | bwd_microstep: 357.13 | bwd_inner_microstep: 241.17 | bwd_allreduce_microstep: 115.92 | step_microstep: 11.51 +[2024-12-31 18:15:18,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2850.74 | bwd: 4259.28 | bwd_inner: 4141.90 | bwd_allreduce: 116.50 | step: 14.55 + 59%|█████▉ | 451/759 [1:05:13<37:39, 7.34s/it] {'loss': 1.2206, 'learning_rate': 7.466824818856296e-06, 'epoch': 0.59} + 59%|█████▉ | 451/759 [1:05:13<37:39, 7.34s/it][2024-12-31 18:15:19,289] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.62 | bwd_microstep: 341.44 | bwd_inner_microstep: 341.09 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.19 +[2024-12-31 18:15:19,949] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 241.37 | bwd_microstep: 400.02 | bwd_inner_microstep: 400.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:15:20,405] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.59 | bwd_microstep: 256.51 | bwd_inner_microstep: 256.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:15:20,835] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.83 | bwd_microstep: 247.91 | bwd_inner_microstep: 247.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:21,269] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.39 | bwd_microstep: 246.10 | bwd_inner_microstep: 246.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:15:21,703] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.34 | bwd_microstep: 245.28 | bwd_inner_microstep: 245.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:15:22,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 185.82 | bwd_microstep: 245.74 | bwd_inner_microstep: 245.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:15:22,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 244.93 | bwd_inner_microstep: 244.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:23,021] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.08 | bwd_microstep: 244.34 | bwd_inner_microstep: 244.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:15:23,489] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.32 | bwd_microstep: 280.08 | bwd_inner_microstep: 280.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:15:23,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.08 | bwd_microstep: 245.21 | bwd_inner_microstep: 245.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:15:24,348] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 242.42 | bwd_inner_microstep: 242.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:15:24,776] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.73 | bwd_microstep: 241.28 | bwd_inner_microstep: 241.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:25,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.13 | bwd_microstep: 240.77 | bwd_inner_microstep: 240.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:15:25,635] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.31 | bwd_microstep: 241.37 | bwd_inner_microstep: 241.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:26,088] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 1.09 | optimizer_step: 3.48 +[2024-12-31 18:15:26,089] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.45 | bwd_microstep: 256.39 | bwd_inner_microstep: 242.78 | bwd_allreduce_microstep: 13.51 | step_microstep: 11.69 +[2024-12-31 18:15:26,091] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2858.98 | bwd: 4219.94 | bwd_inner: 4205.51 | bwd_allreduce: 13.80 | step: 14.36 + 60%|█████▉ | 452/759 [1:05:21<37:34, 7.34s/it] {'loss': 1.2179, 'learning_rate': 7.4255555766345025e-06, 'epoch': 0.6} + 60%|█████▉ | 452/759 [1:05:21<37:34, 7.34s/it][2024-12-31 18:15:26,712] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 237.13 | bwd_microstep: 370.32 | bwd_inner_microstep: 369.98 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:15:27,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 227.47 | bwd_microstep: 293.37 | bwd_inner_microstep: 293.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:27,711] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.85 | bwd_microstep: 262.95 | bwd_inner_microstep: 262.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:15:28,168] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.26 | bwd_microstep: 261.80 | bwd_inner_microstep: 261.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:15:28,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.54 | bwd_microstep: 256.41 | bwd_inner_microstep: 256.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:29,062] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.72 | bwd_microstep: 254.62 | bwd_inner_microstep: 254.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:29,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.76 | bwd_microstep: 246.87 | bwd_inner_microstep: 246.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:15:29,935] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.21 | bwd_microstep: 248.45 | bwd_inner_microstep: 248.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:30,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 245.72 | bwd_inner_microstep: 245.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:15:30,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.99 | bwd_microstep: 251.63 | bwd_inner_microstep: 251.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:15:31,244] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 246.54 | bwd_inner_microstep: 246.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:31,677] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 244.65 | bwd_inner_microstep: 244.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:32,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.66 | bwd_microstep: 244.45 | bwd_inner_microstep: 244.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:32,541] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 244.41 | bwd_inner_microstep: 244.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:15:32,979] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.78 | bwd_microstep: 251.38 | bwd_inner_microstep: 251.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:15:33,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.59 | optimizer_gradients: 11.90 | optimizer_step: 3.65 +[2024-12-31 18:15:33,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.81 | bwd_microstep: 257.55 | bwd_inner_microstep: 243.84 | bwd_allreduce_microstep: 13.58 | step_microstep: 22.55 +[2024-12-31 18:15:33,449] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2873.20 | bwd: 4181.25 | bwd_inner: 4166.74 | bwd_allreduce: 13.87 | step: 25.45 + 60%|█████▉ | 453/759 [1:05:28<37:28, 7.35s/it] {'loss': 1.2296, 'learning_rate': 7.384333240259216e-06, 'epoch': 0.6} + 60%|█████▉ | 453/759 [1:05:28<37:28, 7.35s/it][2024-12-31 18:15:34,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.93 | bwd_microstep: 342.68 | bwd_inner_microstep: 342.29 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 18:15:34,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.66 | bwd_microstep: 290.15 | bwd_inner_microstep: 290.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:35,012] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.31 | bwd_microstep: 281.25 | bwd_inner_microstep: 281.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:35,485] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.22 | bwd_microstep: 276.61 | bwd_inner_microstep: 276.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:35,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.05 | bwd_microstep: 256.85 | bwd_inner_microstep: 256.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:36,378] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.22 | bwd_microstep: 249.29 | bwd_inner_microstep: 249.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:36,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.32 | bwd_microstep: 250.93 | bwd_inner_microstep: 250.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:15:37,255] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.38 | bwd_microstep: 246.72 | bwd_inner_microstep: 246.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:15:37,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 245.08 | bwd_inner_microstep: 245.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:15:38,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.49 | bwd_microstep: 246.15 | bwd_inner_microstep: 246.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:15:38,542] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 244.15 | bwd_inner_microstep: 244.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:15:38,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.71 | bwd_microstep: 255.64 | bwd_inner_microstep: 255.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:15:39,416] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 243.93 | bwd_inner_microstep: 243.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:15:39,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 243.05 | bwd_inner_microstep: 243.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:15:40,275] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.60 | bwd_microstep: 243.15 | bwd_inner_microstep: 243.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:15:40,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 7.51 | optimizer_gradients: 0.73 | optimizer_step: 6.52 +[2024-12-31 18:15:40,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 157.40 | bwd_microstep: 240.31 | bwd_inner_microstep: 226.67 | bwd_allreduce_microstep: 13.54 | step_microstep: 16.70 +[2024-12-31 18:15:40,707] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2813.93 | bwd: 4156.04 | bwd_inner: 4141.60 | bwd_allreduce: 13.83 | step: 19.65 + 60%|█████▉ | 454/759 [1:05:35<37:12, 7.32s/it] {'loss': 1.1944, 'learning_rate': 7.34315856079291e-06, 'epoch': 0.6} + 60%|█████▉ | 454/759 [1:05:35<37:12, 7.32s/it][2024-12-31 18:15:41,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.62 | bwd_microstep: 335.67 | bwd_inner_microstep: 335.39 | bwd_allreduce_microstep: 0.11 | step_microstep: 0.15 +[2024-12-31 18:15:41,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.24 | bwd_microstep: 286.71 | bwd_inner_microstep: 286.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:15:42,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.43 | bwd_microstep: 287.60 | bwd_inner_microstep: 287.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:15:42,718] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.27 | bwd_microstep: 264.60 | bwd_inner_microstep: 264.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:15:43,168] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.66 | bwd_microstep: 255.46 | bwd_inner_microstep: 255.25 | bwd_allreduce_microstep: 0.11 | step_microstep: 0.19 +[2024-12-31 18:15:43,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.99 | bwd_microstep: 249.16 | bwd_inner_microstep: 249.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:44,043] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.39 | bwd_microstep: 246.54 | bwd_inner_microstep: 246.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:15:44,478] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 247.45 | bwd_inner_microstep: 247.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:15:44,910] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 244.60 | bwd_inner_microstep: 244.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:45,344] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.53 | bwd_microstep: 245.42 | bwd_inner_microstep: 245.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:15:45,771] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.88 | bwd_microstep: 240.97 | bwd_inner_microstep: 240.66 | bwd_allreduce_microstep: 0.21 | step_microstep: 0.28 +[2024-12-31 18:15:46,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.60 | bwd_microstep: 243.74 | bwd_inner_microstep: 243.38 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.24 +[2024-12-31 18:15:46,632] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.27 | bwd_microstep: 240.25 | bwd_inner_microstep: 240.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:47,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 243.68 | bwd_inner_microstep: 243.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:47,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.40 | bwd_microstep: 242.98 | bwd_inner_microstep: 242.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:48,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.01 | optimizer_gradients: 6.52 | optimizer_step: 3.27 +[2024-12-31 18:15:48,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.08 | bwd_microstep: 527.11 | bwd_inner_microstep: 242.36 | bwd_allreduce_microstep: 284.70 | step_microstep: 19.29 +[2024-12-31 18:15:48,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2822.25 | bwd: 4402.14 | bwd_inner: 4115.80 | bwd_allreduce: 285.43 | step: 22.46 + 60%|█████▉ | 455/759 [1:05:43<37:25, 7.39s/it] {'loss': 1.2263, 'learning_rate': 7.3020322884297565e-06, 'epoch': 0.6} + 60%|█████▉ | 455/759 [1:05:43<37:25, 7.39s/it][2024-12-31 18:15:48,740] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.16 | bwd_microstep: 291.20 | bwd_inner_microstep: 290.83 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:15:49,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.41 | bwd_microstep: 285.57 | bwd_inner_microstep: 285.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:15:49,717] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.55 | bwd_microstep: 280.78 | bwd_inner_microstep: 280.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:15:50,175] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.43 | bwd_microstep: 261.89 | bwd_inner_microstep: 261.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:50,626] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.46 | bwd_microstep: 257.51 | bwd_inner_microstep: 257.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:51,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.41 | bwd_microstep: 258.51 | bwd_inner_microstep: 258.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:51,519] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.03 | bwd_microstep: 250.22 | bwd_inner_microstep: 250.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:15:51,955] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.21 | bwd_microstep: 247.45 | bwd_inner_microstep: 247.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:15:52,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.58 | bwd_microstep: 247.67 | bwd_inner_microstep: 247.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:15:52,825] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.73 | bwd_microstep: 245.57 | bwd_inner_microstep: 245.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:53,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.29 | bwd_microstep: 243.83 | bwd_inner_microstep: 243.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:53,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.70 | bwd_microstep: 244.10 | bwd_inner_microstep: 244.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:15:54,119] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.05 | bwd_microstep: 244.25 | bwd_inner_microstep: 244.11 | bwd_allreduce_microstep: 0.03 | step_microstep: 0.22 +[2024-12-31 18:15:54,552] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 244.29 | bwd_inner_microstep: 244.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:54,984] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.54 | bwd_microstep: 243.94 | bwd_inner_microstep: 243.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:15:55,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.41 | optimizer_gradients: 0.56 | optimizer_step: 3.08 +[2024-12-31 18:15:55,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 406.59 | bwd_inner_microstep: 249.53 | bwd_allreduce_microstep: 157.01 | step_microstep: 14.60 +[2024-12-31 18:15:55,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2806.76 | bwd: 4253.45 | bwd_inner: 4095.51 | bwd_allreduce: 157.33 | step: 17.47 + 60%|██████ | 456/759 [1:05:50<37:14, 7.37s/it] {'loss': 1.2106, 'learning_rate': 7.260955172481959e-06, 'epoch': 0.6} + 60%|██████ | 456/759 [1:05:50<37:14, 7.37s/it][2024-12-31 18:15:56,194] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 226.74 | bwd_microstep: 361.99 | bwd_inner_microstep: 361.66 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:15:56,699] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 195.07 | bwd_microstep: 292.13 | bwd_inner_microstep: 292.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:15:57,159] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.58 | bwd_microstep: 263.53 | bwd_inner_microstep: 263.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:15:57,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.63 | bwd_microstep: 263.02 | bwd_inner_microstep: 262.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:15:58,059] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.03 | bwd_microstep: 250.85 | bwd_inner_microstep: 250.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:58,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.38 | bwd_microstep: 248.91 | bwd_inner_microstep: 248.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:15:58,933] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.11 | bwd_microstep: 247.81 | bwd_inner_microstep: 247.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:15:59,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.35 | bwd_microstep: 247.20 | bwd_inner_microstep: 247.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:15:59,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.23 | bwd_microstep: 246.27 | bwd_inner_microstep: 246.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:16:00,241] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.63 | bwd_microstep: 246.47 | bwd_inner_microstep: 246.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:16:00,675] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 244.54 | bwd_inner_microstep: 244.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:16:01,112] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.54 | bwd_microstep: 245.19 | bwd_inner_microstep: 245.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:01,545] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.89 | bwd_microstep: 244.06 | bwd_inner_microstep: 244.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:01,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.21 | bwd_microstep: 244.62 | bwd_inner_microstep: 244.38 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.30 +[2024-12-31 18:16:02,412] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.91 | bwd_microstep: 244.61 | bwd_inner_microstep: 244.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:16:02,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.94 | optimizer_gradients: 0.73 | optimizer_step: 3.23 +[2024-12-31 18:16:02,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.00 | bwd_microstep: 263.37 | bwd_inner_microstep: 245.36 | bwd_allreduce_microstep: 17.92 | step_microstep: 11.36 +[2024-12-31 18:16:02,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2830.43 | bwd: 4154.72 | bwd_inner: 4135.67 | bwd_allreduce: 18.31 | step: 14.48 + 60%|██████ | 457/759 [1:05:57<36:58, 7.35s/it] {'loss': 1.2275, 'learning_rate': 7.219927961366091e-06, 'epoch': 0.6} + 60%|██████ | 457/759 [1:05:57<36:58, 7.35s/it][2024-12-31 18:16:03,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.37 | bwd_microstep: 312.23 | bwd_inner_microstep: 311.90 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:16:04,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 248.95 | bwd_microstep: 413.84 | bwd_inner_microstep: 413.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:16:04,548] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.78 | bwd_microstep: 266.60 | bwd_inner_microstep: 266.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:05,009] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.90 | bwd_microstep: 265.47 | bwd_inner_microstep: 265.12 | bwd_allreduce_microstep: 0.23 | step_microstep: 0.28 +[2024-12-31 18:16:05,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.98 | bwd_microstep: 259.65 | bwd_inner_microstep: 259.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:16:05,892] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.83 | bwd_microstep: 244.63 | bwd_inner_microstep: 244.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:16:06,325] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.32 | bwd_microstep: 245.19 | bwd_inner_microstep: 245.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:06,760] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.75 | bwd_microstep: 248.16 | bwd_inner_microstep: 248.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:16:07,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.94 | bwd_microstep: 247.63 | bwd_inner_microstep: 247.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:16:07,630] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 245.43 | bwd_inner_microstep: 245.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:16:08,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.99 | bwd_microstep: 242.83 | bwd_inner_microstep: 242.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:16:08,491] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.43 | bwd_microstep: 243.60 | bwd_inner_microstep: 243.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:16:08,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 242.95 | bwd_inner_microstep: 242.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:09,350] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.11 | bwd_microstep: 241.25 | bwd_inner_microstep: 241.07 | bwd_allreduce_microstep: 0.04 | step_microstep: 0.26 +[2024-12-31 18:16:09,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.56 | bwd_microstep: 240.81 | bwd_inner_microstep: 240.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:16:10,427] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.50 | optimizer_gradients: 0.63 | optimizer_step: 3.13 +[2024-12-31 18:16:10,427] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.88 | bwd_microstep: 453.93 | bwd_inner_microstep: 242.03 | bwd_allreduce_microstep: 211.86 | step_microstep: 14.11 +[2024-12-31 18:16:10,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2856.44 | bwd: 4414.38 | bwd_inner: 4201.18 | bwd_allreduce: 212.42 | step: 17.15 + 60%|██████ | 458/759 [1:06:05<37:09, 7.41s/it] {'loss': 1.1832, 'learning_rate': 7.178951402589482e-06, 'epoch': 0.6} + 60%|██████ | 458/759 [1:06:05<37:09, 7.41s/it][2024-12-31 18:16:11,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.55 | bwd_microstep: 342.04 | bwd_inner_microstep: 341.68 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:16:11,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 240.39 | bwd_microstep: 394.60 | bwd_inner_microstep: 394.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:16:12,142] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.47 | bwd_microstep: 284.40 | bwd_inner_microstep: 284.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:16:12,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.90 | bwd_microstep: 257.86 | bwd_inner_microstep: 257.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:16:13,036] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.86 | bwd_microstep: 257.97 | bwd_inner_microstep: 257.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:16:13,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.05 | bwd_microstep: 247.55 | bwd_inner_microstep: 247.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:13,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.67 | bwd_microstep: 247.21 | bwd_inner_microstep: 247.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:14,343] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.87 | bwd_microstep: 247.19 | bwd_inner_microstep: 247.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:16:14,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.04 | bwd_microstep: 248.24 | bwd_inner_microstep: 247.86 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.23 +[2024-12-31 18:16:15,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.41 | bwd_microstep: 245.32 | bwd_inner_microstep: 245.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:15,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.91 | bwd_microstep: 244.35 | bwd_inner_microstep: 244.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:16:16,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.45 | bwd_microstep: 244.35 | bwd_inner_microstep: 244.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:16,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.72 | bwd_microstep: 243.26 | bwd_inner_microstep: 243.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:16,948] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.34 | bwd_microstep: 246.53 | bwd_inner_microstep: 246.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:17,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.52 | bwd_microstep: 243.60 | bwd_inner_microstep: 243.33 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:16:17,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 8.31 | optimizer_step: 3.38 +[2024-12-31 18:16:17,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.29 | bwd_microstep: 257.94 | bwd_inner_microstep: 244.26 | bwd_allreduce_microstep: 13.59 | step_microstep: 18.79 +[2024-12-31 18:16:17,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2873.90 | bwd: 4252.57 | bwd_inner: 4237.51 | bwd_allreduce: 14.14 | step: 21.67 + 60%|██████ | 459/759 [1:06:12<37:03, 7.41s/it] {'loss': 1.2118, 'learning_rate': 7.1380262427365885e-06, 'epoch': 0.6} + 60%|██████ | 459/759 [1:06:12<37:03, 7.41s/it][2024-12-31 18:16:18,528] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 250.46 | bwd_microstep: 415.24 | bwd_inner_microstep: 414.88 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 18:16:19,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.87 | bwd_microstep: 298.42 | bwd_inner_microstep: 298.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:19,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.67 | bwd_microstep: 269.39 | bwd_inner_microstep: 269.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:16:19,972] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.32 | bwd_microstep: 261.39 | bwd_inner_microstep: 261.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:20,431] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.68 | bwd_microstep: 261.86 | bwd_inner_microstep: 261.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:20,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.80 | bwd_microstep: 254.87 | bwd_inner_microstep: 254.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:16:21,316] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.79 | bwd_microstep: 247.39 | bwd_inner_microstep: 247.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:21,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.23 | bwd_microstep: 246.59 | bwd_inner_microstep: 246.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:22,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.52 | bwd_microstep: 254.16 | bwd_inner_microstep: 254.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:16:22,636] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.62 | bwd_microstep: 245.64 | bwd_inner_microstep: 245.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:16:23,067] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 243.76 | bwd_inner_microstep: 243.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:23,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 243.56 | bwd_inner_microstep: 243.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:23,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.55 | bwd_microstep: 246.76 | bwd_inner_microstep: 246.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:24,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 243.93 | bwd_inner_microstep: 243.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:24,794] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.99 | bwd_microstep: 243.67 | bwd_inner_microstep: 243.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:16:25,247] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 1.14 | optimizer_step: 3.51 +[2024-12-31 18:16:25,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.20 | bwd_microstep: 256.64 | bwd_inner_microstep: 242.61 | bwd_allreduce_microstep: 13.91 | step_microstep: 11.86 +[2024-12-31 18:16:25,250] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2877.08 | bwd: 4233.47 | bwd_inner: 4218.48 | bwd_allreduce: 14.21 | step: 14.85 + 61%|██████ | 460/759 [1:06:20<36:55, 7.41s/it] {'loss': 1.2205, 'learning_rate': 7.097153227455379e-06, 'epoch': 0.61} + 61%|██████ | 460/759 [1:06:20<36:55, 7.41s/it][2024-12-31 18:16:25,822] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.21 | bwd_microstep: 340.32 | bwd_inner_microstep: 339.98 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:16:26,519] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 252.73 | bwd_microstep: 427.15 | bwd_inner_microstep: 427.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:16:26,990] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.69 | bwd_microstep: 268.91 | bwd_inner_microstep: 268.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:27,440] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.50 | bwd_microstep: 256.56 | bwd_inner_microstep: 256.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:16:27,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.04 | bwd_microstep: 256.10 | bwd_inner_microstep: 255.97 | bwd_allreduce_microstep: 0.02 | step_microstep: 0.19 +[2024-12-31 18:16:28,329] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.71 | bwd_microstep: 248.33 | bwd_inner_microstep: 248.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:16:28,768] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.62 | bwd_microstep: 248.68 | bwd_inner_microstep: 248.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:29,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.71 | bwd_microstep: 245.36 | bwd_inner_microstep: 245.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:16:29,636] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.17 | bwd_microstep: 245.06 | bwd_inner_microstep: 244.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:16:30,067] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.69 | bwd_microstep: 244.23 | bwd_inner_microstep: 244.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:30,503] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.49 | bwd_microstep: 243.88 | bwd_inner_microstep: 243.74 | bwd_allreduce_microstep: 0.03 | step_microstep: 0.21 +[2024-12-31 18:16:30,933] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.37 | bwd_microstep: 241.45 | bwd_inner_microstep: 241.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:16:31,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.17 | bwd_microstep: 241.11 | bwd_inner_microstep: 241.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:16:31,788] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.39 | bwd_microstep: 242.03 | bwd_inner_microstep: 242.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:32,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.87 | bwd_microstep: 255.13 | bwd_inner_microstep: 255.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:16:32,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.55 | optimizer_gradients: 0.82 | optimizer_step: 3.41 +[2024-12-31 18:16:32,688] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.99 | bwd_microstep: 255.32 | bwd_inner_microstep: 241.17 | bwd_allreduce_microstep: 14.08 | step_microstep: 11.90 +[2024-12-31 18:16:32,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2881.74 | bwd: 4259.87 | bwd_inner: 4244.40 | bwd_allreduce: 14.51 | step: 15.04 + 61%|██████ | 461/759 [1:06:27<36:50, 7.42s/it] {'loss': 1.2179, 'learning_rate': 7.056333101443761e-06, 'epoch': 0.61} + 61%|██████ | 461/759 [1:06:27<36:50, 7.42s/it][2024-12-31 18:16:33,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.39 | bwd_microstep: 344.16 | bwd_inner_microstep: 343.80 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:16:33,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.28 | bwd_microstep: 303.07 | bwd_inner_microstep: 303.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:16:34,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.53 | bwd_microstep: 282.09 | bwd_inner_microstep: 282.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:16:34,740] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.02 | bwd_microstep: 263.58 | bwd_inner_microstep: 263.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:16:35,189] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.30 | bwd_microstep: 255.57 | bwd_inner_microstep: 255.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:16:35,626] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.96 | bwd_microstep: 248.85 | bwd_inner_microstep: 248.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:16:36,094] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.02 | bwd_microstep: 247.58 | bwd_inner_microstep: 247.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:36,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.44 | bwd_microstep: 248.06 | bwd_inner_microstep: 248.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:16:36,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.25 | bwd_microstep: 246.78 | bwd_inner_microstep: 246.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:16:37,403] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.87 | bwd_microstep: 245.29 | bwd_inner_microstep: 245.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:37,837] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.29 | bwd_microstep: 244.86 | bwd_inner_microstep: 244.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:16:38,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.49 | bwd_microstep: 254.77 | bwd_inner_microstep: 254.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:16:38,723] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 244.98 | bwd_inner_microstep: 244.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:16:39,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.13 | bwd_microstep: 244.62 | bwd_inner_microstep: 244.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:16:39,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.64 | bwd_microstep: 243.66 | bwd_inner_microstep: 243.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:16:40,089] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.60 | optimizer_gradients: 0.76 | optimizer_step: 3.38 +[2024-12-31 18:16:40,090] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 258.51 | bwd_inner_microstep: 244.87 | bwd_allreduce_microstep: 13.54 | step_microstep: 11.22 +[2024-12-31 18:16:40,092] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2924.39 | bwd: 4176.50 | bwd_inner: 4162.10 | bwd_allreduce: 13.81 | step: 14.31 + 61%|██████ | 462/759 [1:06:35<36:41, 7.41s/it] {'loss': 1.2121, 'learning_rate': 7.01556660843602e-06, 'epoch': 0.61} + 61%|██████ | 462/759 [1:06:35<36:41, 7.41s/it][2024-12-31 18:16:40,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 216.59 | bwd_microstep: 346.43 | bwd_inner_microstep: 346.07 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:16:41,158] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.21 | bwd_microstep: 280.39 | bwd_inner_microstep: 280.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:41,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.72 | bwd_microstep: 250.96 | bwd_inner_microstep: 250.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:16:42,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.85 | bwd_microstep: 250.07 | bwd_inner_microstep: 250.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:16:42,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.79 | bwd_microstep: 247.83 | bwd_inner_microstep: 247.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:42,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.01 | bwd_microstep: 249.63 | bwd_inner_microstep: 249.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:16:43,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.61 | bwd_microstep: 248.11 | bwd_inner_microstep: 248.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:43,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.13 | bwd_microstep: 249.65 | bwd_inner_microstep: 249.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:16:44,268] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.74 | bwd_microstep: 245.11 | bwd_inner_microstep: 245.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:44,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.17 | bwd_microstep: 249.07 | bwd_inner_microstep: 249.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:16:45,138] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.62 | bwd_microstep: 244.69 | bwd_inner_microstep: 244.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:45,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.63 | bwd_microstep: 242.87 | bwd_inner_microstep: 242.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:45,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.24 | bwd_microstep: 243.22 | bwd_inner_microstep: 243.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:16:46,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 335.16 | bwd_inner_microstep: 335.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:16:46,951] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.90 | bwd_microstep: 243.80 | bwd_inner_microstep: 243.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:16:47,440] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.11 | optimizer_gradients: 2.45 | optimizer_step: 3.17 +[2024-12-31 18:16:47,441] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.78 | bwd_microstep: 291.71 | bwd_inner_microstep: 242.28 | bwd_allreduce_microstep: 49.38 | step_microstep: 13.32 +[2024-12-31 18:16:47,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2839.54 | bwd: 4218.81 | bwd_inner: 4168.58 | bwd_allreduce: 49.62 | step: 16.38 + 61%|██████ | 463/759 [1:06:42<36:28, 7.39s/it] {'loss': 1.2206, 'learning_rate': 6.974854491189243e-06, 'epoch': 0.61} + 61%|██████ | 463/759 [1:06:42<36:28, 7.39s/it][2024-12-31 18:16:48,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 212.93 | bwd_microstep: 336.03 | bwd_inner_microstep: 335.67 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:16:48,484] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.54 | bwd_microstep: 277.84 | bwd_inner_microstep: 277.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:48,971] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 185.89 | bwd_microstep: 283.38 | bwd_inner_microstep: 283.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:16:49,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 186.02 | bwd_microstep: 262.27 | bwd_inner_microstep: 262.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:16:49,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.12 | bwd_microstep: 248.25 | bwd_inner_microstep: 248.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:16:50,320] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.16 | bwd_microstep: 257.01 | bwd_inner_microstep: 256.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:50,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.60 | bwd_microstep: 249.39 | bwd_inner_microstep: 249.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:16:51,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.86 | bwd_microstep: 245.64 | bwd_inner_microstep: 245.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:16:51,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.55 | bwd_microstep: 245.24 | bwd_inner_microstep: 245.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:16:52,060] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 244.63 | bwd_inner_microstep: 244.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:52,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.46 | bwd_microstep: 246.92 | bwd_inner_microstep: 246.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:52,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.87 | bwd_microstep: 245.30 | bwd_inner_microstep: 245.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:16:53,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.18 | bwd_microstep: 242.98 | bwd_inner_microstep: 242.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:16:53,798] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 252.28 | bwd_inner_microstep: 252.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:54,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.68 | bwd_microstep: 243.03 | bwd_inner_microstep: 243.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:54,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.82 | optimizer_step: 3.43 +[2024-12-31 18:16:54,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 258.19 | bwd_inner_microstep: 244.48 | bwd_allreduce_microstep: 13.59 | step_microstep: 11.19 +[2024-12-31 18:16:54,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2822.47 | bwd: 4138.44 | bwd_inner: 4123.98 | bwd_allreduce: 13.88 | step: 14.04 + 61%|██████ | 464/759 [1:06:49<36:08, 7.35s/it] {'loss': 1.24, 'learning_rate': 6.934197491469818e-06, 'epoch': 0.61} + 61%|██████ | 464/759 [1:06:49<36:08, 7.35s/it][2024-12-31 18:16:55,277] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 222.06 | bwd_microstep: 353.93 | bwd_inner_microstep: 353.45 | bwd_allreduce_microstep: 0.18 | step_microstep: 0.26 +[2024-12-31 18:16:55,782] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.12 | bwd_microstep: 294.45 | bwd_inner_microstep: 294.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:56,244] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.61 | bwd_microstep: 263.53 | bwd_inner_microstep: 263.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:16:56,703] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.80 | bwd_microstep: 263.21 | bwd_inner_microstep: 263.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:57,139] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.22 | bwd_microstep: 247.70 | bwd_inner_microstep: 247.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:16:57,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.57 | bwd_microstep: 247.33 | bwd_inner_microstep: 247.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:16:58,015] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.65 | bwd_microstep: 246.57 | bwd_inner_microstep: 246.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:16:58,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.96 | bwd_microstep: 253.08 | bwd_inner_microstep: 253.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:16:58,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.17 | bwd_microstep: 247.14 | bwd_inner_microstep: 247.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:16:59,336] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.53 | bwd_microstep: 246.03 | bwd_inner_microstep: 246.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:16:59,767] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.27 | bwd_microstep: 243.85 | bwd_inner_microstep: 243.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.76 +[2024-12-31 18:17:00,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.23 | bwd_microstep: 258.90 | bwd_inner_microstep: 258.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:00,643] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.83 | bwd_microstep: 241.32 | bwd_inner_microstep: 241.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:01,073] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.21 | bwd_microstep: 242.33 | bwd_inner_microstep: 242.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:01,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.45 | bwd_microstep: 242.03 | bwd_inner_microstep: 242.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:17:01,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.58 | optimizer_gradients: 7.02 | optimizer_step: 13.50 +[2024-12-31 18:17:02,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.12 | bwd_microstep: 259.36 | bwd_inner_microstep: 245.77 | bwd_allreduce_microstep: 13.51 | step_microstep: 76.63 +[2024-12-31 18:17:02,029] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2822.22 | bwd: 4150.91 | bwd_inner: 4136.31 | bwd_allreduce: 13.85 | step: 80.38 + 61%|██████▏ | 465/759 [1:06:56<36:00, 7.35s/it] {'loss': 1.204, 'learning_rate': 6.893596350039896e-06, 'epoch': 0.61} + 61%|██████▏ | 465/759 [1:06:56<36:00, 7.35s/it][2024-12-31 18:17:02,621] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 218.91 | bwd_microstep: 347.29 | bwd_inner_microstep: 346.95 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:17:03,136] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 200.38 | bwd_microstep: 296.28 | bwd_inner_microstep: 296.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:17:03,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.68 | bwd_microstep: 282.34 | bwd_inner_microstep: 282.20 | bwd_allreduce_microstep: 0.04 | step_microstep: 0.21 +[2024-12-31 18:17:04,094] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.87 | bwd_microstep: 266.73 | bwd_inner_microstep: 266.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:17:04,545] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.96 | bwd_microstep: 257.51 | bwd_inner_microstep: 257.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:17:04,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.58 | bwd_microstep: 247.60 | bwd_inner_microstep: 247.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:17:05,416] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.40 | bwd_microstep: 251.79 | bwd_inner_microstep: 251.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:17:05,854] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.98 | bwd_microstep: 248.75 | bwd_inner_microstep: 248.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:17:06,289] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.07 | bwd_microstep: 246.30 | bwd_inner_microstep: 246.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:17:06,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.19 | bwd_microstep: 245.20 | bwd_inner_microstep: 245.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:17:07,156] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.94 | bwd_microstep: 246.15 | bwd_inner_microstep: 246.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:17:07,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.48 | bwd_microstep: 243.85 | bwd_inner_microstep: 243.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:08,022] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.21 | bwd_microstep: 243.98 | bwd_inner_microstep: 243.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:08,468] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.60 | bwd_microstep: 243.96 | bwd_inner_microstep: 243.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:17:08,898] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.40 | bwd_microstep: 243.34 | bwd_inner_microstep: 243.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:17:09,605] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.68 | optimizer_gradients: 0.72 | optimizer_step: 3.26 +[2024-12-31 18:17:09,606] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.26 | bwd_microstep: 511.25 | bwd_inner_microstep: 242.07 | bwd_allreduce_microstep: 269.13 | step_microstep: 10.96 +[2024-12-31 18:17:09,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2856.29 | bwd: 4422.40 | bwd_inner: 4152.35 | bwd_allreduce: 269.46 | step: 14.01 + 61%|██████▏ | 466/759 [1:07:04<36:12, 7.42s/it] {'loss': 1.2224, 'learning_rate': 6.853051806643898e-06, 'epoch': 0.61} + 61%|██████▏ | 466/759 [1:07:04<36:12, 7.42s/it][2024-12-31 18:17:10,267] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 244.08 | bwd_microstep: 401.39 | bwd_inner_microstep: 400.98 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:17:10,785] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.89 | bwd_microstep: 303.83 | bwd_inner_microstep: 303.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:11,250] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.04 | bwd_microstep: 265.87 | bwd_inner_microstep: 265.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:17:11,740] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.30 | bwd_microstep: 283.53 | bwd_inner_microstep: 283.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:17:12,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.01 | bwd_microstep: 255.88 | bwd_inner_microstep: 255.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:17:12,632] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.69 | bwd_microstep: 248.35 | bwd_inner_microstep: 248.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:17:13,071] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.93 | bwd_microstep: 249.08 | bwd_inner_microstep: 249.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:13,510] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.36 | bwd_microstep: 251.51 | bwd_inner_microstep: 251.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:17:13,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.71 | bwd_microstep: 244.08 | bwd_inner_microstep: 244.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:17:14,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 242.77 | bwd_inner_microstep: 242.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:17:14,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.91 | bwd_microstep: 244.25 | bwd_inner_microstep: 244.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:17:15,236] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 243.32 | bwd_inner_microstep: 243.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:17:15,666] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.73 | bwd_microstep: 243.04 | bwd_inner_microstep: 243.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:16,094] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.52 | bwd_microstep: 241.76 | bwd_inner_microstep: 241.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:17:16,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.42 | bwd_microstep: 241.40 | bwd_inner_microstep: 241.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:16,979] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.60 | optimizer_gradients: 4.47 | optimizer_step: 3.23 +[2024-12-31 18:17:16,980] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.38 | bwd_microstep: 257.86 | bwd_inner_microstep: 244.03 | bwd_allreduce_microstep: 13.72 | step_microstep: 14.58 +[2024-12-31 18:17:16,982] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2866.50 | bwd: 4218.03 | bwd_inner: 4203.37 | bwd_allreduce: 14.00 | step: 17.77 + 62%|██████▏ | 467/759 [1:07:11<36:01, 7.40s/it] {'loss': 1.227, 'learning_rate': 6.812564599995042e-06, 'epoch': 0.62} + 62%|██████▏ | 467/759 [1:07:11<36:01, 7.40s/it][2024-12-31 18:17:17,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 205.94 | bwd_microstep: 318.84 | bwd_inner_microstep: 318.50 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:17:18,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.81 | bwd_microstep: 280.66 | bwd_inner_microstep: 280.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:18,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.85 | bwd_microstep: 289.76 | bwd_inner_microstep: 289.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:17:18,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.58 | bwd_microstep: 266.88 | bwd_inner_microstep: 266.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:17:19,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.40 | bwd_microstep: 264.00 | bwd_inner_microstep: 263.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:19,880] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.28 | bwd_microstep: 256.22 | bwd_inner_microstep: 256.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:20,330] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.76 | bwd_microstep: 256.52 | bwd_inner_microstep: 256.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:20,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.70 | bwd_microstep: 244.33 | bwd_inner_microstep: 244.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:17:21,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.20 | bwd_microstep: 246.21 | bwd_inner_microstep: 246.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:17:21,618] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.14 | bwd_microstep: 245.84 | bwd_inner_microstep: 245.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:17:22,051] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.45 | bwd_microstep: 244.34 | bwd_inner_microstep: 244.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:22,481] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.52 | bwd_microstep: 243.62 | bwd_inner_microstep: 243.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:22,910] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.71 | bwd_microstep: 242.70 | bwd_inner_microstep: 242.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:23,341] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.89 | bwd_microstep: 242.81 | bwd_inner_microstep: 242.40 | bwd_allreduce_microstep: 0.20 | step_microstep: 0.34 +[2024-12-31 18:17:23,771] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.71 | bwd_microstep: 244.94 | bwd_inner_microstep: 244.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:24,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.64 | optimizer_gradients: 0.64 | optimizer_step: 3.25 +[2024-12-31 18:17:24,613] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.02 | bwd_microstep: 639.32 | bwd_inner_microstep: 243.87 | bwd_allreduce_microstep: 395.39 | step_microstep: 10.96 +[2024-12-31 18:17:24,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2840.40 | bwd: 4527.11 | bwd_inner: 4130.49 | bwd_allreduce: 395.91 | step: 13.55 + 62%|██████▏ | 468/759 [1:07:19<36:14, 7.47s/it] {'loss': 1.2361, 'learning_rate': 6.772135467761889e-06, 'epoch': 0.62} + 62%|██████▏ | 468/759 [1:07:19<36:14, 7.47s/it][2024-12-31 18:17:25,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 218.99 | bwd_microstep: 346.99 | bwd_inner_microstep: 346.63 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:17:25,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 196.56 | bwd_microstep: 296.94 | bwd_inner_microstep: 296.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:26,182] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.08 | bwd_microstep: 268.59 | bwd_inner_microstep: 268.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:17:26,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.97 | bwd_microstep: 271.26 | bwd_inner_microstep: 271.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:17:27,092] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 250.01 | bwd_inner_microstep: 249.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.11 +[2024-12-31 18:17:27,531] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.15 | bwd_microstep: 249.36 | bwd_inner_microstep: 249.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:27,971] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.51 | bwd_microstep: 249.27 | bwd_inner_microstep: 249.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:17:28,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.23 | bwd_microstep: 247.72 | bwd_inner_microstep: 247.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:17:28,841] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.60 | bwd_microstep: 244.89 | bwd_inner_microstep: 244.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:29,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 244.54 | bwd_inner_microstep: 244.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:17:29,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.72 | bwd_microstep: 244.19 | bwd_inner_microstep: 244.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:17:30,138] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.52 | bwd_microstep: 243.04 | bwd_inner_microstep: 243.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:17:30,566] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.08 | bwd_microstep: 242.41 | bwd_inner_microstep: 242.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:17:30,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.07 | bwd_microstep: 243.58 | bwd_inner_microstep: 243.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:17:31,427] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.53 | bwd_microstep: 244.29 | bwd_inner_microstep: 244.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:17:31,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.77 | optimizer_gradients: 0.64 | optimizer_step: 3.28 +[2024-12-31 18:17:31,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 166.72 | bwd_microstep: 255.37 | bwd_inner_microstep: 241.75 | bwd_allreduce_microstep: 13.53 | step_microstep: 14.04 +[2024-12-31 18:17:31,888] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2833.82 | bwd: 4142.61 | bwd_inner: 4128.00 | bwd_allreduce: 13.82 | step: 16.87 + 62%|██████▏ | 469/759 [1:07:26<35:49, 7.41s/it] {'loss': 1.2099, 'learning_rate': 6.731765146554891e-06, 'epoch': 0.62} + 62%|██████▏ | 469/759 [1:07:26<35:49, 7.41s/it][2024-12-31 18:17:32,538] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 239.89 | bwd_microstep: 391.79 | bwd_inner_microstep: 391.43 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:17:33,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.40 | bwd_microstep: 288.18 | bwd_inner_microstep: 288.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:17:33,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.85 | bwd_microstep: 267.56 | bwd_inner_microstep: 267.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:17:33,959] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.47 | bwd_microstep: 267.85 | bwd_inner_microstep: 267.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:34,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.98 | bwd_microstep: 248.25 | bwd_inner_microstep: 248.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:17:34,837] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.43 | bwd_microstep: 249.13 | bwd_inner_microstep: 249.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:17:35,268] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.34 | bwd_microstep: 245.52 | bwd_inner_microstep: 245.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:17:35,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.01 | bwd_microstep: 244.82 | bwd_inner_microstep: 244.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:36,136] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.82 | bwd_microstep: 243.52 | bwd_inner_microstep: 243.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:36,580] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.30 | bwd_microstep: 244.65 | bwd_inner_microstep: 244.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:37,012] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.86 | bwd_microstep: 244.00 | bwd_inner_microstep: 243.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:37,443] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.70 | bwd_microstep: 243.61 | bwd_inner_microstep: 243.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:17:37,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.60 | bwd_microstep: 240.92 | bwd_inner_microstep: 240.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:17:38,298] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.99 | bwd_microstep: 242.15 | bwd_inner_microstep: 242.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:17:38,723] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.35 | bwd_microstep: 240.87 | bwd_inner_microstep: 240.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:17:39,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 9.88 | optimizer_gradients: 9.34 | optimizer_step: 3.27 +[2024-12-31 18:17:39,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.05 | bwd_microstep: 257.69 | bwd_inner_microstep: 243.87 | bwd_allreduce_microstep: 13.71 | step_microstep: 24.60 +[2024-12-31 18:17:39,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2833.42 | bwd: 4160.65 | bwd_inner: 4146.01 | bwd_allreduce: 13.99 | step: 27.43 + 62%|██████▏ | 470/759 [1:07:34<35:33, 7.38s/it] {'loss': 1.1969, 'learning_rate': 6.691454371912974e-06, 'epoch': 0.62} + 62%|██████▏ | 470/759 [1:07:34<35:33, 7.38s/it][2024-12-31 18:17:39,737] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.68 | bwd_microstep: 319.66 | bwd_inner_microstep: 319.28 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 18:17:40,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.28 | bwd_microstep: 286.50 | bwd_inner_microstep: 286.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:17:40,718] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.00 | bwd_microstep: 282.52 | bwd_inner_microstep: 282.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:17:41,177] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.36 | bwd_microstep: 264.14 | bwd_inner_microstep: 264.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:17:41,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.57 | bwd_microstep: 256.73 | bwd_inner_microstep: 256.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:17:42,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.98 | bwd_microstep: 248.52 | bwd_inner_microstep: 248.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:17:42,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.61 | bwd_microstep: 247.62 | bwd_inner_microstep: 247.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:42,934] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 246.19 | bwd_inner_microstep: 246.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:17:43,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.31 | bwd_microstep: 254.66 | bwd_inner_microstep: 254.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:17:43,797] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.87 | bwd_microstep: 243.82 | bwd_inner_microstep: 243.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:17:44,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.73 | bwd_microstep: 245.68 | bwd_inner_microstep: 245.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:44,665] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.97 | bwd_microstep: 243.26 | bwd_inner_microstep: 243.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:17:45,086] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.96 | bwd_microstep: 243.78 | bwd_inner_microstep: 243.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:17:45,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.28 | bwd_microstep: 242.94 | bwd_inner_microstep: 242.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:17:45,947] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.26 | bwd_microstep: 242.30 | bwd_inner_microstep: 242.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:17:46,603] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.03 | optimizer_gradients: 0.83 | optimizer_step: 3.14 +[2024-12-31 18:17:46,604] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.77 | bwd_microstep: 456.20 | bwd_inner_microstep: 241.77 | bwd_allreduce_microstep: 214.39 | step_microstep: 10.87 +[2024-12-31 18:17:46,605] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2813.44 | bwd: 4324.63 | bwd_inner: 4109.38 | bwd_allreduce: 214.65 | step: 13.59 + 62%|██████▏ | 471/759 [1:07:41<35:28, 7.39s/it] {'loss': 1.2269, 'learning_rate': 6.651203878290139e-06, 'epoch': 0.62} + 62%|██████▏ | 471/759 [1:07:41<35:28, 7.39s/it][2024-12-31 18:17:47,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 217.95 | bwd_microstep: 343.83 | bwd_inner_microstep: 343.46 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.21 +[2024-12-31 18:17:47,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 228.34 | bwd_microstep: 371.12 | bwd_inner_microstep: 371.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:48,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.13 | bwd_microstep: 295.93 | bwd_inner_microstep: 295.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:17:48,768] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.00 | bwd_microstep: 263.41 | bwd_inner_microstep: 263.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:49,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.73 | bwd_microstep: 261.39 | bwd_inner_microstep: 261.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:17:49,663] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.12 | bwd_microstep: 249.40 | bwd_inner_microstep: 249.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:17:50,102] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.03 | bwd_microstep: 250.01 | bwd_inner_microstep: 249.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:17:50,539] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.54 | bwd_microstep: 246.06 | bwd_inner_microstep: 246.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:17:50,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.57 | bwd_microstep: 257.39 | bwd_inner_microstep: 257.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:51,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 244.48 | bwd_inner_microstep: 244.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:17:51,854] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.47 | bwd_microstep: 245.77 | bwd_inner_microstep: 245.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:52,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.83 | bwd_microstep: 244.97 | bwd_inner_microstep: 244.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:17:52,719] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.96 | bwd_microstep: 243.14 | bwd_inner_microstep: 243.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:53,156] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.40 | bwd_microstep: 247.92 | bwd_inner_microstep: 247.44 | bwd_allreduce_microstep: 0.30 | step_microstep: 0.34 +[2024-12-31 18:17:53,592] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 243.94 | bwd_inner_microstep: 243.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:17:54,200] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.87 | optimizer_gradients: 0.56 | optimizer_step: 3.16 +[2024-12-31 18:17:54,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.34 | bwd_microstep: 413.07 | bwd_inner_microstep: 243.88 | bwd_allreduce_microstep: 169.14 | step_microstep: 12.30 +[2024-12-31 18:17:54,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2882.27 | bwd: 4421.94 | bwd_inner: 4251.50 | bwd_allreduce: 169.69 | step: 15.25 + 62%|██████▏ | 472/759 [1:07:49<35:38, 7.45s/it] {'loss': 1.2106, 'learning_rate': 6.6110143990420824e-06, 'epoch': 0.62} + 62%|██████▏ | 472/759 [1:07:49<35:38, 7.45s/it][2024-12-31 18:17:54,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 224.25 | bwd_microstep: 363.71 | bwd_inner_microstep: 363.29 | bwd_allreduce_microstep: 0.17 | step_microstep: 0.24 +[2024-12-31 18:17:55,320] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.48 | bwd_microstep: 297.81 | bwd_inner_microstep: 297.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:55,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.12 | bwd_microstep: 281.69 | bwd_inner_microstep: 281.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:56,263] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.12 | bwd_microstep: 257.67 | bwd_inner_microstep: 257.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.28 +[2024-12-31 18:17:56,724] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.66 | bwd_microstep: 255.27 | bwd_inner_microstep: 255.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:17:57,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.80 | bwd_microstep: 256.14 | bwd_inner_microstep: 256.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:17:57,603] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.25 | bwd_microstep: 249.18 | bwd_inner_microstep: 249.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:17:58,039] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.38 | bwd_microstep: 246.80 | bwd_inner_microstep: 246.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:17:58,474] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.29 | bwd_microstep: 245.97 | bwd_inner_microstep: 245.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:17:58,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.35 | bwd_microstep: 244.83 | bwd_inner_microstep: 244.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:17:59,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.90 | bwd_microstep: 245.62 | bwd_inner_microstep: 245.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.11 +[2024-12-31 18:17:59,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 263.15 | bwd_microstep: 245.47 | bwd_inner_microstep: 245.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:18:00,323] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.79 | bwd_microstep: 243.44 | bwd_inner_microstep: 243.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:18:00,755] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.43 | bwd_microstep: 243.66 | bwd_inner_microstep: 243.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:18:01,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.61 | bwd_microstep: 243.08 | bwd_inner_microstep: 243.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:18:01,752] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.64 | optimizer_gradients: 0.55 | optimizer_step: 3.08 +[2024-12-31 18:18:01,752] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.46 | bwd_microstep: 368.36 | bwd_inner_microstep: 243.75 | bwd_allreduce_microstep: 124.56 | step_microstep: 11.58 +[2024-12-31 18:18:01,753] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2970.43 | bwd: 4288.90 | bwd_inner: 4163.27 | bwd_allreduce: 124.83 | step: 14.28 + 62%|██████▏ | 473/759 [1:07:56<35:39, 7.48s/it] {'loss': 1.1969, 'learning_rate': 6.570886666412823e-06, 'epoch': 0.62} + 62%|██████▏ | 473/759 [1:07:56<35:39, 7.48s/it][2024-12-31 18:18:02,440] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 251.68 | bwd_microstep: 418.75 | bwd_inner_microstep: 418.40 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.22 +[2024-12-31 18:18:03,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.64 | bwd_microstep: 351.83 | bwd_inner_microstep: 351.41 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.22 +[2024-12-31 18:18:03,468] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.86 | bwd_microstep: 268.34 | bwd_inner_microstep: 268.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:18:03,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.32 | bwd_microstep: 262.67 | bwd_inner_microstep: 262.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:04,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.67 | bwd_microstep: 255.89 | bwd_inner_microstep: 255.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:18:04,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.03 | bwd_microstep: 249.84 | bwd_inner_microstep: 249.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:18:05,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.35 | bwd_microstep: 248.60 | bwd_inner_microstep: 248.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:18:05,695] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.30 | bwd_microstep: 249.91 | bwd_inner_microstep: 249.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:18:06,129] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.23 | bwd_microstep: 245.26 | bwd_inner_microstep: 245.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:18:06,564] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.98 | bwd_microstep: 245.19 | bwd_inner_microstep: 245.07 | bwd_allreduce_microstep: 0.02 | step_microstep: 0.20 +[2024-12-31 18:18:07,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.27 | bwd_microstep: 246.09 | bwd_inner_microstep: 246.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:07,502] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.41 | bwd_microstep: 308.29 | bwd_inner_microstep: 308.12 | bwd_allreduce_microstep: 0.07 | step_microstep: 0.19 +[2024-12-31 18:18:07,934] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 244.87 | bwd_inner_microstep: 244.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:18:08,373] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.92 | bwd_microstep: 250.42 | bwd_inner_microstep: 250.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:08,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.78 | bwd_microstep: 244.24 | bwd_inner_microstep: 244.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:09,261] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.59 | optimizer_gradients: 1.02 | optimizer_step: 3.60 +[2024-12-31 18:18:09,261] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 257.31 | bwd_inner_microstep: 243.31 | bwd_allreduce_microstep: 13.86 | step_microstep: 13.26 +[2024-12-31 18:18:09,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2864.07 | bwd: 4347.72 | bwd_inner: 4332.32 | bwd_allreduce: 14.45 | step: 16.33 + 62%|██████▏ | 474/759 [1:08:04<35:34, 7.49s/it] {'loss': 1.239, 'learning_rate': 6.5308214115213785e-06, 'epoch': 0.62} + 62%|██████▏ | 474/759 [1:08:04<35:34, 7.49s/it][2024-12-31 18:18:09,843] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.99 | bwd_microstep: 348.55 | bwd_inner_microstep: 348.19 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:18:10,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 198.74 | bwd_microstep: 300.09 | bwd_inner_microstep: 300.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:18:10,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.46 | bwd_microstep: 318.33 | bwd_inner_microstep: 318.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:18:11,334] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.50 | bwd_microstep: 262.27 | bwd_inner_microstep: 262.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:18:11,788] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.45 | bwd_microstep: 258.23 | bwd_inner_microstep: 258.05 | bwd_allreduce_microstep: 0.03 | step_microstep: 0.20 +[2024-12-31 18:18:12,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.69 | bwd_microstep: 248.69 | bwd_inner_microstep: 248.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:18:12,665] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.95 | bwd_microstep: 248.03 | bwd_inner_microstep: 248.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:13,101] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.56 | bwd_microstep: 247.66 | bwd_inner_microstep: 247.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:18:13,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.70 | bwd_microstep: 244.68 | bwd_inner_microstep: 244.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:13,979] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.79 | bwd_microstep: 244.86 | bwd_inner_microstep: 244.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:14,428] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.33 | bwd_microstep: 258.54 | bwd_inner_microstep: 258.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:18:14,861] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 244.16 | bwd_inner_microstep: 244.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:15,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.90 | bwd_microstep: 244.38 | bwd_inner_microstep: 244.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:15,723] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.38 | bwd_microstep: 243.71 | bwd_inner_microstep: 243.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:16,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.38 | bwd_microstep: 252.21 | bwd_inner_microstep: 252.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.29 +[2024-12-31 18:18:16,838] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.79 | optimizer_gradients: 1.29 | optimizer_step: 3.13 +[2024-12-31 18:18:16,838] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.23 | bwd_microstep: 478.31 | bwd_inner_microstep: 245.48 | bwd_allreduce_microstep: 232.78 | step_microstep: 11.38 +[2024-12-31 18:18:16,840] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2838.55 | bwd: 4442.83 | bwd_inner: 4209.03 | bwd_allreduce: 233.12 | step: 14.54 + 63%|██████▎ | 475/759 [1:08:11<35:34, 7.52s/it] {'loss': 1.2399, 'learning_rate': 6.490819364348434e-06, 'epoch': 0.63} + 63%|██████▎ | 475/759 [1:08:11<35:34, 7.52s/it][2024-12-31 18:18:17,424] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.90 | bwd_microstep: 342.83 | bwd_inner_microstep: 342.47 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:18:17,923] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.19 | bwd_microstep: 289.35 | bwd_inner_microstep: 289.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:18,389] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.48 | bwd_microstep: 266.11 | bwd_inner_microstep: 266.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:18:18,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.16 | bwd_microstep: 263.51 | bwd_inner_microstep: 263.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:19,298] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.74 | bwd_microstep: 255.26 | bwd_inner_microstep: 255.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:19,750] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.67 | bwd_microstep: 257.36 | bwd_inner_microstep: 257.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:18:20,198] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.67 | bwd_microstep: 250.35 | bwd_inner_microstep: 250.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:18:20,637] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.05 | bwd_microstep: 249.77 | bwd_inner_microstep: 249.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:18:21,072] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.16 | bwd_microstep: 247.02 | bwd_inner_microstep: 246.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:18:21,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.44 | bwd_microstep: 247.55 | bwd_inner_microstep: 247.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:18:21,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.59 | bwd_microstep: 253.71 | bwd_inner_microstep: 253.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:22,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 246.10 | bwd_inner_microstep: 246.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:22,813] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.74 | bwd_microstep: 245.88 | bwd_inner_microstep: 245.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:18:23,267] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 266.37 | bwd_inner_microstep: 266.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:18:23,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.49 | bwd_microstep: 246.17 | bwd_inner_microstep: 246.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:24,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.58 | optimizer_gradients: 6.69 | optimizer_step: 3.38 +[2024-12-31 18:18:24,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.42 | bwd_microstep: 258.10 | bwd_inner_microstep: 243.85 | bwd_allreduce_microstep: 14.13 | step_microstep: 17.45 +[2024-12-31 18:18:24,164] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2840.62 | bwd: 4185.52 | bwd_inner: 4170.50 | bwd_allreduce: 14.42 | step: 20.38 + 63%|██████▎ | 476/759 [1:08:19<35:10, 7.46s/it] {'loss': 1.2265, 'learning_rate': 6.450881253723035e-06, 'epoch': 0.63} + 63%|██████▎ | 476/759 [1:08:19<35:10, 7.46s/it][2024-12-31 18:18:24,766] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 242.11 | bwd_microstep: 344.75 | bwd_inner_microstep: 344.39 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.25 +[2024-12-31 18:18:25,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.39 | bwd_microstep: 266.28 | bwd_inner_microstep: 266.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:25,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.13 | bwd_microstep: 262.94 | bwd_inner_microstep: 262.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:26,138] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.39 | bwd_microstep: 255.68 | bwd_inner_microstep: 255.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:26,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.43 | bwd_microstep: 251.40 | bwd_inner_microstep: 251.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:27,018] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.34 | bwd_microstep: 251.36 | bwd_inner_microstep: 251.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:27,457] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.64 | bwd_microstep: 250.93 | bwd_inner_microstep: 250.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:27,896] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.54 | bwd_microstep: 249.31 | bwd_inner_microstep: 249.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:28,433] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 233.06 | bwd_microstep: 285.65 | bwd_inner_microstep: 285.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:28,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.37 | bwd_microstep: 244.22 | bwd_inner_microstep: 244.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:29,298] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.39 | bwd_microstep: 243.58 | bwd_inner_microstep: 243.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:29,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.03 | bwd_microstep: 244.93 | bwd_inner_microstep: 244.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:30,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.22 | bwd_microstep: 243.05 | bwd_inner_microstep: 243.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:30,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.59 | bwd_microstep: 243.37 | bwd_inner_microstep: 243.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:18:31,027] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 248.52 | bwd_inner_microstep: 248.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:31,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.25 | optimizer_gradients: 0.57 | optimizer_step: 3.07 +[2024-12-31 18:18:31,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.05 | bwd_microstep: 497.07 | bwd_inner_microstep: 244.49 | bwd_allreduce_microstep: 252.54 | step_microstep: 11.01 +[2024-12-31 18:18:31,723] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2890.88 | bwd: 4383.07 | bwd_inner: 4129.77 | bwd_allreduce: 252.78 | step: 14.06 + 63%|██████▎ | 477/759 [1:08:26<35:11, 7.49s/it] {'loss': 1.2446, 'learning_rate': 6.41100780730932e-06, 'epoch': 0.63} + 63%|██████▎ | 477/759 [1:08:26<35:11, 7.49s/it][2024-12-31 18:18:32,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 244.75 | bwd_microstep: 406.37 | bwd_inner_microstep: 406.22 | bwd_allreduce_microstep: 0.06 | step_microstep: 0.09 +[2024-12-31 18:18:32,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.51 | bwd_microstep: 293.07 | bwd_inner_microstep: 293.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:33,378] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.98 | bwd_microstep: 282.97 | bwd_inner_microstep: 282.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 18:18:33,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.96 | bwd_microstep: 263.75 | bwd_inner_microstep: 263.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:18:34,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.79 | bwd_microstep: 257.02 | bwd_inner_microstep: 256.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:18:34,728] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.31 | bwd_microstep: 248.37 | bwd_inner_microstep: 248.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:18:35,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.14 | bwd_microstep: 248.63 | bwd_inner_microstep: 248.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:35,606] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.97 | bwd_microstep: 248.21 | bwd_inner_microstep: 248.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:36,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.02 | bwd_microstep: 244.50 | bwd_inner_microstep: 244.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:18:36,470] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.97 | bwd_microstep: 243.84 | bwd_inner_microstep: 243.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:18:36,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.61 | bwd_microstep: 243.17 | bwd_inner_microstep: 243.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 18:18:37,332] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.27 | bwd_microstep: 245.42 | bwd_inner_microstep: 245.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:18:37,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 247.00 | bwd_inner_microstep: 246.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:18:38,194] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.82 | bwd_microstep: 242.09 | bwd_inner_microstep: 242.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.11 +[2024-12-31 18:18:38,619] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.84 | bwd_microstep: 240.44 | bwd_inner_microstep: 240.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.09 +[2024-12-31 18:18:39,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.91 | optimizer_gradients: 0.79 | optimizer_step: 6.09 +[2024-12-31 18:18:39,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.43 | bwd_microstep: 375.27 | bwd_inner_microstep: 241.47 | bwd_allreduce_microstep: 133.75 | step_microstep: 14.00 +[2024-12-31 18:18:39,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2861.51 | bwd: 4330.21 | bwd_inner: 4195.72 | bwd_allreduce: 133.92 | step: 16.68 + 63%|██████▎ | 478/759 [1:08:34<35:02, 7.48s/it] {'loss': 1.2229, 'learning_rate': 6.371199751593264e-06, 'epoch': 0.63} + 63%|██████▎ | 478/759 [1:08:34<35:02, 7.48s/it][2024-12-31 18:18:39,788] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 229.61 | bwd_microstep: 349.52 | bwd_inner_microstep: 349.16 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.25 +[2024-12-31 18:18:40,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.22 | bwd_microstep: 293.49 | bwd_inner_microstep: 293.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:40,779] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.76 | bwd_microstep: 281.51 | bwd_inner_microstep: 281.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:41,237] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.92 | bwd_microstep: 262.81 | bwd_inner_microstep: 262.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:18:41,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.43 | bwd_microstep: 249.55 | bwd_inner_microstep: 249.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:42,117] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.94 | bwd_microstep: 249.17 | bwd_inner_microstep: 249.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:42,551] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.52 | bwd_microstep: 246.31 | bwd_inner_microstep: 246.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:42,986] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.14 | bwd_microstep: 246.24 | bwd_inner_microstep: 246.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:18:43,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.71 | bwd_microstep: 244.27 | bwd_inner_microstep: 244.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:43,850] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 244.91 | bwd_inner_microstep: 244.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:18:44,279] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.50 | bwd_microstep: 244.15 | bwd_inner_microstep: 244.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:18:44,711] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.47 | bwd_microstep: 244.69 | bwd_inner_microstep: 244.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:45,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.54 | bwd_microstep: 242.81 | bwd_inner_microstep: 242.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:18:45,564] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.64 | bwd_microstep: 243.08 | bwd_inner_microstep: 243.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:46,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.88 | bwd_microstep: 251.99 | bwd_inner_microstep: 251.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:46,457] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.94 | optimizer_gradients: 0.77 | optimizer_step: 3.22 +[2024-12-31 18:18:46,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.59 | bwd_microstep: 257.32 | bwd_inner_microstep: 243.63 | bwd_allreduce_microstep: 13.58 | step_microstep: 12.27 +[2024-12-31 18:18:46,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2835.90 | bwd: 4151.92 | bwd_inner: 4137.46 | bwd_allreduce: 13.86 | step: 15.15 + 63%|██████▎ | 479/759 [1:08:41<34:37, 7.42s/it] {'loss': 1.2036, 'learning_rate': 6.331457811869437e-06, 'epoch': 0.63} + 63%|██████▎ | 479/759 [1:08:41<34:37, 7.42s/it][2024-12-31 18:18:47,028] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.46 | bwd_microstep: 335.24 | bwd_inner_microstep: 334.88 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:18:47,527] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.25 | bwd_microstep: 291.03 | bwd_inner_microstep: 291.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:18:48,021] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.17 | bwd_microstep: 290.54 | bwd_inner_microstep: 290.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:18:48,486] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.40 | bwd_microstep: 267.00 | bwd_inner_microstep: 266.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:18:48,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.95 | bwd_microstep: 263.43 | bwd_inner_microstep: 263.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:18:49,396] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.03 | bwd_microstep: 257.33 | bwd_inner_microstep: 257.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:18:49,841] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.06 | bwd_microstep: 256.41 | bwd_inner_microstep: 256.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:50,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.28 | bwd_microstep: 251.33 | bwd_inner_microstep: 251.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:18:50,728] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.77 | bwd_microstep: 246.14 | bwd_inner_microstep: 246.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:51,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.77 | bwd_microstep: 245.47 | bwd_inner_microstep: 244.97 | bwd_allreduce_microstep: 0.25 | step_microstep: 0.35 +[2024-12-31 18:18:51,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.85 | bwd_microstep: 244.16 | bwd_inner_microstep: 244.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:52,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.44 | bwd_microstep: 243.71 | bwd_inner_microstep: 243.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:52,455] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.18 | bwd_microstep: 243.00 | bwd_inner_microstep: 242.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.28 +[2024-12-31 18:18:52,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 244.03 | bwd_inner_microstep: 244.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:53,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.67 | bwd_microstep: 241.12 | bwd_inner_microstep: 241.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:53,770] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.85 | optimizer_gradients: 0.64 | optimizer_step: 3.30 +[2024-12-31 18:18:53,771] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.57 | bwd_microstep: 257.09 | bwd_inner_microstep: 243.30 | bwd_allreduce_microstep: 13.70 | step_microstep: 10.90 +[2024-12-31 18:18:53,773] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2848.36 | bwd: 4177.24 | bwd_inner: 4161.91 | bwd_allreduce: 14.30 | step: 13.87 + 63%|██████▎ | 480/759 [1:08:48<34:20, 7.39s/it] {'loss': 1.2002, 'learning_rate': 6.291782712227776e-06, 'epoch': 0.63} + 63%|██████▎ | 480/759 [1:08:48<34:20, 7.39s/it][2024-12-31 18:18:54,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 225.52 | bwd_microstep: 364.53 | bwd_inner_microstep: 364.16 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:18:54,911] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.78 | bwd_microstep: 314.74 | bwd_inner_microstep: 314.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:18:55,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.28 | bwd_microstep: 266.55 | bwd_inner_microstep: 266.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:18:55,843] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.91 | bwd_microstep: 268.12 | bwd_inner_microstep: 268.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:18:56,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.60 | bwd_microstep: 264.99 | bwd_inner_microstep: 264.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:18:56,753] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.71 | bwd_microstep: 250.61 | bwd_inner_microstep: 250.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:18:57,189] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.85 | bwd_microstep: 247.80 | bwd_inner_microstep: 247.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:18:57,626] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.89 | bwd_microstep: 247.47 | bwd_inner_microstep: 247.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:58,057] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 244.88 | bwd_inner_microstep: 244.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:18:58,487] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 245.32 | bwd_inner_microstep: 245.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:18:58,918] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 244.48 | bwd_inner_microstep: 244.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:18:59,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.65 | bwd_microstep: 243.82 | bwd_inner_microstep: 243.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:18:59,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.45 | bwd_microstep: 243.52 | bwd_inner_microstep: 243.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:00,208] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.11 | bwd_microstep: 240.80 | bwd_inner_microstep: 240.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:19:00,636] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.92 | bwd_microstep: 242.38 | bwd_inner_microstep: 242.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:19:01,223] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.33 | optimizer_gradients: 0.57 | optimizer_step: 3.08 +[2024-12-31 18:19:01,223] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.87 | bwd_microstep: 391.39 | bwd_inner_microstep: 244.39 | bwd_allreduce_microstep: 146.96 | step_microstep: 11.07 +[2024-12-31 18:19:01,225] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2848.25 | bwd: 4321.45 | bwd_inner: 4173.72 | bwd_allreduce: 147.20 | step: 13.85 + 63%|██████▎ | 481/759 [1:08:56<34:18, 7.41s/it] {'loss': 1.2262, 'learning_rate': 6.2521751755404226e-06, 'epoch': 0.63} + 63%|██████▎ | 481/759 [1:08:56<34:18, 7.41s/it][2024-12-31 18:19:01,763] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 208.06 | bwd_microstep: 315.55 | bwd_inner_microstep: 315.18 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:19:02,351] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 220.55 | bwd_microstep: 350.38 | bwd_inner_microstep: 350.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:19:02,841] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.66 | bwd_microstep: 281.25 | bwd_inner_microstep: 281.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:19:03,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.45 | bwd_microstep: 254.93 | bwd_inner_microstep: 254.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:19:03,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.32 | bwd_microstep: 257.29 | bwd_inner_microstep: 257.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:19:04,180] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.75 | bwd_microstep: 246.50 | bwd_inner_microstep: 246.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:04,616] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.91 | bwd_microstep: 247.16 | bwd_inner_microstep: 247.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:05,050] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.39 | bwd_microstep: 245.84 | bwd_inner_microstep: 245.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:19:05,480] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.65 | bwd_microstep: 243.45 | bwd_inner_microstep: 243.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:19:05,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.66 | bwd_microstep: 243.25 | bwd_inner_microstep: 243.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:19:06,348] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.39 | bwd_microstep: 243.70 | bwd_inner_microstep: 243.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:19:06,778] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.51 | bwd_microstep: 242.63 | bwd_inner_microstep: 242.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:19:07,210] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.66 | bwd_microstep: 243.15 | bwd_inner_microstep: 243.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:19:07,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.98 | bwd_microstep: 241.79 | bwd_inner_microstep: 241.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:19:08,046] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.79 | bwd_microstep: 241.78 | bwd_inner_microstep: 241.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:19:08,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.59 | optimizer_gradients: 0.56 | optimizer_step: 3.11 +[2024-12-31 18:19:08,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.54 | bwd_microstep: 293.34 | bwd_inner_microstep: 241.53 | bwd_allreduce_microstep: 51.76 | step_microstep: 11.22 +[2024-12-31 18:19:08,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2840.72 | bwd: 4192.09 | bwd_inner: 4139.37 | bwd_allreduce: 52.06 | step: 13.74 + 64%|██████▎ | 482/759 [1:09:03<34:02, 7.37s/it] {'loss': 1.2469, 'learning_rate': 6.212635923448526e-06, 'epoch': 0.63} + 64%|██████▎ | 482/759 [1:09:03<34:02, 7.37s/it][2024-12-31 18:19:09,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 238.10 | bwd_microstep: 388.16 | bwd_inner_microstep: 387.65 | bwd_allreduce_microstep: 0.19 | step_microstep: 0.26 +[2024-12-31 18:19:09,984] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 292.74 | bwd_microstep: 499.85 | bwd_inner_microstep: 499.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:19:10,475] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.68 | bwd_microstep: 286.34 | bwd_inner_microstep: 286.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:19:10,934] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.62 | bwd_microstep: 263.11 | bwd_inner_microstep: 263.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:19:11,373] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.10 | bwd_microstep: 249.45 | bwd_inner_microstep: 249.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:19:11,814] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.25 | bwd_microstep: 247.46 | bwd_inner_microstep: 247.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:12,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.70 | bwd_microstep: 245.94 | bwd_inner_microstep: 245.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:19:12,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.52 | bwd_microstep: 245.75 | bwd_inner_microstep: 245.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:19:13,112] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.22 | bwd_microstep: 245.34 | bwd_inner_microstep: 245.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:19:13,544] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 243.98 | bwd_inner_microstep: 243.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:13,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.98 | bwd_microstep: 243.48 | bwd_inner_microstep: 243.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:14,412] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.80 | bwd_microstep: 243.10 | bwd_inner_microstep: 243.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:14,851] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.84 | bwd_microstep: 242.36 | bwd_inner_microstep: 242.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:15,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.21 | bwd_microstep: 243.07 | bwd_inner_microstep: 243.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:19:15,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 157.95 | bwd_microstep: 225.54 | bwd_inner_microstep: 225.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:16,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.50 | optimizer_gradients: 0.68 | optimizer_step: 57.54 +[2024-12-31 18:19:16,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.24 | bwd_microstep: 255.07 | bwd_inner_microstep: 241.38 | bwd_allreduce_microstep: 13.58 | step_microstep: 68.78 +[2024-12-31 18:19:16,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2945.14 | bwd: 4368.14 | bwd_inner: 4353.46 | bwd_allreduce: 13.94 | step: 71.83 + 64%|██████▎ | 483/759 [1:09:11<34:20, 7.46s/it] {'loss': 1.2008, 'learning_rate': 6.173165676349103e-06, 'epoch': 0.64} + 64%|██████▎ | 483/759 [1:09:11<34:20, 7.46s/it][2024-12-31 18:19:16,812] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 227.06 | bwd_microstep: 365.56 | bwd_inner_microstep: 365.23 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:19:17,484] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 246.83 | bwd_microstep: 406.92 | bwd_inner_microstep: 406.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:19:17,940] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.40 | bwd_microstep: 257.93 | bwd_inner_microstep: 257.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:19:18,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.86 | bwd_microstep: 255.94 | bwd_inner_microstep: 255.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:19:18,826] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 247.04 | bwd_inner_microstep: 247.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:19:19,278] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.11 | bwd_microstep: 263.19 | bwd_inner_microstep: 262.60 | bwd_allreduce_microstep: 0.40 | step_microstep: 0.21 +[2024-12-31 18:19:19,773] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.95 | bwd_microstep: 245.56 | bwd_inner_microstep: 245.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:20,208] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.44 | bwd_microstep: 246.93 | bwd_inner_microstep: 246.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:19:20,657] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.86 | bwd_microstep: 244.15 | bwd_inner_microstep: 244.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:21,088] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.39 | bwd_microstep: 244.15 | bwd_inner_microstep: 244.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:19:21,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.98 | bwd_microstep: 241.09 | bwd_inner_microstep: 241.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:19:21,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.82 | bwd_microstep: 243.38 | bwd_inner_microstep: 243.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:22,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.49 | bwd_microstep: 243.24 | bwd_inner_microstep: 243.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:22,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.98 | bwd_microstep: 241.15 | bwd_inner_microstep: 241.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:19:23,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.38 | bwd_microstep: 243.04 | bwd_inner_microstep: 243.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:19:23,695] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.56 | optimizer_gradients: 0.72 | optimizer_step: 3.47 +[2024-12-31 18:19:23,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.41 | bwd_microstep: 256.45 | bwd_inner_microstep: 242.61 | bwd_allreduce_microstep: 13.71 | step_microstep: 11.48 +[2024-12-31 18:19:23,698] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2900.04 | bwd: 4245.90 | bwd_inner: 4230.69 | bwd_allreduce: 14.39 | step: 14.60 + 64%|██████▍ | 484/759 [1:09:18<34:15, 7.47s/it] {'loss': 1.2352, 'learning_rate': 6.133765153381918e-06, 'epoch': 0.64} + 64%|██████▍ | 484/759 [1:09:18<34:15, 7.47s/it][2024-12-31 18:19:24,310] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 219.57 | bwd_microstep: 350.84 | bwd_inner_microstep: 350.49 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:19:24,810] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.04 | bwd_microstep: 289.96 | bwd_inner_microstep: 289.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:19:25,285] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.48 | bwd_microstep: 276.11 | bwd_inner_microstep: 275.95 | bwd_allreduce_microstep: 0.05 | step_microstep: 0.19 +[2024-12-31 18:19:25,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.22 | bwd_microstep: 261.44 | bwd_inner_microstep: 261.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:26,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.21 | bwd_microstep: 256.48 | bwd_inner_microstep: 256.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:26,630] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.09 | bwd_microstep: 248.00 | bwd_inner_microstep: 247.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:19:27,071] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 251.99 | bwd_inner_microstep: 251.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:19:27,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.89 | bwd_microstep: 243.55 | bwd_inner_microstep: 243.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:27,935] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.96 | bwd_microstep: 243.26 | bwd_inner_microstep: 243.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:19:28,370] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.83 | bwd_microstep: 245.56 | bwd_inner_microstep: 245.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:28,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.89 | bwd_microstep: 242.36 | bwd_inner_microstep: 242.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:19:29,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.96 | bwd_microstep: 242.96 | bwd_inner_microstep: 242.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:19:29,675] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.99 | bwd_microstep: 241.04 | bwd_inner_microstep: 241.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:30,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.40 | bwd_microstep: 242.92 | bwd_inner_microstep: 242.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:19:30,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.46 | bwd_microstep: 240.89 | bwd_inner_microstep: 240.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:19:31,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 9.63 | optimizer_gradients: 7.14 | optimizer_step: 10.87 +[2024-12-31 18:19:31,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.65 | bwd_microstep: 255.72 | bwd_inner_microstep: 242.09 | bwd_allreduce_microstep: 13.53 | step_microstep: 29.82 +[2024-12-31 18:19:31,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2842.51 | bwd: 4133.23 | bwd_inner: 4118.59 | bwd_allreduce: 13.91 | step: 32.54 + 64%|██████▍ | 485/759 [1:09:25<33:54, 7.42s/it] {'loss': 1.2257, 'learning_rate': 6.094435072416379e-06, 'epoch': 0.64} + 64%|██████▍ | 485/759 [1:09:25<33:54, 7.42s/it][2024-12-31 18:19:31,584] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 218.75 | bwd_microstep: 344.96 | bwd_inner_microstep: 344.63 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:19:32,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 216.68 | bwd_microstep: 347.86 | bwd_inner_microstep: 347.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:32,662] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.70 | bwd_microstep: 280.47 | bwd_inner_microstep: 280.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.11 +[2024-12-31 18:19:33,122] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.99 | bwd_microstep: 263.42 | bwd_inner_microstep: 263.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:19:33,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.94 | bwd_microstep: 256.50 | bwd_inner_microstep: 256.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:34,011] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.05 | bwd_microstep: 248.98 | bwd_inner_microstep: 248.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:19:34,459] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.97 | bwd_microstep: 247.70 | bwd_inner_microstep: 247.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:34,903] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.14 | bwd_microstep: 252.62 | bwd_inner_microstep: 252.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:35,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.17 | bwd_microstep: 260.46 | bwd_inner_microstep: 260.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:35,785] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 244.22 | bwd_inner_microstep: 244.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:19:36,218] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.83 | bwd_microstep: 243.17 | bwd_inner_microstep: 243.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:36,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.51 | bwd_microstep: 245.38 | bwd_inner_microstep: 245.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:19:37,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.09 | bwd_microstep: 245.21 | bwd_inner_microstep: 245.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:37,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.98 | bwd_microstep: 252.40 | bwd_inner_microstep: 252.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:37,954] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.94 | bwd_microstep: 243.43 | bwd_inner_microstep: 243.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:19:38,454] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.60 | optimizer_gradients: 0.96 | optimizer_step: 3.59 +[2024-12-31 18:19:38,455] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.36 | bwd_microstep: 302.98 | bwd_inner_microstep: 242.05 | bwd_allreduce_microstep: 60.90 | step_microstep: 12.67 +[2024-12-31 18:19:38,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2860.97 | bwd: 4279.82 | bwd_inner: 4218.16 | bwd_allreduce: 61.14 | step: 15.71 + 64%|██████▍ | 486/759 [1:09:33<33:49, 7.43s/it] {'loss': 1.2022, 'learning_rate': 6.055176150038445e-06, 'epoch': 0.64} + 64%|██████▍ | 486/759 [1:09:33<33:49, 7.43s/it][2024-12-31 18:19:38,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.41 | bwd_microstep: 317.67 | bwd_inner_microstep: 317.28 | bwd_allreduce_microstep: 0.17 | step_microstep: 0.20 +[2024-12-31 18:19:39,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 196.81 | bwd_microstep: 298.78 | bwd_inner_microstep: 298.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:19:39,974] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.04 | bwd_microstep: 265.32 | bwd_inner_microstep: 265.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:19:40,427] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.41 | bwd_microstep: 257.58 | bwd_inner_microstep: 257.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:40,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.53 | bwd_microstep: 257.30 | bwd_inner_microstep: 257.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:41,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.01 | bwd_microstep: 247.86 | bwd_inner_microstep: 247.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:41,749] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 247.08 | bwd_inner_microstep: 247.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:42,182] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.10 | bwd_microstep: 245.71 | bwd_inner_microstep: 245.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:19:42,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.86 | bwd_microstep: 248.29 | bwd_inner_microstep: 248.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:43,062] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.13 | bwd_microstep: 251.22 | bwd_inner_microstep: 251.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:19:43,494] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.96 | bwd_microstep: 244.56 | bwd_inner_microstep: 244.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:19:43,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.24 | bwd_microstep: 243.21 | bwd_inner_microstep: 243.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:19:44,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 242.87 | bwd_inner_microstep: 242.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:44,785] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.76 | bwd_microstep: 245.77 | bwd_inner_microstep: 245.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:19:45,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.26 | bwd_microstep: 242.13 | bwd_inner_microstep: 242.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:19:45,725] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.37 | optimizer_gradients: 0.59 | optimizer_step: 3.09 +[2024-12-31 18:19:45,726] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.36 | bwd_microstep: 317.29 | bwd_inner_microstep: 255.46 | bwd_allreduce_microstep: 61.79 | step_microstep: 11.46 +[2024-12-31 18:19:45,727] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2807.09 | bwd: 4172.75 | bwd_inner: 4110.11 | bwd_allreduce: 62.06 | step: 14.40 + 64%|██████▍ | 487/759 [1:09:40<33:28, 7.38s/it] {'loss': 1.2565, 'learning_rate': 6.015989101537586e-06, 'epoch': 0.64} + 64%|██████▍ | 487/759 [1:09:40<33:28, 7.38s/it][2024-12-31 18:19:46,301] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.83 | bwd_microstep: 344.37 | bwd_inner_microstep: 344.02 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:19:46,798] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.23 | bwd_microstep: 288.07 | bwd_inner_microstep: 288.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:19:47,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.75 | bwd_microstep: 262.93 | bwd_inner_microstep: 262.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:47,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.04 | bwd_microstep: 254.92 | bwd_inner_microstep: 254.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:19:48,145] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.75 | bwd_microstep: 249.64 | bwd_inner_microstep: 249.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.09 +[2024-12-31 18:19:48,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.46 | bwd_microstep: 245.76 | bwd_inner_microstep: 245.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:49,024] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.41 | bwd_microstep: 246.09 | bwd_inner_microstep: 246.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:19:49,459] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.43 | bwd_microstep: 247.45 | bwd_inner_microstep: 247.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:19:49,883] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.51 | bwd_microstep: 245.57 | bwd_inner_microstep: 245.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:50,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 244.20 | bwd_inner_microstep: 244.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:19:50,750] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.09 | bwd_microstep: 245.74 | bwd_inner_microstep: 245.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:19:51,177] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.07 | bwd_microstep: 245.22 | bwd_inner_microstep: 245.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:19:51,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 247.18 | bwd_inner_microstep: 247.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:52,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 242.94 | bwd_inner_microstep: 242.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:19:52,461] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.56 | bwd_microstep: 242.71 | bwd_inner_microstep: 242.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:53,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.62 | optimizer_gradients: 0.78 | optimizer_step: 3.11 +[2024-12-31 18:19:53,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 205.10 | bwd_microstep: 308.84 | bwd_inner_microstep: 241.41 | bwd_allreduce_microstep: 67.39 | step_microstep: 10.98 +[2024-12-31 18:19:53,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2860.87 | bwd: 4161.67 | bwd_inner: 4093.51 | bwd_allreduce: 67.64 | step: 13.43 + 64%|██████▍ | 488/759 [1:09:47<33:11, 7.35s/it] {'loss': 1.2518, 'learning_rate': 5.976874640893751e-06, 'epoch': 0.64} + 64%|██████▍ | 488/759 [1:09:47<33:11, 7.35s/it][2024-12-31 18:19:53,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 239.53 | bwd_microstep: 396.06 | bwd_inner_microstep: 395.71 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:19:54,146] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.91 | bwd_microstep: 285.60 | bwd_inner_microstep: 285.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:54,634] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.68 | bwd_microstep: 282.63 | bwd_inner_microstep: 282.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:19:55,093] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.03 | bwd_microstep: 262.92 | bwd_inner_microstep: 262.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:55,543] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.62 | bwd_microstep: 256.06 | bwd_inner_microstep: 256.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:19:55,980] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.54 | bwd_microstep: 248.33 | bwd_inner_microstep: 248.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:56,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.57 | bwd_microstep: 247.66 | bwd_inner_microstep: 247.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:19:56,850] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.38 | bwd_microstep: 244.98 | bwd_inner_microstep: 244.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:19:57,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.28 | bwd_microstep: 244.37 | bwd_inner_microstep: 244.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:57,716] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.44 | bwd_microstep: 245.43 | bwd_inner_microstep: 245.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:19:58,147] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.38 | bwd_microstep: 244.63 | bwd_inner_microstep: 244.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:19:58,581] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.70 | bwd_microstep: 245.40 | bwd_inner_microstep: 245.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:19:59,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.53 | bwd_microstep: 241.38 | bwd_inner_microstep: 241.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:19:59,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.12 | bwd_microstep: 244.10 | bwd_inner_microstep: 244.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:19:59,883] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.25 | bwd_microstep: 243.10 | bwd_inner_microstep: 243.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:20:00,482] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.20 | optimizer_gradients: 7.27 | optimizer_step: 3.09 +[2024-12-31 18:20:00,483] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.09 | bwd_microstep: 395.97 | bwd_inner_microstep: 243.06 | bwd_allreduce_microstep: 152.87 | step_microstep: 18.23 +[2024-12-31 18:20:00,484] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2859.45 | bwd: 4328.69 | bwd_inner: 4175.05 | bwd_allreduce: 153.11 | step: 21.23 + 64%|██████▍ | 489/759 [1:09:55<33:15, 7.39s/it] {'loss': 1.2272, 'learning_rate': 5.937833480764339e-06, 'epoch': 0.64} + 64%|██████▍ | 489/759 [1:09:55<33:15, 7.39s/it][2024-12-31 18:20:01,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 212.61 | bwd_microstep: 335.73 | bwd_inner_microstep: 335.38 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:20:01,559] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 198.24 | bwd_microstep: 298.21 | bwd_inner_microstep: 298.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:20:02,058] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.75 | bwd_microstep: 288.42 | bwd_inner_microstep: 288.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:20:02,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.66 | bwd_microstep: 269.22 | bwd_inner_microstep: 269.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:20:02,979] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.31 | bwd_microstep: 257.56 | bwd_inner_microstep: 257.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:20:03,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.95 | bwd_microstep: 255.77 | bwd_inner_microstep: 255.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:20:03,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.40 | bwd_microstep: 255.46 | bwd_inner_microstep: 255.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:20:04,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.59 | bwd_microstep: 248.08 | bwd_inner_microstep: 248.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:20:04,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.93 | bwd_microstep: 244.85 | bwd_inner_microstep: 244.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:20:05,171] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.57 | bwd_microstep: 245.97 | bwd_inner_microstep: 245.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:20:05,597] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.21 | bwd_microstep: 246.60 | bwd_inner_microstep: 246.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:20:06,035] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.62 | bwd_microstep: 249.87 | bwd_inner_microstep: 249.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:20:06,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.31 | bwd_microstep: 242.93 | bwd_inner_microstep: 242.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:20:06,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.25 | bwd_microstep: 247.80 | bwd_inner_microstep: 247.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:20:07,331] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 243.71 | bwd_inner_microstep: 243.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:20:08,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.94 | optimizer_gradients: 0.63 | optimizer_step: 3.11 +[2024-12-31 18:20:08,039] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.45 | bwd_microstep: 513.93 | bwd_inner_microstep: 242.54 | bwd_allreduce_microstep: 271.34 | step_microstep: 10.41 +[2024-12-31 18:20:08,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2856.63 | bwd: 4444.13 | bwd_inner: 4172.03 | bwd_allreduce: 271.59 | step: 12.93 + 65%|██████▍ | 490/759 [1:10:02<33:21, 7.44s/it] {'loss': 1.2077, 'learning_rate': 5.898866332471241e-06, 'epoch': 0.65} + 65%|██████▍ | 490/759 [1:10:03<33:21, 7.44s/it][2024-12-31 18:20:08,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 205.81 | bwd_microstep: 314.79 | bwd_inner_microstep: 314.45 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.22 +[2024-12-31 18:20:09,069] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.97 | bwd_microstep: 286.44 | bwd_inner_microstep: 286.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:20:09,534] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.44 | bwd_microstep: 266.15 | bwd_inner_microstep: 266.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:20:09,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.18 | bwd_microstep: 262.95 | bwd_inner_microstep: 262.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:20:10,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.70 | bwd_microstep: 255.71 | bwd_inner_microstep: 255.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:20:10,882] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.56 | bwd_microstep: 249.58 | bwd_inner_microstep: 249.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:20:11,319] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.74 | bwd_microstep: 253.30 | bwd_inner_microstep: 253.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:20:11,753] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.41 | bwd_microstep: 245.35 | bwd_inner_microstep: 245.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:20:12,184] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.43 | bwd_microstep: 244.09 | bwd_inner_microstep: 244.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:20:12,617] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.13 | bwd_microstep: 244.63 | bwd_inner_microstep: 244.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.29 +[2024-12-31 18:20:13,056] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.87 | bwd_microstep: 247.84 | bwd_inner_microstep: 247.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:20:13,487] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.29 | bwd_microstep: 244.23 | bwd_inner_microstep: 244.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:20:13,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.33 | bwd_microstep: 243.86 | bwd_inner_microstep: 243.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:20:14,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.55 | bwd_microstep: 244.73 | bwd_inner_microstep: 244.47 | bwd_allreduce_microstep: 0.11 | step_microstep: 0.29 +[2024-12-31 18:20:14,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.46 | bwd_microstep: 243.42 | bwd_inner_microstep: 243.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:20:15,302] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.69 | optimizer_gradients: 0.82 | optimizer_step: 3.13 +[2024-12-31 18:20:15,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.56 | bwd_microstep: 314.92 | bwd_inner_microstep: 249.97 | bwd_allreduce_microstep: 64.91 | step_microstep: 10.84 +[2024-12-31 18:20:15,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2809.83 | bwd: 4162.17 | bwd_inner: 4096.00 | bwd_allreduce: 65.38 | step: 13.86 + 65%|██████▍ | 491/759 [1:10:10<32:59, 7.39s/it] {'loss': 1.2364, 'learning_rate': 5.859973905987866e-06, 'epoch': 0.65} + 65%|██████▍ | 491/759 [1:10:10<32:59, 7.39s/it][2024-12-31 18:20:15,888] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 220.94 | bwd_microstep: 348.12 | bwd_inner_microstep: 347.74 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.19 +[2024-12-31 18:20:16,355] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.77 | bwd_microstep: 268.17 | bwd_inner_microstep: 268.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:20:16,823] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.41 | bwd_microstep: 267.10 | bwd_inner_microstep: 267.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:20:17,281] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.32 | bwd_microstep: 263.17 | bwd_inner_microstep: 263.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:20:17,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.98 | bwd_microstep: 255.47 | bwd_inner_microstep: 255.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:20:18,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.32 | bwd_microstep: 255.80 | bwd_inner_microstep: 255.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:20:18,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.84 | bwd_microstep: 244.39 | bwd_inner_microstep: 244.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:20:19,047] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.68 | bwd_microstep: 245.64 | bwd_inner_microstep: 245.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:20:19,480] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 245.55 | bwd_inner_microstep: 245.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:20:19,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 245.24 | bwd_inner_microstep: 245.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:20:20,334] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.74 | bwd_microstep: 241.13 | bwd_inner_microstep: 241.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:20:20,770] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.61 | bwd_microstep: 246.98 | bwd_inner_microstep: 246.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:20:21,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.01 | bwd_microstep: 244.09 | bwd_inner_microstep: 244.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:20:21,627] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.58 | bwd_microstep: 241.00 | bwd_inner_microstep: 240.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:20:22,053] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.30 | bwd_microstep: 241.04 | bwd_inner_microstep: 241.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:20:22,715] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.64 | optimizer_gradients: 0.58 | optimizer_step: 3.59 +[2024-12-31 18:20:22,715] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.23 | bwd_microstep: 462.84 | bwd_inner_microstep: 243.12 | bwd_allreduce_microstep: 219.67 | step_microstep: 10.51 +[2024-12-31 18:20:22,716] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2812.50 | bwd: 4315.76 | bwd_inner: 4095.29 | bwd_allreduce: 219.94 | step: 13.44 + 65%|██████▍ | 492/759 [1:10:17<32:54, 7.39s/it] {'loss': 1.2377, 'learning_rate': 5.821156909926202e-06, 'epoch': 0.65} + 65%|██████▍ | 492/759 [1:10:17<32:54, 7.39s/it][2024-12-31 18:20:23,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 224.41 | bwd_microstep: 357.61 | bwd_inner_microstep: 357.27 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:20:23,771] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.29 | bwd_microstep: 262.26 | bwd_inner_microstep: 262.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:20:24,223] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.90 | bwd_microstep: 255.39 | bwd_inner_microstep: 255.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:20:24,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.69 | bwd_microstep: 258.84 | bwd_inner_microstep: 258.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.32 +[2024-12-31 18:20:25,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.05 | bwd_microstep: 247.90 | bwd_inner_microstep: 247.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:20:25,561] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.55 | bwd_microstep: 259.26 | bwd_inner_microstep: 259.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:20:25,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.00 | bwd_microstep: 244.57 | bwd_inner_microstep: 244.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:20:26,427] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.86 | bwd_microstep: 245.49 | bwd_inner_microstep: 245.30 | bwd_allreduce_microstep: 0.06 | step_microstep: 0.21 +[2024-12-31 18:20:26,862] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.43 | bwd_microstep: 244.81 | bwd_inner_microstep: 244.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:20:27,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.39 | bwd_microstep: 244.82 | bwd_inner_microstep: 244.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:20:27,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.49 | bwd_microstep: 244.18 | bwd_inner_microstep: 244.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:20:28,184] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 184.05 | bwd_microstep: 250.94 | bwd_inner_microstep: 250.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:20:28,631] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.43 | bwd_microstep: 256.51 | bwd_inner_microstep: 256.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:20:29,062] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.06 | bwd_microstep: 241.75 | bwd_inner_microstep: 241.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:20:29,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 244.37 | bwd_inner_microstep: 244.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:20:29,954] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.63 | optimizer_gradients: 0.65 | optimizer_step: 3.34 +[2024-12-31 18:20:29,955] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.83 | bwd_microstep: 258.06 | bwd_inner_microstep: 244.33 | bwd_allreduce_microstep: 13.59 | step_microstep: 12.99 +[2024-12-31 18:20:29,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2820.94 | bwd: 4117.04 | bwd_inner: 4101.98 | bwd_allreduce: 14.06 | step: 16.17 + 65%|██████▍ | 493/759 [1:10:24<32:35, 7.35s/it] {'loss': 1.2399, 'learning_rate': 5.782416051523909e-06, 'epoch': 0.65} + 65%|██████▍ | 493/759 [1:10:24<32:35, 7.35s/it][2024-12-31 18:20:30,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 238.69 | bwd_microstep: 388.33 | bwd_inner_microstep: 387.99 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:20:31,118] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.12 | bwd_microstep: 297.52 | bwd_inner_microstep: 297.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:20:31,582] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.29 | bwd_microstep: 267.23 | bwd_inner_microstep: 267.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:20:32,039] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.21 | bwd_microstep: 262.63 | bwd_inner_microstep: 262.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:20:32,481] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.67 | bwd_microstep: 249.03 | bwd_inner_microstep: 249.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:20:32,918] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 248.29 | bwd_inner_microstep: 248.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:20:33,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.93 | bwd_microstep: 245.45 | bwd_inner_microstep: 245.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:20:33,790] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.49 | bwd_microstep: 244.39 | bwd_inner_microstep: 244.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:20:34,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.18 | bwd_microstep: 244.14 | bwd_inner_microstep: 244.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:20:34,657] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.68 | bwd_microstep: 244.22 | bwd_inner_microstep: 243.97 | bwd_allreduce_microstep: 0.09 | step_microstep: 0.25 +[2024-12-31 18:20:35,093] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.83 | bwd_microstep: 243.41 | bwd_inner_microstep: 243.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:20:35,527] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.15 | bwd_microstep: 244.79 | bwd_inner_microstep: 244.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:20:35,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.72 | bwd_microstep: 241.62 | bwd_inner_microstep: 241.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:20:36,398] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.29 | bwd_microstep: 244.96 | bwd_inner_microstep: 244.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:20:36,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.91 | bwd_microstep: 242.61 | bwd_inner_microstep: 242.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:20:37,422] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.09 | optimizer_gradients: 0.64 | optimizer_step: 3.16 +[2024-12-31 18:20:37,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.04 | bwd_microstep: 396.92 | bwd_inner_microstep: 248.24 | bwd_allreduce_microstep: 148.62 | step_microstep: 10.84 +[2024-12-31 18:20:37,425] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2854.97 | bwd: 4305.78 | bwd_inner: 4155.88 | bwd_allreduce: 149.06 | step: 13.77 + 65%|██████▌ | 494/759 [1:10:32<32:36, 7.38s/it] {'loss': 1.192, 'learning_rate': 5.743752036631443e-06, 'epoch': 0.65} + 65%|██████▌ | 494/759 [1:10:32<32:36, 7.38s/it][2024-12-31 18:20:38,020] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 224.78 | bwd_microstep: 357.64 | bwd_inner_microstep: 357.26 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.21 +[2024-12-31 18:20:38,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 194.80 | bwd_microstep: 287.94 | bwd_inner_microstep: 287.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:20:38,980] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.71 | bwd_microstep: 262.68 | bwd_inner_microstep: 262.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:20:39,431] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.65 | bwd_microstep: 256.07 | bwd_inner_microstep: 255.83 | bwd_allreduce_microstep: 0.07 | step_microstep: 0.33 +[2024-12-31 18:20:39,882] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.65 | bwd_microstep: 255.73 | bwd_inner_microstep: 255.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:20:40,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 240.45 | bwd_microstep: 276.67 | bwd_inner_microstep: 276.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:20:40,918] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.97 | bwd_microstep: 246.23 | bwd_inner_microstep: 246.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:20:41,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.35 | bwd_microstep: 249.56 | bwd_inner_microstep: 249.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:20:41,796] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.40 | bwd_microstep: 250.65 | bwd_inner_microstep: 250.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:20:42,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.72 | bwd_microstep: 244.97 | bwd_inner_microstep: 244.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:20:42,661] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.28 | bwd_microstep: 245.72 | bwd_inner_microstep: 245.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:20:43,093] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.45 | bwd_microstep: 243.78 | bwd_inner_microstep: 243.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:20:43,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 244.94 | bwd_inner_microstep: 244.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:20:43,960] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.39 | bwd_microstep: 244.92 | bwd_inner_microstep: 244.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:20:44,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.29 | bwd_microstep: 241.40 | bwd_inner_microstep: 241.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:20:44,842] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.58 | optimizer_gradients: 0.64 | optimizer_step: 3.39 +[2024-12-31 18:20:44,843] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.41 | bwd_microstep: 257.36 | bwd_inner_microstep: 243.65 | bwd_allreduce_microstep: 13.59 | step_microstep: 12.17 +[2024-12-31 18:20:44,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2900.56 | bwd: 4166.50 | bwd_inner: 4151.57 | bwd_allreduce: 14.03 | step: 15.21 + 65%|██████▌ | 495/759 [1:10:39<32:32, 7.40s/it] {'loss': 1.242, 'learning_rate': 5.7051655696991825e-06, 'epoch': 0.65} + 65%|██████▌ | 495/759 [1:10:39<32:32, 7.40s/it][2024-12-31 18:20:45,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.91 | bwd_microstep: 337.66 | bwd_inner_microstep: 337.29 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:20:45,924] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.78 | bwd_microstep: 282.94 | bwd_inner_microstep: 282.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:20:46,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.37 | bwd_microstep: 256.91 | bwd_inner_microstep: 256.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:20:46,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.15 | bwd_microstep: 250.14 | bwd_inner_microstep: 250.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.32 +[2024-12-31 18:20:47,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.71 | bwd_microstep: 259.16 | bwd_inner_microstep: 259.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:20:47,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.06 | bwd_microstep: 256.71 | bwd_inner_microstep: 256.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:20:48,164] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.55 | bwd_microstep: 255.85 | bwd_inner_microstep: 255.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:20:48,597] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.59 | bwd_microstep: 244.84 | bwd_inner_microstep: 244.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:20:49,029] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.28 | bwd_microstep: 245.11 | bwd_inner_microstep: 245.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:20:49,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.43 | bwd_microstep: 245.28 | bwd_inner_microstep: 245.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:20:49,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.91 | bwd_microstep: 244.34 | bwd_inner_microstep: 244.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:20:50,327] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.39 | bwd_microstep: 243.83 | bwd_inner_microstep: 243.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:20:50,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 243.65 | bwd_inner_microstep: 243.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:20:51,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.82 | bwd_microstep: 248.62 | bwd_inner_microstep: 248.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:20:51,624] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.22 | bwd_microstep: 245.14 | bwd_inner_microstep: 245.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:20:52,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 22.41 | optimizer_gradients: 6.73 | optimizer_step: 4.53 +[2024-12-31 18:20:52,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 257.68 | bwd_inner_microstep: 243.85 | bwd_allreduce_microstep: 13.74 | step_microstep: 35.78 +[2024-12-31 18:20:52,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2800.03 | bwd: 4118.04 | bwd_inner: 4103.24 | bwd_allreduce: 14.05 | step: 38.96 + 65%|██████▌ | 496/759 [1:10:47<32:14, 7.35s/it] {'loss': 1.2175, 'learning_rate': 5.666657353764594e-06, 'epoch': 0.65} + 65%|██████▌ | 496/759 [1:10:47<32:14, 7.35s/it][2024-12-31 18:20:52,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 218.23 | bwd_microstep: 349.55 | bwd_inner_microstep: 349.21 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:20:53,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 287.50 | bwd_microstep: 498.37 | bwd_inner_microstep: 498.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:20:53,984] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.73 | bwd_microstep: 269.85 | bwd_inner_microstep: 269.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:20:54,441] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.17 | bwd_microstep: 261.46 | bwd_inner_microstep: 261.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:20:54,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.66 | bwd_microstep: 250.26 | bwd_inner_microstep: 250.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:20:55,330] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.49 | bwd_microstep: 260.98 | bwd_inner_microstep: 260.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:20:55,766] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.15 | bwd_microstep: 247.19 | bwd_inner_microstep: 247.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:20:56,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.01 | bwd_microstep: 247.87 | bwd_inner_microstep: 247.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:20:56,648] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.67 | bwd_microstep: 244.68 | bwd_inner_microstep: 244.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:20:57,078] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.66 | bwd_microstep: 243.11 | bwd_inner_microstep: 243.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:20:57,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.40 | bwd_microstep: 244.09 | bwd_inner_microstep: 244.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:20:57,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 243.52 | bwd_inner_microstep: 243.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:20:58,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.12 | bwd_microstep: 244.88 | bwd_inner_microstep: 244.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:20:58,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.05 | bwd_microstep: 243.34 | bwd_inner_microstep: 243.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:20:59,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.72 | bwd_microstep: 246.65 | bwd_inner_microstep: 246.28 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.22 +[2024-12-31 18:20:59,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.91 | optimizer_gradients: 0.73 | optimizer_step: 3.36 +[2024-12-31 18:20:59,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.25 | bwd_microstep: 261.92 | bwd_inner_microstep: 248.22 | bwd_allreduce_microstep: 13.58 | step_microstep: 11.35 +[2024-12-31 18:20:59,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2921.00 | bwd: 4357.86 | bwd_inner: 4343.06 | bwd_allreduce: 13.99 | step: 14.50 + 65%|██████▌ | 497/759 [1:10:54<32:26, 7.43s/it] {'loss': 1.2376, 'learning_rate': 5.628228090439434e-06, 'epoch': 0.65} + 65%|██████▌ | 497/759 [1:10:54<32:26, 7.43s/it][2024-12-31 18:21:00,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 239.89 | bwd_microstep: 390.59 | bwd_inner_microstep: 390.24 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:21:00,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 232.91 | bwd_microstep: 378.18 | bwd_inner_microstep: 378.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:21:01,448] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.27 | bwd_microstep: 268.80 | bwd_inner_microstep: 268.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:21:01,908] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.17 | bwd_microstep: 263.53 | bwd_inner_microstep: 263.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:02,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.27 | bwd_microstep: 254.66 | bwd_inner_microstep: 254.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:02,794] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.33 | bwd_microstep: 248.06 | bwd_inner_microstep: 248.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:21:03,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.62 | bwd_microstep: 248.11 | bwd_inner_microstep: 248.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:03,695] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.66 | bwd_microstep: 274.59 | bwd_inner_microstep: 274.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:04,129] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.58 | bwd_microstep: 245.82 | bwd_inner_microstep: 245.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:21:04,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 244.93 | bwd_inner_microstep: 244.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:04,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.32 | bwd_microstep: 243.77 | bwd_inner_microstep: 243.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:21:05,419] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.60 | bwd_microstep: 243.14 | bwd_inner_microstep: 243.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:05,850] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.71 | bwd_microstep: 243.82 | bwd_inner_microstep: 243.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:06,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 243.91 | bwd_inner_microstep: 243.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.28 +[2024-12-31 18:21:06,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.42 | bwd_microstep: 243.45 | bwd_inner_microstep: 243.10 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.22 +[2024-12-31 18:21:07,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.71 | optimizer_gradients: 0.58 | optimizer_step: 3.13 +[2024-12-31 18:21:07,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.04 | bwd_microstep: 275.87 | bwd_inner_microstep: 243.48 | bwd_allreduce_microstep: 32.32 | step_microstep: 12.79 +[2024-12-31 18:21:07,194] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2882.03 | bwd: 4311.39 | bwd_inner: 4277.88 | bwd_allreduce: 32.73 | step: 15.79 + 66%|██████▌ | 498/759 [1:11:02<32:23, 7.45s/it] {'loss': 1.2259, 'learning_rate': 5.589878479896959e-06, 'epoch': 0.66} + 66%|██████▌ | 498/759 [1:11:02<32:23, 7.45s/it][2024-12-31 18:21:07,737] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 205.01 | bwd_microstep: 316.36 | bwd_inner_microstep: 316.02 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:21:08,225] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.65 | bwd_microstep: 282.93 | bwd_inner_microstep: 282.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:21:08,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.86 | bwd_microstep: 257.23 | bwd_inner_microstep: 257.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:21:09,126] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.47 | bwd_microstep: 256.66 | bwd_inner_microstep: 256.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:21:09,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.75 | bwd_microstep: 254.70 | bwd_inner_microstep: 254.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:10,013] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.65 | bwd_microstep: 250.08 | bwd_inner_microstep: 249.79 | bwd_allreduce_microstep: 0.19 | step_microstep: 0.26 +[2024-12-31 18:21:10,451] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.80 | bwd_microstep: 246.96 | bwd_inner_microstep: 246.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:21:10,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.40 | bwd_microstep: 247.29 | bwd_inner_microstep: 247.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:21:11,321] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.22 | bwd_microstep: 244.02 | bwd_inner_microstep: 243.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:11,753] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 245.29 | bwd_inner_microstep: 245.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:21:12,184] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 244.35 | bwd_inner_microstep: 244.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:21:12,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.64 | bwd_microstep: 244.59 | bwd_inner_microstep: 244.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:13,046] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.98 | bwd_microstep: 241.21 | bwd_inner_microstep: 241.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:21:13,487] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.00 | bwd_microstep: 254.13 | bwd_inner_microstep: 254.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:13,916] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.01 | bwd_microstep: 242.55 | bwd_inner_microstep: 242.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:21:14,611] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.49 | optimizer_gradients: 0.62 | optimizer_step: 3.11 +[2024-12-31 18:21:14,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.95 | bwd_microstep: 495.97 | bwd_inner_microstep: 245.09 | bwd_allreduce_microstep: 250.83 | step_microstep: 11.03 +[2024-12-31 18:21:14,613] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2801.70 | bwd: 4324.42 | bwd_inner: 4072.52 | bwd_allreduce: 251.30 | step: 14.06 + 66%|██████▌ | 499/759 [1:11:09<32:13, 7.44s/it] {'loss': 1.232, 'learning_rate': 5.55160922085916e-06, 'epoch': 0.66} + 66%|██████▌ | 499/759 [1:11:09<32:13, 7.44s/it][2024-12-31 18:21:15,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 212.26 | bwd_microstep: 334.28 | bwd_inner_microstep: 333.94 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:21:15,979] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 287.34 | bwd_microstep: 499.09 | bwd_inner_microstep: 499.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:16,440] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.61 | bwd_microstep: 265.97 | bwd_inner_microstep: 265.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:21:16,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.05 | bwd_microstep: 266.36 | bwd_inner_microstep: 266.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:21:17,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.65 | bwd_microstep: 254.89 | bwd_inner_microstep: 254.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:17,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.88 | bwd_microstep: 246.87 | bwd_inner_microstep: 246.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:21:18,219] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.71 | bwd_microstep: 249.57 | bwd_inner_microstep: 249.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:21:18,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.65 | bwd_microstep: 244.61 | bwd_inner_microstep: 244.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:19,090] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.81 | bwd_microstep: 250.27 | bwd_inner_microstep: 250.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:21:19,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.46 | bwd_microstep: 248.03 | bwd_inner_microstep: 248.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:19,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.55 | bwd_microstep: 252.67 | bwd_inner_microstep: 252.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:21:20,398] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.21 | bwd_microstep: 245.61 | bwd_inner_microstep: 245.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:20,828] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.56 | bwd_microstep: 243.24 | bwd_inner_microstep: 243.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:21:21,260] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.31 | bwd_microstep: 245.23 | bwd_inner_microstep: 245.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:21,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.33 | bwd_microstep: 243.47 | bwd_inner_microstep: 243.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:21:22,142] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.90 | optimizer_gradients: 0.62 | optimizer_step: 3.28 +[2024-12-31 18:21:22,143] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.39 | bwd_microstep: 256.68 | bwd_inner_microstep: 242.99 | bwd_allreduce_microstep: 13.57 | step_microstep: 11.22 +[2024-12-31 18:21:22,145] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2899.18 | bwd: 4346.96 | bwd_inner: 4332.49 | bwd_allreduce: 13.85 | step: 14.23 + 66%|██████▌ | 500/759 [1:11:17<32:13, 7.47s/it] {'loss': 1.2215, 'learning_rate': 5.513421010584044e-06, 'epoch': 0.66} + 66%|██████▌ | 500/759 [1:11:17<32:13, 7.47s/it][2024-12-31 18:21:22,718] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.06 | bwd_microstep: 344.04 | bwd_inner_microstep: 343.70 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:21:23,331] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 227.80 | bwd_microstep: 368.36 | bwd_inner_microstep: 368.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:21:23,789] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.05 | bwd_microstep: 261.42 | bwd_inner_microstep: 261.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:24,551] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 274.72 | bwd_microstep: 471.15 | bwd_inner_microstep: 471.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:24,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.14 | bwd_microstep: 249.34 | bwd_inner_microstep: 249.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:25,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.55 | bwd_microstep: 255.43 | bwd_inner_microstep: 255.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:25,878] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.60 | bwd_microstep: 249.80 | bwd_inner_microstep: 249.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:21:26,316] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.19 | bwd_microstep: 248.90 | bwd_inner_microstep: 248.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:21:26,755] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.31 | bwd_microstep: 248.91 | bwd_inner_microstep: 248.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:27,199] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.16 | bwd_microstep: 254.47 | bwd_inner_microstep: 254.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:21:27,630] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 246.08 | bwd_inner_microstep: 246.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:28,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.04 | bwd_microstep: 244.83 | bwd_inner_microstep: 244.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:21:28,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 248.87 | bwd_inner_microstep: 248.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:21:28,923] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.22 | bwd_microstep: 244.42 | bwd_inner_microstep: 244.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:21:29,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 243.68 | bwd_inner_microstep: 243.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:29,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.92 | optimizer_gradients: 0.62 | optimizer_step: 3.27 +[2024-12-31 18:21:29,809] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.34 | bwd_microstep: 258.23 | bwd_inner_microstep: 244.59 | bwd_allreduce_microstep: 13.55 | step_microstep: 11.05 +[2024-12-31 18:21:29,811] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2949.80 | bwd: 4438.01 | bwd_inner: 4423.64 | bwd_allreduce: 13.82 | step: 13.73 + 66%|██████▌ | 501/759 [1:11:24<32:21, 7.53s/it] {'loss': 1.2117, 'learning_rate': 5.4753145448529284e-06, 'epoch': 0.66} + 66%|██████▌ | 501/759 [1:11:24<32:21, 7.53s/it][2024-12-31 18:21:30,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 206.67 | bwd_microstep: 315.62 | bwd_inner_microstep: 315.22 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:21:30,864] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.21 | bwd_microstep: 297.08 | bwd_inner_microstep: 297.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:21:31,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.68 | bwd_microstep: 299.60 | bwd_inner_microstep: 299.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:31,847] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.77 | bwd_microstep: 268.26 | bwd_inner_microstep: 268.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.09 +[2024-12-31 18:21:32,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.50 | bwd_microstep: 267.53 | bwd_inner_microstep: 267.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:21:32,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.43 | bwd_microstep: 257.66 | bwd_inner_microstep: 257.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:21:33,216] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.09 | bwd_microstep: 249.84 | bwd_inner_microstep: 249.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:21:33,670] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.93 | bwd_microstep: 257.86 | bwd_inner_microstep: 257.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:21:34,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.19 | bwd_microstep: 247.35 | bwd_inner_microstep: 247.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:21:34,545] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.40 | bwd_microstep: 246.62 | bwd_inner_microstep: 246.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:21:34,976] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 244.12 | bwd_inner_microstep: 244.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:21:35,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 245.10 | bwd_inner_microstep: 245.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:35,841] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.86 | bwd_microstep: 244.64 | bwd_inner_microstep: 244.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:36,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.72 | bwd_microstep: 244.30 | bwd_inner_microstep: 244.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:21:36,707] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.75 | bwd_microstep: 253.60 | bwd_inner_microstep: 253.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:21:37,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.80 | optimizer_gradients: 0.58 | optimizer_step: 3.69 +[2024-12-31 18:21:37,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.97 | bwd_microstep: 452.34 | bwd_inner_microstep: 245.00 | bwd_allreduce_microstep: 207.29 | step_microstep: 11.15 +[2024-12-31 18:21:37,351] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2878.08 | bwd: 4391.58 | bwd_inner: 4183.52 | bwd_allreduce: 207.54 | step: 13.81 + 66%|██████▌ | 502/759 [1:11:32<32:15, 7.53s/it] {'loss': 1.1955, 'learning_rate': 5.437290517957767e-06, 'epoch': 0.66} + 66%|██████▌ | 502/759 [1:11:32<32:15, 7.53s/it][2024-12-31 18:21:37,931] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 219.00 | bwd_microstep: 347.15 | bwd_inner_microstep: 346.80 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:21:38,428] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.33 | bwd_microstep: 288.23 | bwd_inner_microstep: 288.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:38,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.84 | bwd_microstep: 262.25 | bwd_inner_microstep: 262.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:39,342] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.11 | bwd_microstep: 260.50 | bwd_inner_microstep: 260.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:21:39,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.11 | bwd_microstep: 268.59 | bwd_inner_microstep: 268.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.28 +[2024-12-31 18:21:40,236] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.61 | bwd_microstep: 248.20 | bwd_inner_microstep: 248.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:21:40,664] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.94 | bwd_microstep: 247.51 | bwd_inner_microstep: 247.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:21:41,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.47 | bwd_microstep: 247.45 | bwd_inner_microstep: 247.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:21:41,540] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.74 | bwd_microstep: 251.96 | bwd_inner_microstep: 251.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:41,974] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.33 | bwd_microstep: 245.10 | bwd_inner_microstep: 245.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:21:42,406] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.55 | bwd_microstep: 244.87 | bwd_inner_microstep: 244.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:21:42,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.01 | bwd_microstep: 247.41 | bwd_inner_microstep: 247.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:43,263] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.67 | bwd_microstep: 243.54 | bwd_inner_microstep: 243.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:43,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.76 | bwd_microstep: 249.98 | bwd_inner_microstep: 249.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:21:44,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.20 | bwd_microstep: 241.11 | bwd_inner_microstep: 241.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:44,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.56 | optimizer_gradients: 1.75 | optimizer_step: 3.44 +[2024-12-31 18:21:44,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.99 | bwd_microstep: 255.14 | bwd_inner_microstep: 241.41 | bwd_allreduce_microstep: 13.61 | step_microstep: 13.42 +[2024-12-31 18:21:44,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2809.13 | bwd: 4149.15 | bwd_inner: 4134.55 | bwd_allreduce: 13.89 | step: 16.18 + 66%|██████▋ | 503/759 [1:11:39<31:44, 7.44s/it] {'loss': 1.2329, 'learning_rate': 5.399349622688479e-06, 'epoch': 0.66} + 66%|██████▋ | 503/759 [1:11:39<31:44, 7.44s/it][2024-12-31 18:21:45,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 222.10 | bwd_microstep: 352.28 | bwd_inner_microstep: 351.89 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.21 +[2024-12-31 18:21:45,662] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.38 | bwd_microstep: 282.41 | bwd_inner_microstep: 282.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:21:46,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.99 | bwd_microstep: 264.39 | bwd_inner_microstep: 264.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:21:46,564] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.13 | bwd_microstep: 251.02 | bwd_inner_microstep: 250.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:21:47,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.28 | bwd_microstep: 251.00 | bwd_inner_microstep: 250.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:21:47,440] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.98 | bwd_microstep: 246.19 | bwd_inner_microstep: 246.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:21:47,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.05 | bwd_microstep: 245.65 | bwd_inner_microstep: 245.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:48,310] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.85 | bwd_microstep: 247.60 | bwd_inner_microstep: 247.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:21:48,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 243.84 | bwd_inner_microstep: 243.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:21:49,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.06 | bwd_microstep: 259.13 | bwd_inner_microstep: 259.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:21:49,622] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.36 | bwd_microstep: 244.57 | bwd_inner_microstep: 244.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:21:50,047] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.65 | bwd_microstep: 242.23 | bwd_inner_microstep: 242.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:21:50,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.41 | bwd_microstep: 244.74 | bwd_inner_microstep: 244.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:21:50,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.85 | bwd_microstep: 245.02 | bwd_inner_microstep: 244.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:21:51,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.39 | bwd_microstep: 242.17 | bwd_inner_microstep: 242.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:21:51,789] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.56 | optimizer_gradients: 0.68 | optimizer_step: 3.18 +[2024-12-31 18:21:51,790] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.51 | bwd_microstep: 258.19 | bwd_inner_microstep: 244.48 | bwd_allreduce_microstep: 13.62 | step_microstep: 10.40 +[2024-12-31 18:21:51,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2801.44 | bwd: 4120.53 | bwd_inner: 4106.00 | bwd_allreduce: 13.90 | step: 13.33 + 66%|██████▋ | 504/759 [1:11:46<31:19, 7.37s/it] {'loss': 1.2339, 'learning_rate': 5.3614925503203586e-06, 'epoch': 0.66} + 66%|██████▋ | 504/759 [1:11:46<31:19, 7.37s/it][2024-12-31 18:21:52,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 233.97 | bwd_microstep: 369.37 | bwd_inner_microstep: 369.01 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.22 +[2024-12-31 18:21:52,909] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.82 | bwd_microstep: 290.78 | bwd_inner_microstep: 290.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:21:53,396] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.66 | bwd_microstep: 281.44 | bwd_inner_microstep: 281.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:21:53,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.14 | bwd_microstep: 262.00 | bwd_inner_microstep: 261.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:21:54,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.33 | bwd_microstep: 255.65 | bwd_inner_microstep: 255.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:21:54,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.31 | bwd_microstep: 249.41 | bwd_inner_microstep: 249.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:21:55,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.01 | bwd_microstep: 247.71 | bwd_inner_microstep: 247.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:55,616] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.85 | bwd_microstep: 247.70 | bwd_inner_microstep: 247.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:21:56,081] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 199.62 | bwd_microstep: 246.81 | bwd_inner_microstep: 246.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:21:56,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.84 | bwd_microstep: 245.53 | bwd_inner_microstep: 245.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:21:56,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 243.29 | bwd_inner_microstep: 243.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:21:57,373] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.49 | bwd_microstep: 243.83 | bwd_inner_microstep: 243.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:21:57,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.63 | bwd_microstep: 243.29 | bwd_inner_microstep: 243.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:58,236] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.46 | bwd_microstep: 244.47 | bwd_inner_microstep: 244.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:21:58,664] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.33 | bwd_microstep: 242.09 | bwd_inner_microstep: 242.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:21:59,114] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.06 | optimizer_gradients: 0.62 | optimizer_step: 3.09 +[2024-12-31 18:21:59,114] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 158.18 | bwd_microstep: 265.08 | bwd_inner_microstep: 226.19 | bwd_allreduce_microstep: 38.84 | step_microstep: 10.99 +[2024-12-31 18:21:59,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2860.87 | bwd: 4178.50 | bwd_inner: 4138.89 | bwd_allreduce: 39.09 | step: 13.82 + 67%|██████▋ | 505/759 [1:11:54<31:08, 7.36s/it] {'loss': 1.2062, 'learning_rate': 5.323719990601459e-06, 'epoch': 0.67} + 67%|██████▋ | 505/759 [1:11:54<31:08, 7.36s/it][2024-12-31 18:21:59,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 247.22 | bwd_microstep: 370.08 | bwd_inner_microstep: 369.73 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:22:00,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 195.74 | bwd_microstep: 294.43 | bwd_inner_microstep: 294.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:22:00,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.00 | bwd_microstep: 291.50 | bwd_inner_microstep: 291.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:01,225] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 185.22 | bwd_microstep: 263.28 | bwd_inner_microstep: 263.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:01,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.75 | bwd_microstep: 262.07 | bwd_inner_microstep: 262.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:22:02,132] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.45 | bwd_microstep: 255.46 | bwd_inner_microstep: 255.10 | bwd_allreduce_microstep: 0.18 | step_microstep: 0.28 +[2024-12-31 18:22:02,619] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.09 | bwd_microstep: 294.29 | bwd_inner_microstep: 293.82 | bwd_allreduce_microstep: 0.27 | step_microstep: 0.32 +[2024-12-31 18:22:03,057] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.43 | bwd_microstep: 245.83 | bwd_inner_microstep: 245.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:22:03,491] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.40 | bwd_microstep: 245.80 | bwd_inner_microstep: 245.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:03,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.48 | bwd_microstep: 245.99 | bwd_inner_microstep: 245.87 | bwd_allreduce_microstep: 0.02 | step_microstep: 0.19 +[2024-12-31 18:22:04,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.60 | bwd_microstep: 243.56 | bwd_inner_microstep: 243.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:22:04,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.82 | bwd_microstep: 259.20 | bwd_inner_microstep: 259.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:05,238] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.54 | bwd_microstep: 243.66 | bwd_inner_microstep: 243.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:22:05,662] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.58 | bwd_microstep: 247.71 | bwd_inner_microstep: 247.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:22:06,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.43 | bwd_microstep: 243.80 | bwd_inner_microstep: 243.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:22:06,549] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.67 | optimizer_gradients: 0.61 | optimizer_step: 3.24 +[2024-12-31 18:22:06,550] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.16 | bwd_microstep: 275.12 | bwd_inner_microstep: 242.71 | bwd_allreduce_microstep: 32.34 | step_microstep: 11.86 +[2024-12-31 18:22:06,551] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2875.34 | bwd: 4282.01 | bwd_inner: 4247.82 | bwd_allreduce: 33.15 | step: 14.65 + 67%|██████▋ | 506/759 [1:12:01<31:07, 7.38s/it] {'loss': 1.2327, 'learning_rate': 5.286032631740023e-06, 'epoch': 0.67} + 67%|██████▋ | 506/759 [1:12:01<31:07, 7.38s/it][2024-12-31 18:22:07,159] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 226.95 | bwd_microstep: 367.06 | bwd_inner_microstep: 366.71 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:22:07,659] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.20 | bwd_microstep: 290.99 | bwd_inner_microstep: 290.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:22:08,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.36 | bwd_microstep: 302.50 | bwd_inner_microstep: 302.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:22:08,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.92 | bwd_microstep: 255.01 | bwd_inner_microstep: 254.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:09,053] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.67 | bwd_microstep: 251.29 | bwd_inner_microstep: 251.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:22:09,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.31 | bwd_microstep: 247.20 | bwd_inner_microstep: 247.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:09,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.47 | bwd_microstep: 246.77 | bwd_inner_microstep: 246.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:10,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.39 | bwd_microstep: 247.28 | bwd_inner_microstep: 247.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:22:10,797] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.34 | bwd_microstep: 244.16 | bwd_inner_microstep: 243.96 | bwd_allreduce_microstep: 0.04 | step_microstep: 0.28 +[2024-12-31 18:22:11,267] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.43 | bwd_microstep: 274.94 | bwd_inner_microstep: 274.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:11,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.05 | bwd_microstep: 255.76 | bwd_inner_microstep: 255.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:22:12,138] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.47 | bwd_microstep: 242.87 | bwd_inner_microstep: 242.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:22:12,569] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.70 | bwd_microstep: 243.03 | bwd_inner_microstep: 242.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:22:12,998] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.54 | bwd_microstep: 240.74 | bwd_inner_microstep: 240.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:22:13,428] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.24 | bwd_microstep: 243.19 | bwd_inner_microstep: 243.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:22:13,880] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.56 | optimizer_gradients: 1.12 | optimizer_step: 3.46 +[2024-12-31 18:22:13,880] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.57 | bwd_microstep: 255.65 | bwd_inner_microstep: 241.60 | bwd_allreduce_microstep: 13.93 | step_microstep: 12.20 +[2024-12-31 18:22:13,882] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2830.02 | bwd: 4208.73 | bwd_inner: 4193.48 | bwd_allreduce: 14.32 | step: 14.93 + 67%|██████▋ | 507/759 [1:12:08<30:56, 7.37s/it] {'loss': 1.2644, 'learning_rate': 5.248431160391963e-06, 'epoch': 0.67} + 67%|██████▋ | 507/759 [1:12:08<30:56, 7.37s/it][2024-12-31 18:22:14,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.55 | bwd_microstep: 313.66 | bwd_inner_microstep: 313.33 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:22:14,904] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.57 | bwd_microstep: 284.00 | bwd_inner_microstep: 283.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:22:15,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.01 | bwd_microstep: 264.19 | bwd_inner_microstep: 264.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:22:15,815] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.77 | bwd_microstep: 262.60 | bwd_inner_microstep: 262.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:22:16,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.22 | bwd_microstep: 254.28 | bwd_inner_microstep: 254.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:22:16,699] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.36 | bwd_microstep: 254.56 | bwd_inner_microstep: 254.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:22:17,143] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.25 | bwd_microstep: 252.84 | bwd_inner_microstep: 252.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.38 +[2024-12-31 18:22:17,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.65 | bwd_microstep: 247.16 | bwd_inner_microstep: 247.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:22:18,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.70 | bwd_microstep: 244.26 | bwd_inner_microstep: 244.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:18,454] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 251.96 | bwd_inner_microstep: 251.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:22:18,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.67 | bwd_microstep: 243.36 | bwd_inner_microstep: 243.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:22:19,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.98 | bwd_microstep: 243.33 | bwd_inner_microstep: 243.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:19,739] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.75 | bwd_microstep: 244.16 | bwd_inner_microstep: 244.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.11 +[2024-12-31 18:22:20,168] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.01 | bwd_microstep: 243.56 | bwd_inner_microstep: 243.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:22:20,601] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.37 | bwd_microstep: 241.27 | bwd_inner_microstep: 241.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:21,168] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.37 | optimizer_gradients: 0.60 | optimizer_step: 3.09 +[2024-12-31 18:22:21,168] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.90 | bwd_microstep: 364.72 | bwd_inner_microstep: 243.69 | bwd_allreduce_microstep: 120.99 | step_microstep: 11.08 +[2024-12-31 18:22:21,169] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2820.71 | bwd: 4210.10 | bwd_inner: 4088.01 | bwd_allreduce: 121.32 | step: 13.70 + 67%|██████▋ | 508/759 [1:12:16<30:42, 7.34s/it] {'loss': 1.194, 'learning_rate': 5.2109162616483325e-06, 'epoch': 0.67} + 67%|██████▋ | 508/759 [1:12:16<30:42, 7.34s/it][2024-12-31 18:22:21,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 219.00 | bwd_microstep: 355.23 | bwd_inner_microstep: 354.86 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:22:22,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.39 | bwd_microstep: 283.50 | bwd_inner_microstep: 283.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:22:22,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.85 | bwd_microstep: 257.53 | bwd_inner_microstep: 257.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:23,158] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.30 | bwd_microstep: 262.12 | bwd_inner_microstep: 262.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:22:23,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.28 | bwd_microstep: 248.01 | bwd_inner_microstep: 247.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:22:24,048] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.60 | bwd_microstep: 255.25 | bwd_inner_microstep: 255.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:24,487] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.24 | bwd_microstep: 248.79 | bwd_inner_microstep: 248.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:22:24,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.24 | bwd_microstep: 245.41 | bwd_inner_microstep: 245.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:22:25,364] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.14 | bwd_microstep: 247.29 | bwd_inner_microstep: 247.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:25,795] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 243.94 | bwd_inner_microstep: 243.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:22:26,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.57 | bwd_microstep: 243.92 | bwd_inner_microstep: 243.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:22:26,667] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.44 | bwd_microstep: 243.47 | bwd_inner_microstep: 243.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:27,099] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 243.16 | bwd_inner_microstep: 243.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:27,533] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.55 | bwd_microstep: 244.69 | bwd_inner_microstep: 244.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:27,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.73 | bwd_microstep: 244.49 | bwd_inner_microstep: 244.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:22:28,559] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.94 | optimizer_gradients: 0.55 | optimizer_step: 3.10 +[2024-12-31 18:22:28,560] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.67 | bwd_microstep: 391.31 | bwd_inner_microstep: 243.77 | bwd_allreduce_microstep: 147.49 | step_microstep: 10.45 +[2024-12-31 18:22:28,561] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2835.79 | bwd: 4258.19 | bwd_inner: 4109.90 | bwd_allreduce: 147.74 | step: 13.48 + 67%|██████▋ | 509/759 [1:12:23<30:39, 7.36s/it] {'loss': 1.2536, 'learning_rate': 5.1734886190228496e-06, 'epoch': 0.67} + 67%|██████▋ | 509/759 [1:12:23<30:39, 7.36s/it][2024-12-31 18:22:29,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 235.41 | bwd_microstep: 391.26 | bwd_inner_microstep: 390.91 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.21 +[2024-12-31 18:22:29,776] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 216.08 | bwd_microstep: 341.41 | bwd_inner_microstep: 341.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:30,244] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.78 | bwd_microstep: 270.82 | bwd_inner_microstep: 270.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:22:30,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.80 | bwd_microstep: 262.18 | bwd_inner_microstep: 262.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:31,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.16 | bwd_microstep: 257.53 | bwd_inner_microstep: 257.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:31,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.98 | bwd_microstep: 254.34 | bwd_inner_microstep: 254.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:22:32,042] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.66 | bwd_microstep: 249.86 | bwd_inner_microstep: 249.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:22:32,480] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.82 | bwd_microstep: 248.57 | bwd_inner_microstep: 248.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:22:32,913] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.71 | bwd_microstep: 245.12 | bwd_inner_microstep: 245.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:22:33,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 243.92 | bwd_inner_microstep: 243.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:33,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.70 | bwd_microstep: 244.11 | bwd_inner_microstep: 244.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:34,218] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.18 | bwd_microstep: 244.81 | bwd_inner_microstep: 244.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:22:34,648] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.42 | bwd_microstep: 243.00 | bwd_inner_microstep: 242.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:22:35,074] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.47 | bwd_microstep: 241.15 | bwd_inner_microstep: 241.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:35,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.31 | bwd_microstep: 243.49 | bwd_inner_microstep: 243.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:35,954] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.59 | optimizer_gradients: 0.67 | optimizer_step: 3.37 +[2024-12-31 18:22:35,955] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.05 | bwd_microstep: 255.43 | bwd_inner_microstep: 241.68 | bwd_allreduce_microstep: 13.65 | step_microstep: 10.93 +[2024-12-31 18:22:35,958] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2871.70 | bwd: 4237.09 | bwd_inner: 4222.52 | bwd_allreduce: 13.93 | step: 13.94 + 67%|██████▋ | 510/759 [1:12:30<30:35, 7.37s/it] {'loss': 1.2289, 'learning_rate': 5.136148914439441e-06, 'epoch': 0.67} + 67%|██████▋ | 510/759 [1:12:30<30:35, 7.37s/it][2024-12-31 18:22:36,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 225.01 | bwd_microstep: 357.40 | bwd_inner_microstep: 357.05 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:22:37,151] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 224.73 | bwd_microstep: 352.14 | bwd_inner_microstep: 352.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:22:37,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.29 | bwd_microstep: 263.00 | bwd_inner_microstep: 262.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:22:38,048] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.37 | bwd_microstep: 248.73 | bwd_inner_microstep: 248.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:22:38,500] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.44 | bwd_microstep: 255.12 | bwd_inner_microstep: 255.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:22:38,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.54 | bwd_microstep: 249.61 | bwd_inner_microstep: 249.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:22:39,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.84 | bwd_microstep: 245.46 | bwd_inner_microstep: 245.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:22:39,812] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.02 | bwd_microstep: 244.89 | bwd_inner_microstep: 244.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:22:40,245] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.08 | bwd_microstep: 244.38 | bwd_inner_microstep: 244.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:40,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.80 | bwd_microstep: 244.90 | bwd_inner_microstep: 244.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:41,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 244.06 | bwd_inner_microstep: 244.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:41,544] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.25 | bwd_microstep: 244.56 | bwd_inner_microstep: 244.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:22:42,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 196.86 | bwd_microstep: 243.50 | bwd_inner_microstep: 243.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:22:42,433] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.27 | bwd_microstep: 244.36 | bwd_inner_microstep: 244.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:22:42,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 244.41 | bwd_inner_microstep: 244.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:22:43,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.89 | optimizer_gradients: 0.59 | optimizer_step: 3.27 +[2024-12-31 18:22:43,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.22 | bwd_microstep: 375.91 | bwd_inner_microstep: 241.61 | bwd_allreduce_microstep: 134.25 | step_microstep: 11.09 +[2024-12-31 18:22:43,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2879.45 | bwd: 4302.63 | bwd_inner: 4167.41 | bwd_allreduce: 134.50 | step: 14.29 + 67%|██████▋ | 511/759 [1:12:38<30:35, 7.40s/it] {'loss': 1.2126, 'learning_rate': 5.098897828219831e-06, 'epoch': 0.67} + 67%|██████▋ | 511/759 [1:12:38<30:35, 7.40s/it][2024-12-31 18:22:43,973] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.44 | bwd_microstep: 312.20 | bwd_inner_microstep: 311.87 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:22:44,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.70 | bwd_microstep: 289.94 | bwd_inner_microstep: 289.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:22:44,977] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.78 | bwd_microstep: 292.60 | bwd_inner_microstep: 292.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:22:45,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.85 | bwd_microstep: 263.14 | bwd_inner_microstep: 263.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:45,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.29 | bwd_microstep: 262.57 | bwd_inner_microstep: 262.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:22:46,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.21 | bwd_microstep: 255.81 | bwd_inner_microstep: 255.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:22:46,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.76 | bwd_microstep: 247.55 | bwd_inner_microstep: 247.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:22:47,227] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.91 | bwd_microstep: 248.13 | bwd_inner_microstep: 248.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:22:47,671] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.68 | bwd_microstep: 247.06 | bwd_inner_microstep: 247.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:22:48,106] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.77 | bwd_microstep: 244.26 | bwd_inner_microstep: 244.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:22:48,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.85 | bwd_microstep: 244.03 | bwd_inner_microstep: 244.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:22:48,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.47 | bwd_microstep: 244.20 | bwd_inner_microstep: 244.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:22:49,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.23 | bwd_microstep: 270.64 | bwd_inner_microstep: 270.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:22:49,854] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.34 | bwd_microstep: 242.34 | bwd_inner_microstep: 242.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:22:50,277] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 243.36 | bwd_inner_microstep: 243.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:22:50,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.04 | optimizer_gradients: 0.67 | optimizer_step: 3.10 +[2024-12-31 18:22:50,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.19 | bwd_microstep: 344.32 | bwd_inner_microstep: 244.30 | bwd_allreduce_microstep: 99.97 | step_microstep: 10.91 +[2024-12-31 18:22:50,820] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2839.80 | bwd: 4252.21 | bwd_inner: 4151.45 | bwd_allreduce: 100.21 | step: 14.08 + 67%|██████▋ | 512/759 [1:12:45<30:26, 7.40s/it] {'loss': 1.188, 'learning_rate': 5.061736039071124e-06, 'epoch': 0.67} + 67%|██████▋ | 512/759 [1:12:45<30:26, 7.40s/it][2024-12-31 18:22:51,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 223.82 | bwd_microstep: 359.70 | bwd_inner_microstep: 359.37 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:22:51,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.31 | bwd_microstep: 269.38 | bwd_inner_microstep: 269.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:52,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.50 | bwd_microstep: 267.80 | bwd_inner_microstep: 267.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:22:52,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.42 | bwd_microstep: 272.56 | bwd_inner_microstep: 272.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:22:53,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.11 | bwd_microstep: 248.31 | bwd_inner_microstep: 248.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:22:53,703] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.97 | bwd_microstep: 248.70 | bwd_inner_microstep: 248.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:54,137] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.11 | bwd_microstep: 246.03 | bwd_inner_microstep: 246.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:54,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.49 | bwd_microstep: 246.19 | bwd_inner_microstep: 246.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:22:55,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.72 | bwd_microstep: 246.40 | bwd_inner_microstep: 246.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:22:55,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 245.38 | bwd_inner_microstep: 245.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:55,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 251.13 | bwd_inner_microstep: 251.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:22:56,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.06 | bwd_microstep: 243.20 | bwd_inner_microstep: 243.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:22:56,736] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 243.10 | bwd_inner_microstep: 243.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:22:57,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.37 | bwd_microstep: 243.65 | bwd_inner_microstep: 243.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:22:57,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.44 | bwd_microstep: 242.83 | bwd_inner_microstep: 242.46 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.24 +[2024-12-31 18:22:58,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.74 | optimizer_gradients: 0.61 | optimizer_step: 3.29 +[2024-12-31 18:22:58,050] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.10 | bwd_microstep: 260.34 | bwd_inner_microstep: 242.44 | bwd_allreduce_microstep: 17.82 | step_microstep: 10.35 +[2024-12-31 18:22:58,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2822.71 | bwd: 4134.82 | bwd_inner: 4115.88 | bwd_allreduce: 18.21 | step: 13.17 + 68%|████��█▊ | 513/759 [1:12:53<30:07, 7.35s/it] {'loss': 1.2643, 'learning_rate': 5.024664224073454e-06, 'epoch': 0.68} + 68%|██████▊ | 513/759 [1:12:53<30:07, 7.35s/it][2024-12-31 18:22:58,616] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.90 | bwd_microstep: 335.73 | bwd_inner_microstep: 335.39 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.22 +[2024-12-31 18:22:59,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 228.07 | bwd_microstep: 371.05 | bwd_inner_microstep: 371.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:22:59,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.75 | bwd_microstep: 291.23 | bwd_inner_microstep: 291.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:00,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.67 | bwd_microstep: 264.26 | bwd_inner_microstep: 264.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:23:00,638] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.47 | bwd_microstep: 254.18 | bwd_inner_microstep: 254.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:23:01,073] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.55 | bwd_microstep: 247.27 | bwd_inner_microstep: 247.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:23:01,510] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.86 | bwd_microstep: 248.31 | bwd_inner_microstep: 248.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:01,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.87 | bwd_microstep: 246.40 | bwd_inner_microstep: 246.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:23:02,384] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.45 | bwd_microstep: 249.86 | bwd_inner_microstep: 249.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:23:02,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.46 | bwd_microstep: 244.38 | bwd_inner_microstep: 244.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:23:03,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.59 | bwd_microstep: 244.82 | bwd_inner_microstep: 244.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:23:03,678] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.89 | bwd_microstep: 243.59 | bwd_inner_microstep: 243.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:23:04,114] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.20 | bwd_microstep: 250.26 | bwd_inner_microstep: 250.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:04,550] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.85 | bwd_microstep: 244.80 | bwd_inner_microstep: 244.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:23:04,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.92 | bwd_microstep: 243.34 | bwd_inner_microstep: 243.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:23:05,641] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.54 | optimizer_gradients: 0.81 | optimizer_step: 3.94 +[2024-12-31 18:23:05,641] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.27 | bwd_microstep: 470.04 | bwd_inner_microstep: 242.15 | bwd_allreduce_microstep: 227.85 | step_microstep: 11.20 +[2024-12-31 18:23:05,643] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2854.21 | bwd: 4449.57 | bwd_inner: 4220.97 | bwd_allreduce: 228.09 | step: 14.07 + 68%|██████▊ | 514/759 [1:13:00<30:17, 7.42s/it] {'loss': 1.2361, 'learning_rate': 4.987683058667651e-06, 'epoch': 0.68} + 68%|██████▊ | 514/759 [1:13:00<30:17, 7.42s/it][2024-12-31 18:23:06,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.43 | bwd_microstep: 350.23 | bwd_inner_microstep: 349.84 | bwd_allreduce_microstep: 0.18 | step_microstep: 0.22 +[2024-12-31 18:23:06,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.34 | bwd_microstep: 266.73 | bwd_inner_microstep: 266.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:23:07,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.83 | bwd_microstep: 263.50 | bwd_inner_microstep: 263.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:23:07,604] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.92 | bwd_microstep: 256.92 | bwd_inner_microstep: 256.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:23:08,053] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.47 | bwd_microstep: 255.16 | bwd_inner_microstep: 255.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:08,490] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.75 | bwd_microstep: 250.12 | bwd_inner_microstep: 250.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:23:08,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.08 | bwd_microstep: 245.93 | bwd_inner_microstep: 245.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:23:09,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.57 | bwd_microstep: 248.13 | bwd_inner_microstep: 248.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:23:09,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.94 | bwd_microstep: 266.99 | bwd_inner_microstep: 266.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:10,247] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.51 | bwd_microstep: 243.49 | bwd_inner_microstep: 243.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:10,678] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.90 | bwd_microstep: 243.60 | bwd_inner_microstep: 243.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:23:11,120] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.41 | bwd_microstep: 254.83 | bwd_inner_microstep: 254.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:11,551] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.42 | bwd_microstep: 243.46 | bwd_inner_microstep: 243.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:23:11,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.31 | bwd_microstep: 243.40 | bwd_inner_microstep: 243.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:23:12,382] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 157.61 | bwd_microstep: 225.26 | bwd_inner_microstep: 225.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:23:12,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.72 | optimizer_gradients: 0.60 | optimizer_step: 3.10 +[2024-12-31 18:23:12,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.30 | bwd_microstep: 293.93 | bwd_inner_microstep: 242.03 | bwd_allreduce_microstep: 51.85 | step_microstep: 12.22 +[2024-12-31 18:23:12,873] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2787.19 | bwd: 4151.76 | bwd_inner: 4099.07 | bwd_allreduce: 52.15 | step: 15.22 + 68%|██████▊ | 515/759 [1:13:07<29:56, 7.36s/it] {'loss': 1.2565, 'learning_rate': 4.950793216642923e-06, 'epoch': 0.68} + 68%|██████▊ | 515/759 [1:13:07<29:56, 7.36s/it][2024-12-31 18:23:13,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.50 | bwd_microstep: 335.35 | bwd_inner_microstep: 334.99 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.22 +[2024-12-31 18:23:13,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 200.54 | bwd_microstep: 306.03 | bwd_inner_microstep: 306.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:23:14,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.08 | bwd_microstep: 282.12 | bwd_inner_microstep: 282.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:23:14,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.87 | bwd_microstep: 266.53 | bwd_inner_microstep: 266.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:15,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.75 | bwd_microstep: 258.05 | bwd_inner_microstep: 258.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:23:15,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.22 | bwd_microstep: 247.02 | bwd_inner_microstep: 246.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:16,245] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.91 | bwd_microstep: 248.39 | bwd_inner_microstep: 248.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:16,678] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 244.29 | bwd_inner_microstep: 244.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:23:17,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 199.29 | bwd_microstep: 245.92 | bwd_inner_microstep: 245.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:17,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.06 | bwd_microstep: 244.56 | bwd_inner_microstep: 244.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:23:18,015] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.87 | bwd_microstep: 250.44 | bwd_inner_microstep: 250.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:23:18,443] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.24 | bwd_microstep: 243.16 | bwd_inner_microstep: 243.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:18,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.28 | bwd_microstep: 243.28 | bwd_inner_microstep: 243.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:23:19,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.51 | bwd_microstep: 242.42 | bwd_inner_microstep: 242.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:23:19,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.12 | bwd_microstep: 243.41 | bwd_inner_microstep: 243.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:23:20,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.59 | optimizer_gradients: 0.77 | optimizer_step: 3.44 +[2024-12-31 18:23:20,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 262.20 | bwd_inner_microstep: 248.45 | bwd_allreduce_microstep: 13.62 | step_microstep: 10.98 +[2024-12-31 18:23:20,204] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2871.20 | bwd: 4163.29 | bwd_inner: 4148.76 | bwd_allreduce: 13.91 | step: 13.88 + 68%|██████▊ | 516/759 [1:13:15<29:47, 7.35s/it] {'loss': 1.2317, 'learning_rate': 4.913995370124578e-06, 'epoch': 0.68} + 68%|██████▊ | 516/759 [1:13:15<29:47, 7.35s/it][2024-12-31 18:23:20,767] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.57 | bwd_microstep: 334.01 | bwd_inner_microstep: 333.65 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:23:21,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.31 | bwd_microstep: 355.37 | bwd_inner_microstep: 355.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:21,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 184.20 | bwd_microstep: 265.83 | bwd_inner_microstep: 265.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:23:22,267] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.63 | bwd_microstep: 262.82 | bwd_inner_microstep: 262.45 | bwd_allreduce_microstep: 0.23 | step_microstep: 0.28 +[2024-12-31 18:23:22,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 195.84 | bwd_microstep: 247.83 | bwd_inner_microstep: 247.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.30 +[2024-12-31 18:23:23,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.40 | bwd_microstep: 245.87 | bwd_inner_microstep: 245.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:23,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.34 | bwd_microstep: 251.12 | bwd_inner_microstep: 251.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:23:24,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 244.45 | bwd_inner_microstep: 244.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:23:24,510] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.10 | bwd_microstep: 268.63 | bwd_inner_microstep: 268.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.29 +[2024-12-31 18:23:24,953] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.76 | bwd_microstep: 243.26 | bwd_inner_microstep: 243.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:23:25,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.22 | bwd_microstep: 241.09 | bwd_inner_microstep: 241.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:23:25,809] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.19 | bwd_microstep: 243.03 | bwd_inner_microstep: 243.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:26,252] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.84 | bwd_microstep: 243.73 | bwd_inner_microstep: 243.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:26,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 243.21 | bwd_inner_microstep: 243.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:27,119] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.98 | bwd_microstep: 240.86 | bwd_inner_microstep: 240.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:27,703] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 26.65 | optimizer_gradients: 4.14 | optimizer_step: 13.38 +[2024-12-31 18:23:27,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.81 | bwd_microstep: 354.52 | bwd_inner_microstep: 340.76 | bwd_allreduce_microstep: 13.64 | step_microstep: 46.43 +[2024-12-31 18:23:27,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2858.60 | bwd: 4285.84 | bwd_inner: 4270.71 | bwd_allreduce: 14.22 | step: 49.57 + 68%|██████▊ | 517/759 [1:13:22<29:52, 7.41s/it] {'loss': 1.2132, 'learning_rate': 4.877290189561795e-06, 'epoch': 0.68} + 68%|██████▊ | 517/759 [1:13:22<29:52, 7.41s/it][2024-12-31 18:23:28,341] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 219.87 | bwd_microstep: 345.11 | bwd_inner_microstep: 344.76 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:23:28,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.56 | bwd_microstep: 296.77 | bwd_inner_microstep: 296.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:29,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.33 | bwd_microstep: 280.82 | bwd_inner_microstep: 280.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:29,796] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.94 | bwd_microstep: 263.63 | bwd_inner_microstep: 263.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:30,255] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.06 | bwd_microstep: 257.86 | bwd_inner_microstep: 257.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:23:30,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.60 | bwd_microstep: 256.38 | bwd_inner_microstep: 256.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:23:31,142] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.30 | bwd_microstep: 248.81 | bwd_inner_microstep: 248.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:31,588] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.59 | bwd_microstep: 256.89 | bwd_inner_microstep: 256.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.29 +[2024-12-31 18:23:32,027] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.67 | bwd_microstep: 247.46 | bwd_inner_microstep: 247.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:23:32,459] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.33 | bwd_microstep: 244.85 | bwd_inner_microstep: 244.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:23:32,892] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 244.53 | bwd_inner_microstep: 244.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:33,322] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.45 | bwd_microstep: 243.76 | bwd_inner_microstep: 243.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:23:33,755] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.81 | bwd_microstep: 244.08 | bwd_inner_microstep: 244.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:23:34,188] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.96 | bwd_microstep: 244.64 | bwd_inner_microstep: 244.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:34,621] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.39 | bwd_microstep: 244.86 | bwd_inner_microstep: 244.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:23:35,240] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.17 | optimizer_gradients: 0.57 | optimizer_step: 3.09 +[2024-12-31 18:23:35,241] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.39 | bwd_microstep: 392.25 | bwd_inner_microstep: 244.23 | bwd_allreduce_microstep: 147.97 | step_microstep: 42.09 +[2024-12-31 18:23:35,242] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2840.91 | bwd: 4312.87 | bwd_inner: 4163.88 | bwd_allreduce: 148.26 | step: 45.09 + 68%|██████▊ | 518/759 [1:13:30<29:52, 7.44s/it] {'loss': 1.214, 'learning_rate': 4.840678343715399e-06, 'epoch': 0.68} + 68%|██████▊ | 518/759 [1:13:30<29:52, 7.44s/it][2024-12-31 18:23:35,768] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.23 | bwd_microstep: 311.80 | bwd_inner_microstep: 311.46 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.18 +[2024-12-31 18:23:36,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.57 | bwd_microstep: 293.55 | bwd_inner_microstep: 293.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:23:36,746] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.68 | bwd_microstep: 267.01 | bwd_inner_microstep: 266.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:37,198] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.40 | bwd_microstep: 256.34 | bwd_inner_microstep: 256.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:23:37,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.65 | bwd_microstep: 255.55 | bwd_inner_microstep: 255.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:23:38,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.23 | bwd_microstep: 245.54 | bwd_inner_microstep: 245.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:38,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.24 | bwd_microstep: 246.52 | bwd_inner_microstep: 246.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:23:38,950] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.17 | bwd_microstep: 245.26 | bwd_inner_microstep: 245.12 | bwd_allreduce_microstep: 0.04 | step_microstep: 0.20 +[2024-12-31 18:23:39,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.71 | bwd_microstep: 242.96 | bwd_inner_microstep: 242.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:23:39,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.34 | bwd_microstep: 245.64 | bwd_inner_microstep: 245.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:23:40,241] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 244.04 | bwd_inner_microstep: 244.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:40,686] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.98 | bwd_microstep: 255.84 | bwd_inner_microstep: 255.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:41,117] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.47 | bwd_microstep: 243.12 | bwd_inner_microstep: 243.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:23:41,551] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.71 | bwd_microstep: 243.42 | bwd_inner_microstep: 243.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:23:41,971] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.55 | bwd_microstep: 241.02 | bwd_inner_microstep: 240.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:23:42,465] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 19.74 | optimizer_gradients: 13.92 | optimizer_step: 3.28 +[2024-12-31 18:23:42,465] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 268.23 | bwd_inner_microstep: 243.81 | bwd_allreduce_microstep: 24.33 | step_microstep: 38.83 +[2024-12-31 18:23:42,467] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2817.93 | bwd: 4106.02 | bwd_inner: 4080.55 | bwd_allreduce: 24.71 | step: 41.38 + 68%|██████▊ | 519/759 [1:13:37<29:29, 7.37s/it] {'loss': 1.2085, 'learning_rate': 4.804160499645667e-06, 'epoch': 0.68} + 68%|██████▊ | 519/759 [1:13:37<29:29, 7.37s/it][2024-12-31 18:23:43,062] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 225.75 | bwd_microstep: 358.72 | bwd_inner_microstep: 358.35 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:23:43,560] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.43 | bwd_microstep: 288.74 | bwd_inner_microstep: 288.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:44,050] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.22 | bwd_microstep: 284.39 | bwd_inner_microstep: 284.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:23:44,510] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.71 | bwd_microstep: 263.94 | bwd_inner_microstep: 263.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:23:44,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.47 | bwd_microstep: 254.40 | bwd_inner_microstep: 254.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 18:23:45,391] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.69 | bwd_microstep: 249.18 | bwd_inner_microstep: 249.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:23:45,823] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.65 | bwd_microstep: 247.65 | bwd_inner_microstep: 247.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:46,267] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.93 | bwd_microstep: 247.02 | bwd_inner_microstep: 246.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:23:46,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.80 | bwd_microstep: 246.29 | bwd_inner_microstep: 246.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:47,144] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.06 | bwd_microstep: 254.00 | bwd_inner_microstep: 253.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:23:47,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.09 | bwd_microstep: 245.62 | bwd_inner_microstep: 245.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:48,016] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.76 | bwd_microstep: 246.71 | bwd_inner_microstep: 246.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:48,448] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.29 | bwd_microstep: 245.15 | bwd_inner_microstep: 245.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:23:48,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.42 | bwd_microstep: 249.01 | bwd_inner_microstep: 248.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:23:49,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 243.81 | bwd_inner_microstep: 243.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:23:49,807] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.84 | optimizer_gradients: 0.69 | optimizer_step: 3.24 +[2024-12-31 18:23:49,807] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.53 | bwd_microstep: 301.28 | bwd_inner_microstep: 247.01 | bwd_allreduce_microstep: 54.23 | step_microstep: 11.00 +[2024-12-31 18:23:49,809] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2845.08 | bwd: 4226.05 | bwd_inner: 4170.85 | bwd_allreduce: 54.51 | step: 13.95 + 69%|██████▊ | 520/759 [1:13:44<29:19, 7.36s/it] {'loss': 1.2343, 'learning_rate': 4.767737322700185e-06, 'epoch': 0.69} + 69%|██████▊ | 520/759 [1:13:44<29:19, 7.36s/it][2024-12-31 18:23:50,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 230.53 | bwd_microstep: 365.73 | bwd_inner_microstep: 365.35 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 18:23:50,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.73 | bwd_microstep: 291.41 | bwd_inner_microstep: 291.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:23:51,455] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.34 | bwd_microstep: 287.38 | bwd_inner_microstep: 287.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:23:51,917] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.54 | bwd_microstep: 263.80 | bwd_inner_microstep: 263.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:52,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.51 | bwd_microstep: 263.47 | bwd_inner_microstep: 263.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:52,828] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.71 | bwd_microstep: 256.12 | bwd_inner_microstep: 256.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:23:53,279] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.82 | bwd_microstep: 257.22 | bwd_inner_microstep: 257.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:53,736] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.42 | bwd_microstep: 246.33 | bwd_inner_microstep: 246.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:23:54,172] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 246.78 | bwd_inner_microstep: 246.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:23:54,606] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.51 | bwd_microstep: 244.76 | bwd_inner_microstep: 244.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:55,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.34 | bwd_microstep: 246.71 | bwd_inner_microstep: 246.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:23:55,473] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.78 | bwd_microstep: 243.00 | bwd_inner_microstep: 242.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:55,906] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.77 | bwd_microstep: 243.78 | bwd_inner_microstep: 243.65 | bwd_allreduce_microstep: 0.03 | step_microstep: 0.20 +[2024-12-31 18:23:56,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.63 | bwd_microstep: 244.34 | bwd_inner_microstep: 244.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:23:56,809] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 210.97 | bwd_microstep: 241.90 | bwd_inner_microstep: 241.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:57,260] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.34 | optimizer_gradients: 0.62 | optimizer_step: 3.32 +[2024-12-31 18:23:57,260] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.68 | bwd_microstep: 255.15 | bwd_inner_microstep: 241.52 | bwd_allreduce_microstep: 13.54 | step_microstep: 11.87 +[2024-12-31 18:23:57,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2902.44 | bwd: 4197.98 | bwd_inner: 4183.43 | bwd_allreduce: 13.90 | step: 14.99 + 69%|██████▊ | 521/759 [1:13:52<29:19, 7.39s/it] {'loss': 1.2287, 'learning_rate': 4.7314094765017325e-06, 'epoch': 0.69} + 69%|██████▊ | 521/759 [1:13:52<29:19, 7.39s/it][2024-12-31 18:23:57,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 212.44 | bwd_microstep: 314.47 | bwd_inner_microstep: 314.12 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:23:58,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.59 | bwd_microstep: 291.05 | bwd_inner_microstep: 291.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:58,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.59 | bwd_microstep: 263.10 | bwd_inner_microstep: 263.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:23:59,223] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.57 | bwd_microstep: 262.82 | bwd_inner_microstep: 262.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:23:59,662] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.90 | bwd_microstep: 248.98 | bwd_inner_microstep: 248.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:24:00,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.39 | bwd_microstep: 248.78 | bwd_inner_microstep: 248.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:24:00,539] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.32 | bwd_microstep: 250.15 | bwd_inner_microstep: 250.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:24:00,962] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.59 | bwd_microstep: 245.64 | bwd_inner_microstep: 245.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:24:01,390] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.72 | bwd_microstep: 246.17 | bwd_inner_microstep: 246.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:24:01,812] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.24 | bwd_microstep: 245.05 | bwd_inner_microstep: 245.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:24:02,246] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.12 | bwd_microstep: 244.72 | bwd_inner_microstep: 244.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:24:02,678] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.03 | bwd_microstep: 243.81 | bwd_inner_microstep: 243.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:24:03,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 244.35 | bwd_inner_microstep: 244.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:24:03,541] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.37 | bwd_microstep: 243.28 | bwd_inner_microstep: 243.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:24:03,972] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.12 | bwd_microstep: 243.42 | bwd_inner_microstep: 243.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:24:04,521] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.72 | optimizer_gradients: 0.73 | optimizer_step: 3.16 +[2024-12-31 18:24:04,521] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.43 | bwd_microstep: 351.79 | bwd_inner_microstep: 244.31 | bwd_allreduce_microstep: 107.44 | step_microstep: 10.55 +[2024-12-31 18:24:04,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2809.20 | bwd: 4187.65 | bwd_inner: 4079.41 | bwd_allreduce: 107.68 | step: 13.13 + 69%|██████▉ | 522/759 [1:13:59<29:02, 7.35s/it] {'loss': 1.2358, 'learning_rate': 4.695177622936169e-06, 'epoch': 0.69} + 69%|██████▉ | 522/759 [1:13:59<29:02, 7.35s/it][2024-12-31 18:24:05,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 219.17 | bwd_microstep: 347.96 | bwd_inner_microstep: 347.62 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:24:05,711] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 227.28 | bwd_microstep: 363.29 | bwd_inner_microstep: 363.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:24:06,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.80 | bwd_microstep: 284.29 | bwd_inner_microstep: 284.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:24:06,659] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.09 | bwd_microstep: 262.12 | bwd_inner_microstep: 262.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:24:07,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.28 | bwd_microstep: 255.68 | bwd_inner_microstep: 255.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:24:07,539] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.11 | bwd_microstep: 247.82 | bwd_inner_microstep: 247.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:24:07,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.75 | bwd_microstep: 250.07 | bwd_inner_microstep: 250.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:24:08,424] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 257.67 | bwd_inner_microstep: 257.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:24:08,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.37 | bwd_microstep: 252.97 | bwd_inner_microstep: 252.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:24:09,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.83 | bwd_microstep: 243.95 | bwd_inner_microstep: 243.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:24:09,735] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.37 | bwd_microstep: 244.68 | bwd_inner_microstep: 244.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:24:10,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 245.55 | bwd_inner_microstep: 245.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:24:10,601] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.78 | bwd_microstep: 243.59 | bwd_inner_microstep: 243.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:24:11,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.41 | bwd_microstep: 249.18 | bwd_inner_microstep: 249.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:24:11,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.01 | bwd_microstep: 243.61 | bwd_inner_microstep: 243.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:24:11,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.58 | optimizer_gradients: 1.12 | optimizer_step: 3.53 +[2024-12-31 18:24:11,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.04 | bwd_microstep: 257.72 | bwd_inner_microstep: 243.65 | bwd_allreduce_microstep: 13.94 | step_microstep: 12.52 +[2024-12-31 18:24:11,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2879.38 | bwd: 4250.26 | bwd_inner: 4235.41 | bwd_allreduce: 14.23 | step: 15.48 + 69%|██████▉ | 523/759 [1:14:06<28:59, 7.37s/it] {'loss': 1.2439, 'learning_rate': 4.659042422140399e-06, 'epoch': 0.69} + 69%|██████▉ | 523/759 [1:14:06<28:59, 7.37s/it][2024-12-31 18:24:12,506] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.63 | bwd_microstep: 343.15 | bwd_inner_microstep: 342.80 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:24:13,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.45 | bwd_microstep: 290.97 | bwd_inner_microstep: 290.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:24:13,475] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.82 | bwd_microstep: 270.43 | bwd_inner_microstep: 270.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:24:13,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.05 | bwd_microstep: 258.09 | bwd_inner_microstep: 258.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:24:14,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.29 | bwd_microstep: 255.51 | bwd_inner_microstep: 255.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.10 +[2024-12-31 18:24:14,823] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.21 | bwd_microstep: 257.25 | bwd_inner_microstep: 257.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:24:15,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.68 | bwd_microstep: 245.07 | bwd_inner_microstep: 245.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:24:15,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.91 | bwd_microstep: 247.50 | bwd_inner_microstep: 247.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:24:16,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.86 | bwd_microstep: 243.95 | bwd_inner_microstep: 243.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:24:16,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 265.84 | bwd_inner_microstep: 265.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:24:17,007] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.25 | bwd_microstep: 243.16 | bwd_inner_microstep: 243.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:24:17,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.14 | bwd_microstep: 245.30 | bwd_inner_microstep: 245.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:24:17,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.00 | bwd_microstep: 243.07 | bwd_inner_microstep: 243.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:24:18,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 247.55 | bwd_inner_microstep: 247.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:24:18,737] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.61 | bwd_microstep: 244.37 | bwd_inner_microstep: 244.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:24:19,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.89 | optimizer_gradients: 0.60 | optimizer_step: 3.20 +[2024-12-31 18:24:19,206] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.38 | bwd_microstep: 268.25 | bwd_inner_microstep: 244.24 | bwd_allreduce_microstep: 23.93 | step_microstep: 12.20 +[2024-12-31 18:24:19,207] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2816.72 | bwd: 4169.50 | bwd_inner: 4144.77 | bwd_allreduce: 24.19 | step: 15.15 + 69%|██████▉ | 524/759 [1:14:14<28:45, 7.34s/it] {'loss': 1.2239, 'learning_rate': 4.623004532490328e-06, 'epoch': 0.69} + 69%|██████▉ | 524/759 [1:14:14<28:45, 7.34s/it][2024-12-31 18:24:19,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 226.58 | bwd_microstep: 366.28 | bwd_inner_microstep: 365.91 | bwd_allreduce_microstep: 0.16 | step_microstep: 0.20 +[2024-12-31 18:24:20,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 204.50 | bwd_microstep: 315.85 | bwd_inner_microstep: 315.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:24:20,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.59 | bwd_microstep: 262.67 | bwd_inner_microstep: 262.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:24:21,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.91 | bwd_microstep: 254.60 | bwd_inner_microstep: 254.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:24:21,711] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.41 | bwd_microstep: 250.04 | bwd_inner_microstep: 250.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:24:22,158] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.26 | bwd_microstep: 249.17 | bwd_inner_microstep: 249.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.11 +[2024-12-31 18:24:22,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.52 | bwd_microstep: 247.58 | bwd_inner_microstep: 247.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:24:23,035] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.40 | bwd_microstep: 245.69 | bwd_inner_microstep: 245.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:24:23,469] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.27 | bwd_microstep: 245.12 | bwd_inner_microstep: 245.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:24:23,901] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.21 | bwd_microstep: 245.29 | bwd_inner_microstep: 245.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:24:24,330] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.21 | bwd_microstep: 242.90 | bwd_inner_microstep: 242.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:24:24,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.80 | bwd_microstep: 244.02 | bwd_inner_microstep: 243.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:24:25,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.13 | bwd_microstep: 244.18 | bwd_inner_microstep: 244.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:24:25,629] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.28 | bwd_microstep: 243.69 | bwd_inner_microstep: 243.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:24:26,060] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.84 | bwd_microstep: 241.27 | bwd_inner_microstep: 241.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:24:26,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.18 | optimizer_gradients: 0.64 | optimizer_step: 3.36 +[2024-12-31 18:24:26,533] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.03 | bwd_microstep: 273.79 | bwd_inner_microstep: 259.88 | bwd_allreduce_microstep: 13.81 | step_microstep: 12.03 +[2024-12-31 18:24:26,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2852.35 | bwd: 4172.23 | bwd_inner: 4157.55 | bwd_allreduce: 14.11 | step: 15.03 + 69%|██████▉ | 525/759 [1:14:21<28:36, 7.34s/it] {'loss': 1.2292, 'learning_rate': 4.587064610588881e-06, 'epoch': 0.69} + 69%|██████▉ | 525/759 [1:14:21<28:36, 7.34s/it][2024-12-31 18:24:27,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 225.93 | bwd_microstep: 360.68 | bwd_inner_microstep: 360.32 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:24:27,637] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.65 | bwd_microstep: 286.47 | bwd_inner_microstep: 286.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:24:28,106] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.37 | bwd_microstep: 268.03 | bwd_inner_microstep: 268.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:24:28,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.10 | bwd_microstep: 286.21 | bwd_inner_microstep: 286.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:24:29,047] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.03 | bwd_microstep: 261.55 | bwd_inner_microstep: 261.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:24:29,487] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.29 | bwd_microstep: 250.59 | bwd_inner_microstep: 250.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:24:29,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.22 | bwd_microstep: 245.59 | bwd_inner_microstep: 245.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:24:30,350] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.43 | bwd_microstep: 244.76 | bwd_inner_microstep: 244.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:24:30,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.20 | bwd_microstep: 245.99 | bwd_inner_microstep: 245.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:24:31,217] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 245.61 | bwd_inner_microstep: 245.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:24:31,647] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.70 | bwd_microstep: 243.33 | bwd_inner_microstep: 243.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:24:32,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 265.67 | bwd_inner_microstep: 265.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:24:32,533] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.55 | bwd_microstep: 245.01 | bwd_inner_microstep: 244.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:24:32,961] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.05 | bwd_microstep: 241.35 | bwd_inner_microstep: 241.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:24:33,390] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.53 | bwd_microstep: 243.10 | bwd_inner_microstep: 243.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:24:33,906] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.18 | optimizer_gradients: 0.57 | optimizer_step: 3.09 +[2024-12-31 18:24:33,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.92 | bwd_microstep: 322.25 | bwd_inner_microstep: 258.72 | bwd_allreduce_microstep: 63.49 | step_microstep: 11.08 +[2024-12-31 18:24:33,908] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2826.00 | bwd: 4256.25 | bwd_inner: 4192.00 | bwd_allreduce: 63.73 | step: 14.15 + 69%|██████▉ | 526/759 [1:14:28<28:31, 7.35s/it] {'loss': 1.227, 'learning_rate': 4.551223311254013e-06, 'epoch': 0.69} + 69%|██████▉ | 526/759 [1:14:28<28:31, 7.35s/it][2024-12-31 18:24:34,514] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 227.10 | bwd_microstep: 365.04 | bwd_inner_microstep: 364.70 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:24:35,007] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.44 | bwd_microstep: 285.40 | bwd_inner_microstep: 285.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:24:35,470] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.89 | bwd_microstep: 268.69 | bwd_inner_microstep: 268.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:24:35,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.21 | bwd_microstep: 263.26 | bwd_inner_microstep: 263.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:24:36,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.93 | bwd_microstep: 262.52 | bwd_inner_microstep: 262.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:24:36,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.01 | bwd_microstep: 246.40 | bwd_inner_microstep: 246.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:24:37,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.39 | bwd_microstep: 248.17 | bwd_inner_microstep: 248.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:24:37,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.76 | bwd_microstep: 263.48 | bwd_inner_microstep: 263.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:24:38,138] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.06 | bwd_microstep: 245.37 | bwd_inner_microstep: 245.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:24:38,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.34 | bwd_microstep: 246.23 | bwd_inner_microstep: 246.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:24:39,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.71 | bwd_microstep: 244.05 | bwd_inner_microstep: 244.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:24:39,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.95 | bwd_microstep: 250.66 | bwd_inner_microstep: 250.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:24:39,868] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.02 | bwd_microstep: 243.93 | bwd_inner_microstep: 243.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:24:40,302] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.87 | bwd_microstep: 243.46 | bwd_inner_microstep: 243.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:24:40,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.45 | bwd_microstep: 243.59 | bwd_inner_microstep: 243.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:24:41,225] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.20 | optimizer_gradients: 0.56 | optimizer_step: 3.10 +[2024-12-31 18:24:41,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.36 | bwd_microstep: 296.81 | bwd_inner_microstep: 244.19 | bwd_allreduce_microstep: 52.57 | step_microstep: 10.62 +[2024-12-31 18:24:41,227] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2831.97 | bwd: 4217.11 | bwd_inner: 4163.78 | bwd_allreduce: 52.81 | step: 13.12 + 69%|██████▉ | 527/759 [1:14:36<28:22, 7.34s/it] {'loss': 1.2171, 'learning_rate': 4.515481287506811e-06, 'epoch': 0.69} + 69%|██████▉ | 527/759 [1:14:36<28:22, 7.34s/it][2024-12-31 18:24:41,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 226.01 | bwd_microstep: 362.31 | bwd_inner_microstep: 361.95 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 18:24:42,332] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.46 | bwd_microstep: 290.37 | bwd_inner_microstep: 290.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:24:42,883] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 224.03 | bwd_microstep: 310.09 | bwd_inner_microstep: 310.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:24:43,332] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.39 | bwd_microstep: 261.67 | bwd_inner_microstep: 261.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:24:43,770] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.16 | bwd_microstep: 254.94 | bwd_inner_microstep: 254.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:24:44,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.07 | bwd_microstep: 247.44 | bwd_inner_microstep: 247.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:24:44,622] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.38 | bwd_microstep: 247.43 | bwd_inner_microstep: 247.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:24:45,055] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 256.06 | bwd_inner_microstep: 256.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:24:45,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.03 | bwd_microstep: 244.90 | bwd_inner_microstep: 244.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:24:45,897] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.19 | bwd_microstep: 243.55 | bwd_inner_microstep: 243.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:24:46,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.08 | bwd_microstep: 243.16 | bwd_inner_microstep: 243.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:24:46,739] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.23 | bwd_microstep: 244.17 | bwd_inner_microstep: 244.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:24:47,160] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.58 | bwd_microstep: 243.87 | bwd_inner_microstep: 243.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:24:47,577] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.78 | bwd_microstep: 242.18 | bwd_inner_microstep: 242.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:24:47,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.90 | bwd_microstep: 241.24 | bwd_inner_microstep: 241.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:24:48,809] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.59 | optimizer_gradients: 1.53 | optimizer_step: 3.30 +[2024-12-31 18:24:48,809] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 158.07 | bwd_microstep: 640.79 | bwd_inner_microstep: 226.51 | bwd_allreduce_microstep: 414.24 | step_microstep: 11.66 +[2024-12-31 18:24:48,811] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2856.48 | bwd: 4574.23 | bwd_inner: 4159.22 | bwd_allreduce: 414.48 | step: 12.60 + 70%|██████▉ | 528/759 [1:14:43<28:32, 7.41s/it] {'loss': 1.2125, 'learning_rate': 4.479839190559583e-06, 'epoch': 0.7} + 70%|██████▉ | 528/759 [1:14:43<28:32, 7.41s/it][2024-12-31 18:24:49,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.30 | bwd_microstep: 337.02 | bwd_inner_microstep: 336.91 | bwd_allreduce_microstep: 0.04 | step_microstep: 0.07 +[2024-12-31 18:24:49,862] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.80 | bwd_microstep: 288.75 | bwd_inner_microstep: 288.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:24:50,318] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.98 | bwd_microstep: 266.57 | bwd_inner_microstep: 266.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:24:50,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.02 | bwd_microstep: 256.40 | bwd_inner_microstep: 256.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:24:51,209] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.98 | bwd_microstep: 256.31 | bwd_inner_microstep: 256.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:24:51,646] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.06 | bwd_microstep: 247.97 | bwd_inner_microstep: 247.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:24:52,091] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.46 | bwd_microstep: 248.54 | bwd_inner_microstep: 248.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:24:52,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 245.49 | bwd_inner_microstep: 245.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:24:52,950] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 244.23 | bwd_inner_microstep: 244.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:24:53,384] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.02 | bwd_microstep: 244.83 | bwd_inner_microstep: 244.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:24:53,815] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.20 | bwd_microstep: 248.05 | bwd_inner_microstep: 248.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:24:54,243] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 245.11 | bwd_inner_microstep: 245.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:24:54,674] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.61 | bwd_microstep: 243.89 | bwd_inner_microstep: 243.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:24:55,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.43 | bwd_microstep: 242.94 | bwd_inner_microstep: 242.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:24:55,541] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.43 | bwd_microstep: 243.73 | bwd_inner_microstep: 243.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:24:56,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.98 | optimizer_gradients: 0.71 | optimizer_step: 3.16 +[2024-12-31 18:24:56,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.56 | bwd_microstep: 371.33 | bwd_inner_microstep: 255.30 | bwd_allreduce_microstep: 115.99 | step_microstep: 11.12 +[2024-12-31 18:24:56,109] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2829.69 | bwd: 4231.21 | bwd_inner: 4114.63 | bwd_allreduce: 116.15 | step: 13.25 + 70%|██████▉ | 529/759 [1:14:51<28:16, 7.38s/it] {'loss': 1.2338, 'learning_rate': 4.444297669803981e-06, 'epoch': 0.7} + 70%|██████▉ | 529/759 [1:14:51<28:16, 7.38s/it][2024-12-31 18:24:56,711] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 224.90 | bwd_microstep: 359.26 | bwd_inner_microstep: 358.92 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:24:57,406] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 253.59 | bwd_microstep: 423.70 | bwd_inner_microstep: 423.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:24:57,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.26 | bwd_microstep: 267.52 | bwd_inner_microstep: 267.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:24:58,329] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.34 | bwd_microstep: 261.09 | bwd_inner_microstep: 261.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:24:58,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.00 | bwd_microstep: 257.64 | bwd_inner_microstep: 257.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:24:59,229] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.72 | bwd_microstep: 259.19 | bwd_inner_microstep: 259.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:24:59,667] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.22 | bwd_microstep: 250.05 | bwd_inner_microstep: 250.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:25:00,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.58 | bwd_microstep: 248.67 | bwd_inner_microstep: 248.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:25:00,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.37 | bwd_microstep: 244.53 | bwd_inner_microstep: 244.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:01,011] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.57 | bwd_microstep: 288.70 | bwd_inner_microstep: 288.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:01,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.68 | bwd_microstep: 247.00 | bwd_inner_microstep: 246.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:25:01,878] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.32 | bwd_microstep: 243.90 | bwd_inner_microstep: 243.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:25:02,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.47 | bwd_microstep: 243.28 | bwd_inner_microstep: 243.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:25:02,736] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.65 | bwd_microstep: 241.62 | bwd_inner_microstep: 241.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:03,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.81 | bwd_microstep: 240.89 | bwd_inner_microstep: 240.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:25:03,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.79 | optimizer_step: 3.31 +[2024-12-31 18:25:03,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.34 | bwd_microstep: 256.74 | bwd_inner_microstep: 242.84 | bwd_allreduce_microstep: 13.79 | step_microstep: 10.86 +[2024-12-31 18:25:03,618] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2882.18 | bwd: 4333.89 | bwd_inner: 4319.21 | bwd_allreduce: 14.06 | step: 13.84 + 70%|██████▉ | 530/759 [1:14:58<28:18, 7.42s/it] {'loss': 1.2236, 'learning_rate': 4.408857372799179e-06, 'epoch': 0.7} + 70%|██████▉ | 530/759 [1:14:58<28:18, 7.42s/it][2024-12-31 18:25:04,207] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 222.23 | bwd_microstep: 353.60 | bwd_inner_microstep: 353.43 | bwd_allreduce_microstep: 0.07 | step_microstep: 0.10 +[2024-12-31 18:25:04,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 194.93 | bwd_microstep: 287.86 | bwd_inner_microstep: 287.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:25:05,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.90 | bwd_microstep: 269.56 | bwd_inner_microstep: 269.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:25:05,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.23 | bwd_microstep: 256.18 | bwd_inner_microstep: 256.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:25:06,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.87 | bwd_microstep: 250.56 | bwd_inner_microstep: 250.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:06,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.20 | bwd_microstep: 249.75 | bwd_inner_microstep: 249.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:25:06,935] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.46 | bwd_microstep: 247.11 | bwd_inner_microstep: 247.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:25:07,374] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.23 | bwd_microstep: 245.23 | bwd_inner_microstep: 245.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:25:07,797] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.62 | bwd_microstep: 244.54 | bwd_inner_microstep: 244.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:25:08,229] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.53 | bwd_microstep: 242.82 | bwd_inner_microstep: 242.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:25:08,662] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.23 | bwd_microstep: 243.93 | bwd_inner_microstep: 243.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:09,092] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.76 | bwd_microstep: 243.17 | bwd_inner_microstep: 243.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:09,538] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 184.26 | bwd_microstep: 244.34 | bwd_inner_microstep: 244.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:25:09,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 243.50 | bwd_inner_microstep: 243.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:25:10,398] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.61 | bwd_microstep: 242.75 | bwd_inner_microstep: 242.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:25:10,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.13 | optimizer_gradients: 0.63 | optimizer_step: 3.11 +[2024-12-31 18:25:10,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.34 | bwd_microstep: 284.89 | bwd_inner_microstep: 245.47 | bwd_allreduce_microstep: 39.37 | step_microstep: 10.76 +[2024-12-31 18:25:10,883] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2835.95 | bwd: 4149.88 | bwd_inner: 4109.77 | bwd_allreduce: 39.59 | step: 13.57 + 70%|██████▉ | 531/759 [1:15:05<28:00, 7.37s/it] {'loss': 1.2755, 'learning_rate': 4.37351894526009e-06, 'epoch': 0.7} + 70%|██████▉ | 531/759 [1:15:05<28:00, 7.37s/it][2024-12-31 18:25:11,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 218.62 | bwd_microstep: 355.72 | bwd_inner_microstep: 355.35 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.21 +[2024-12-31 18:25:12,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 246.15 | bwd_microstep: 406.73 | bwd_inner_microstep: 406.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:12,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.23 | bwd_microstep: 267.60 | bwd_inner_microstep: 267.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:13,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.80 | bwd_microstep: 263.62 | bwd_inner_microstep: 263.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:25:13,562] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.29 | bwd_microstep: 254.60 | bwd_inner_microstep: 254.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:13,996] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.97 | bwd_microstep: 245.85 | bwd_inner_microstep: 245.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:14,431] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.89 | bwd_microstep: 247.86 | bwd_inner_microstep: 247.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:14,866] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.34 | bwd_microstep: 246.87 | bwd_inner_microstep: 246.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:15,300] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.34 | bwd_microstep: 246.26 | bwd_inner_microstep: 246.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:15,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 245.14 | bwd_inner_microstep: 245.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:25:16,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.11 | bwd_microstep: 247.24 | bwd_inner_microstep: 247.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:16,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.87 | bwd_microstep: 242.81 | bwd_inner_microstep: 242.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:25:17,024] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.31 | bwd_microstep: 243.26 | bwd_inner_microstep: 243.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:17,453] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.00 | bwd_microstep: 240.88 | bwd_inner_microstep: 240.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.29 +[2024-12-31 18:25:17,882] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.52 | bwd_microstep: 241.52 | bwd_inner_microstep: 241.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:25:18,336] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.94 | optimizer_gradients: 0.68 | optimizer_step: 3.42 +[2024-12-31 18:25:18,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 257.56 | bwd_inner_microstep: 243.85 | bwd_allreduce_microstep: 13.60 | step_microstep: 11.73 +[2024-12-31 18:25:18,339] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2872.32 | bwd: 4253.59 | bwd_inner: 4239.10 | bwd_allreduce: 13.88 | step: 14.86 + 70%|███████ | 532/759 [1:15:13<27:59, 7.40s/it] {'loss': 1.2603, 'learning_rate': 4.338283031045567e-06, 'epoch': 0.7} + 70%|███████ | 532/759 [1:15:13<27:59, 7.40s/it][2024-12-31 18:25:18,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.44 | bwd_microstep: 298.67 | bwd_inner_microstep: 298.33 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:25:19,331] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.24 | bwd_microstep: 281.46 | bwd_inner_microstep: 281.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:25:19,792] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.23 | bwd_microstep: 263.65 | bwd_inner_microstep: 263.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:25:20,247] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.69 | bwd_microstep: 261.86 | bwd_inner_microstep: 261.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:25:20,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.70 | bwd_microstep: 254.59 | bwd_inner_microstep: 254.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:25:21,136] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.41 | bwd_microstep: 251.20 | bwd_inner_microstep: 251.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:25:21,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.68 | bwd_microstep: 248.08 | bwd_inner_microstep: 248.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:25:22,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.67 | bwd_microstep: 245.91 | bwd_inner_microstep: 245.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:25:22,443] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.14 | bwd_microstep: 247.65 | bwd_inner_microstep: 247.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:22,880] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.00 | bwd_microstep: 247.17 | bwd_inner_microstep: 247.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:25:23,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.04 | bwd_microstep: 245.06 | bwd_inner_microstep: 245.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.28 +[2024-12-31 18:25:23,747] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.23 | bwd_microstep: 245.28 | bwd_inner_microstep: 245.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:25:24,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.99 | bwd_microstep: 243.35 | bwd_inner_microstep: 243.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:24,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.45 | bwd_microstep: 241.39 | bwd_inner_microstep: 241.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:25:25,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.04 | bwd_microstep: 245.25 | bwd_inner_microstep: 245.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:25,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.89 | optimizer_gradients: 0.72 | optimizer_step: 3.16 +[2024-12-31 18:25:25,610] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.29 | bwd_microstep: 372.54 | bwd_inner_microstep: 243.25 | bwd_allreduce_microstep: 129.25 | step_microstep: 10.70 +[2024-12-31 18:25:25,611] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2787.67 | bwd: 4193.19 | bwd_inner: 4063.13 | bwd_allreduce: 129.50 | step: 13.82 + 70%|███████ | 533/759 [1:15:20<27:43, 7.36s/it] {'loss': 1.2462, 'learning_rate': 4.303150272146706e-06, 'epoch': 0.7} + 70%|███████ | 533/759 [1:15:20<27:43, 7.36s/it][2024-12-31 18:25:26,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 250.32 | bwd_microstep: 314.60 | bwd_inner_microstep: 314.25 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.26 +[2024-12-31 18:25:26,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 195.96 | bwd_microstep: 296.79 | bwd_inner_microstep: 296.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:25:27,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.59 | bwd_microstep: 268.03 | bwd_inner_microstep: 268.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:27,632] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.37 | bwd_microstep: 262.30 | bwd_inner_microstep: 262.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:28,083] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.83 | bwd_microstep: 255.37 | bwd_inner_microstep: 255.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:25:28,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.60 | bwd_microstep: 255.68 | bwd_inner_microstep: 255.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:28,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.62 | bwd_microstep: 247.51 | bwd_inner_microstep: 247.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:29,412] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.97 | bwd_microstep: 247.73 | bwd_inner_microstep: 247.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:25:29,850] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.98 | bwd_microstep: 249.51 | bwd_inner_microstep: 249.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:30,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.13 | bwd_microstep: 245.47 | bwd_inner_microstep: 245.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:30,726] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.08 | bwd_microstep: 252.99 | bwd_inner_microstep: 252.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.10 +[2024-12-31 18:25:31,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.96 | bwd_microstep: 243.34 | bwd_inner_microstep: 243.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:31,583] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.34 | bwd_microstep: 242.66 | bwd_inner_microstep: 242.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:25:32,012] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.79 | bwd_microstep: 245.24 | bwd_inner_microstep: 245.08 | bwd_allreduce_microstep: 0.02 | step_microstep: 0.22 +[2024-12-31 18:25:32,445] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 244.33 | bwd_inner_microstep: 244.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:25:32,984] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.37 | optimizer_gradients: 0.66 | optimizer_step: 3.15 +[2024-12-31 18:25:32,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.15 | bwd_microstep: 341.65 | bwd_inner_microstep: 253.01 | bwd_allreduce_microstep: 88.60 | step_microstep: 11.27 +[2024-12-31 18:25:32,986] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2869.57 | bwd: 4213.30 | bwd_inner: 4123.76 | bwd_allreduce: 88.91 | step: 14.15 + 70%|███████ | 534/759 [1:15:27<27:36, 7.36s/it] {'loss': 1.2025, 'learning_rate': 4.268121308675132e-06, 'epoch': 0.7} + 70%|███████ | 534/759 [1:15:27<27:36, 7.36s/it][2024-12-31 18:25:33,794] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 289.80 | bwd_microstep: 502.22 | bwd_inner_microstep: 501.85 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:25:34,331] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 204.12 | bwd_microstep: 314.90 | bwd_inner_microstep: 314.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:34,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.22 | bwd_microstep: 281.36 | bwd_inner_microstep: 281.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:25:35,281] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.90 | bwd_microstep: 267.32 | bwd_inner_microstep: 267.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:25:35,722] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.37 | bwd_microstep: 250.45 | bwd_inner_microstep: 250.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:25:36,158] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.27 | bwd_microstep: 247.96 | bwd_inner_microstep: 247.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:36,613] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.09 | bwd_microstep: 267.79 | bwd_inner_microstep: 267.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:25:37,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.82 | bwd_microstep: 246.35 | bwd_inner_microstep: 246.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:25:37,486] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.99 | bwd_microstep: 245.67 | bwd_inner_microstep: 245.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:37,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.73 | bwd_microstep: 253.00 | bwd_inner_microstep: 252.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:38,362] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.87 | bwd_microstep: 246.39 | bwd_inner_microstep: 246.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:38,794] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.19 | bwd_microstep: 243.88 | bwd_inner_microstep: 243.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:39,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.68 | bwd_microstep: 243.96 | bwd_inner_microstep: 243.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:39,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.25 | bwd_microstep: 243.33 | bwd_inner_microstep: 243.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:25:40,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.45 | bwd_microstep: 244.16 | bwd_inner_microstep: 244.02 | bwd_allreduce_microstep: 0.03 | step_microstep: 0.27 +[2024-12-31 18:25:40,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.48 | optimizer_gradients: 0.63 | optimizer_step: 3.32 +[2024-12-31 18:25:40,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 265.98 | bwd_inner_microstep: 252.35 | bwd_allreduce_microstep: 13.53 | step_microstep: 11.56 +[2024-12-31 18:25:40,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2923.92 | bwd: 4364.86 | bwd_inner: 4350.26 | bwd_allreduce: 13.87 | step: 14.72 + 70%|███████ | 535/759 [1:15:35<27:44, 7.43s/it] {'loss': 1.231, 'learning_rate': 4.2331967788513295e-06, 'epoch': 0.7} + 70%|███████ | 535/759 [1:15:35<27:44, 7.43s/it][2024-12-31 18:25:41,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.61 | bwd_microstep: 336.64 | bwd_inner_microstep: 336.28 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 18:25:41,753] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 224.31 | bwd_microstep: 371.49 | bwd_inner_microstep: 371.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:25:42,244] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.99 | bwd_microstep: 283.71 | bwd_inner_microstep: 283.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:25:42,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.44 | bwd_microstep: 266.59 | bwd_inner_microstep: 266.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:25:43,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.03 | bwd_microstep: 250.11 | bwd_inner_microstep: 250.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:25:43,588] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.41 | bwd_microstep: 248.81 | bwd_inner_microstep: 248.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:44,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.67 | bwd_microstep: 263.19 | bwd_inner_microstep: 263.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:44,478] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.87 | bwd_microstep: 248.00 | bwd_inner_microstep: 247.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:25:44,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.06 | bwd_microstep: 246.19 | bwd_inner_microstep: 246.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:45,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.71 | bwd_microstep: 245.89 | bwd_inner_microstep: 245.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:25:45,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 249.62 | bwd_inner_microstep: 249.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:46,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 244.66 | bwd_inner_microstep: 244.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:25:46,647] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 244.42 | bwd_inner_microstep: 244.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:47,073] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.13 | bwd_microstep: 241.20 | bwd_inner_microstep: 241.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:47,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.53 | bwd_microstep: 243.75 | bwd_inner_microstep: 243.36 | bwd_allreduce_microstep: 0.26 | step_microstep: 0.27 +[2024-12-31 18:25:48,093] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.08 | optimizer_gradients: 0.64 | optimizer_step: 3.13 +[2024-12-31 18:25:48,093] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.65 | bwd_microstep: 390.09 | bwd_inner_microstep: 242.49 | bwd_allreduce_microstep: 147.56 | step_microstep: 11.00 +[2024-12-31 18:25:48,094] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2851.68 | bwd: 4374.47 | bwd_inner: 4225.68 | bwd_allreduce: 148.08 | step: 14.08 + 71%|███████ | 536/759 [1:15:43<27:43, 7.46s/it] {'loss': 1.221, 'learning_rate': 4.198377318993035e-06, 'epoch': 0.71} + 71%|███████ | 536/759 [1:15:43<27:43, 7.46s/it][2024-12-31 18:25:48,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 221.36 | bwd_microstep: 352.04 | bwd_inner_microstep: 351.68 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.24 +[2024-12-31 18:25:49,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 227.87 | bwd_microstep: 365.66 | bwd_inner_microstep: 365.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:25:49,771] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.68 | bwd_microstep: 266.63 | bwd_inner_microstep: 266.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:25:50,269] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.45 | bwd_microstep: 303.11 | bwd_inner_microstep: 303.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:25:50,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.02 | bwd_microstep: 249.95 | bwd_inner_microstep: 249.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:51,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 249.74 | bwd_inner_microstep: 249.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:25:51,586] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.40 | bwd_microstep: 247.59 | bwd_inner_microstep: 247.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:25:52,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.98 | bwd_microstep: 269.99 | bwd_inner_microstep: 269.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:25:52,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.17 | bwd_microstep: 243.48 | bwd_inner_microstep: 243.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:25:52,909] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.27 | bwd_microstep: 244.18 | bwd_inner_microstep: 244.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:25:53,342] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 244.74 | bwd_inner_microstep: 244.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:53,777] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.00 | bwd_microstep: 242.85 | bwd_inner_microstep: 242.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:25:54,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.49 | bwd_microstep: 248.84 | bwd_inner_microstep: 248.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:54,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 199.63 | bwd_microstep: 243.28 | bwd_inner_microstep: 243.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:25:55,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 245.14 | bwd_inner_microstep: 245.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:25:55,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.58 | optimizer_gradients: 0.77 | optimizer_step: 3.37 +[2024-12-31 18:25:55,558] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.11 | bwd_microstep: 256.08 | bwd_inner_microstep: 242.09 | bwd_allreduce_microstep: 13.87 | step_microstep: 11.20 +[2024-12-31 18:25:55,560] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2899.07 | bwd: 4273.40 | bwd_inner: 4258.64 | bwd_allreduce: 14.15 | step: 14.28 + 71%|███████ | 537/759 [1:15:50<27:36, 7.46s/it] {'loss': 1.2132, 'learning_rate': 4.1636635635036235e-06, 'epoch': 0.71} + 71%|███████ | 537/759 [1:15:50<27:36, 7.46s/it][2024-12-31 18:25:56,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 198.75 | bwd_microstep: 304.80 | bwd_inner_microstep: 304.45 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:25:56,545] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.57 | bwd_microstep: 267.05 | bwd_inner_microstep: 267.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:25:57,036] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.85 | bwd_microstep: 284.91 | bwd_inner_microstep: 284.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:25:57,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.55 | bwd_microstep: 263.68 | bwd_inner_microstep: 263.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:25:57,950] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.72 | bwd_microstep: 258.23 | bwd_inner_microstep: 258.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:25:58,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.09 | bwd_microstep: 264.14 | bwd_inner_microstep: 264.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:25:58,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.69 | bwd_microstep: 246.46 | bwd_inner_microstep: 246.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:25:59,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.82 | bwd_microstep: 246.82 | bwd_inner_microstep: 246.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:25:59,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.20 | bwd_microstep: 245.64 | bwd_inner_microstep: 245.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:26:00,132] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.20 | bwd_microstep: 245.62 | bwd_inner_microstep: 245.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:26:00,567] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.83 | bwd_microstep: 245.56 | bwd_inner_microstep: 245.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:26:01,012] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 257.43 | bwd_inner_microstep: 257.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:01,441] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.00 | bwd_microstep: 242.25 | bwd_inner_microstep: 242.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:01,873] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.17 | bwd_microstep: 244.59 | bwd_inner_microstep: 244.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:26:02,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.52 | bwd_microstep: 252.51 | bwd_inner_microstep: 252.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:26:02,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.82 | optimizer_gradients: 0.67 | optimizer_step: 3.49 +[2024-12-31 18:26:02,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.74 | bwd_microstep: 256.20 | bwd_inner_microstep: 242.50 | bwd_allreduce_microstep: 13.58 | step_microstep: 11.83 +[2024-12-31 18:26:02,778] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2804.39 | bwd: 4126.03 | bwd_inner: 4111.43 | bwd_allreduce: 13.91 | step: 14.70 + 71%|███████ | 538/759 [1:15:57<27:12, 7.39s/it] {'loss': 1.2579, 'learning_rate': 4.129056144860567e-06, 'epoch': 0.71} + 71%|███████ | 538/759 [1:15:57<27:12, 7.39s/it][2024-12-31 18:26:03,371] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 222.74 | bwd_microstep: 355.05 | bwd_inner_microstep: 354.67 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:26:03,868] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.46 | bwd_microstep: 287.90 | bwd_inner_microstep: 287.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:04,383] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.45 | bwd_microstep: 309.27 | bwd_inner_microstep: 309.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:26:04,839] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.22 | bwd_microstep: 261.98 | bwd_inner_microstep: 261.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:26:05,280] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.13 | bwd_microstep: 250.95 | bwd_inner_microstep: 250.76 | bwd_allreduce_microstep: 0.02 | step_microstep: 0.30 +[2024-12-31 18:26:05,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.80 | bwd_microstep: 254.54 | bwd_inner_microstep: 254.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:06,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.62 | bwd_microstep: 247.46 | bwd_inner_microstep: 247.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:06,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.97 | bwd_microstep: 243.88 | bwd_inner_microstep: 243.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:07,033] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.08 | bwd_microstep: 246.94 | bwd_inner_microstep: 246.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:26:07,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.47 | bwd_microstep: 252.78 | bwd_inner_microstep: 252.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:26:07,908] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.43 | bwd_microstep: 247.14 | bwd_inner_microstep: 246.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:26:08,340] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 244.48 | bwd_inner_microstep: 244.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:26:08,768] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.31 | bwd_microstep: 242.78 | bwd_inner_microstep: 242.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:09,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.42 | bwd_microstep: 254.41 | bwd_inner_microstep: 254.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:26:09,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.30 | bwd_microstep: 243.87 | bwd_inner_microstep: 243.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:26:10,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.71 | optimizer_gradients: 0.66 | optimizer_step: 3.34 +[2024-12-31 18:26:10,109] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.36 | bwd_microstep: 258.37 | bwd_inner_microstep: 244.73 | bwd_allreduce_microstep: 13.55 | step_microstep: 11.34 +[2024-12-31 18:26:10,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2841.24 | bwd: 4202.12 | bwd_inner: 4187.15 | bwd_allreduce: 13.96 | step: 14.43 + 71%|███████ | 539/759 [1:16:05<27:01, 7.37s/it] {'loss': 1.2362, 'learning_rate': 4.094555693603891e-06, 'epoch': 0.71} + 71%|███████ | 539/759 [1:16:05<27:01, 7.37s/it][2024-12-31 18:26:10,725] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 228.45 | bwd_microstep: 368.16 | bwd_inner_microstep: 367.82 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:26:11,283] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.36 | bwd_microstep: 342.83 | bwd_inner_microstep: 342.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:11,778] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.83 | bwd_microstep: 286.22 | bwd_inner_microstep: 286.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:26:12,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.27 | bwd_microstep: 263.25 | bwd_inner_microstep: 263.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:26:12,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.39 | bwd_microstep: 262.56 | bwd_inner_microstep: 262.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:26:13,129] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 249.08 | bwd_inner_microstep: 249.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:26:13,563] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.46 | bwd_microstep: 247.53 | bwd_inner_microstep: 247.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:26:13,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.17 | bwd_microstep: 244.06 | bwd_inner_microstep: 244.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:26:14,433] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.53 | bwd_microstep: 244.83 | bwd_inner_microstep: 244.41 | bwd_allreduce_microstep: 0.24 | step_microstep: 0.37 +[2024-12-31 18:26:14,866] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.32 | bwd_microstep: 245.10 | bwd_inner_microstep: 245.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:26:15,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.69 | bwd_microstep: 242.43 | bwd_inner_microstep: 242.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:26:15,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.82 | bwd_microstep: 246.97 | bwd_inner_microstep: 246.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:26:16,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.49 | bwd_microstep: 243.09 | bwd_inner_microstep: 243.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:26:16,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.45 | bwd_microstep: 253.94 | bwd_inner_microstep: 253.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:17,022] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.11 | bwd_microstep: 242.90 | bwd_inner_microstep: 242.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:26:17,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.72 | optimizer_gradients: 0.64 | optimizer_step: 3.09 +[2024-12-31 18:26:17,621] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.61 | bwd_microstep: 400.33 | bwd_inner_microstep: 241.86 | bwd_allreduce_microstep: 158.43 | step_microstep: 13.82 +[2024-12-31 18:26:17,622] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2847.90 | bwd: 4383.48 | bwd_inner: 4223.64 | bwd_allreduce: 158.97 | step: 16.44 + 71%|███████ | 540/759 [1:16:12<27:03, 7.41s/it] {'loss': 1.2067, 'learning_rate': 4.060162838324708e-06, 'epoch': 0.71} + 71%|███████ | 540/759 [1:16:12<27:03, 7.41s/it][2024-12-31 18:26:18,216] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 223.04 | bwd_microstep: 357.82 | bwd_inner_microstep: 357.49 | bwd_allreduce_microstep: 0.12 | step_microstep: 0.20 +[2024-12-31 18:26:18,712] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.47 | bwd_microstep: 287.73 | bwd_inner_microstep: 287.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:26:19,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.20 | bwd_microstep: 281.86 | bwd_inner_microstep: 281.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:19,657] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.68 | bwd_microstep: 258.91 | bwd_inner_microstep: 258.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:20,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.53 | bwd_microstep: 254.87 | bwd_inner_microstep: 254.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:26:20,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.97 | bwd_microstep: 246.94 | bwd_inner_microstep: 246.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:26:20,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.03 | bwd_microstep: 248.34 | bwd_inner_microstep: 248.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:26:21,432] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.20 | bwd_microstep: 248.60 | bwd_inner_microstep: 248.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:26:21,867] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.83 | bwd_microstep: 245.89 | bwd_inner_microstep: 245.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:22,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.37 | bwd_microstep: 244.46 | bwd_inner_microstep: 244.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:26:22,740] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.27 | bwd_microstep: 246.03 | bwd_inner_microstep: 246.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:23,177] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.10 | bwd_microstep: 243.61 | bwd_inner_microstep: 243.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:26:23,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.29 | bwd_microstep: 244.33 | bwd_inner_microstep: 244.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.11 +[2024-12-31 18:26:24,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.83 | bwd_microstep: 245.81 | bwd_inner_microstep: 245.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:26:24,461] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.09 | bwd_microstep: 241.00 | bwd_inner_microstep: 240.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:24,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.98 | optimizer_gradients: 0.64 | optimizer_step: 3.29 +[2024-12-31 18:26:24,913] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.65 | bwd_microstep: 255.12 | bwd_inner_microstep: 241.40 | bwd_allreduce_microstep: 13.60 | step_microstep: 11.70 +[2024-12-31 18:26:24,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2867.01 | bwd: 4151.42 | bwd_inner: 4136.96 | bwd_allreduce: 13.88 | step: 14.38 + 71%|███████▏ | 541/759 [1:16:19<26:48, 7.38s/it] {'loss': 1.2212, 'learning_rate': 4.025878205653747e-06, 'epoch': 0.71} + 71%|███████▏ | 541/759 [1:16:19<26:48, 7.38s/it][2024-12-31 18:26:25,483] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.76 | bwd_microstep: 340.24 | bwd_inner_microstep: 339.86 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.21 +[2024-12-31 18:26:26,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 199.34 | bwd_microstep: 304.48 | bwd_inner_microstep: 304.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:26,480] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.59 | bwd_microstep: 275.64 | bwd_inner_microstep: 275.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:26,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.68 | bwd_microstep: 256.51 | bwd_inner_microstep: 256.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:26:27,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.34 | bwd_microstep: 259.47 | bwd_inner_microstep: 259.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.09 +[2024-12-31 18:26:27,815] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.76 | bwd_microstep: 248.35 | bwd_inner_microstep: 248.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:26:28,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.24 | bwd_microstep: 245.93 | bwd_inner_microstep: 245.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:28,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 245.92 | bwd_inner_microstep: 245.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:26:29,120] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.03 | bwd_microstep: 243.05 | bwd_inner_microstep: 243.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:29,558] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.41 | bwd_microstep: 243.08 | bwd_inner_microstep: 243.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:26:29,990] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.08 | bwd_microstep: 242.61 | bwd_inner_microstep: 242.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:30,419] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.03 | bwd_microstep: 241.48 | bwd_inner_microstep: 241.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:26:30,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.21 | bwd_microstep: 243.42 | bwd_inner_microstep: 243.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:31,277] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.63 | bwd_microstep: 242.47 | bwd_inner_microstep: 242.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:31,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.88 | bwd_microstep: 240.88 | bwd_inner_microstep: 240.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:32,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.29 | optimizer_gradients: 0.61 | optimizer_step: 5.83 +[2024-12-31 18:26:32,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.28 | bwd_microstep: 331.50 | bwd_inner_microstep: 244.24 | bwd_allreduce_microstep: 87.21 | step_microstep: 15.33 +[2024-12-31 18:26:32,236] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2818.81 | bwd: 4205.07 | bwd_inner: 4117.09 | bwd_allreduce: 87.46 | step: 18.25 + 71%|███████▏ | 542/759 [1:16:27<26:37, 7.36s/it] {'loss': 1.2176, 'learning_rate': 3.991702420249941e-06, 'epoch': 0.71} + 71%|███████▏ | 542/759 [1:16:27<26:37, 7.36s/it][2024-12-31 18:26:32,866] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 263.17 | bwd_microstep: 311.92 | bwd_inner_microstep: 311.58 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:26:33,463] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 223.83 | bwd_microstep: 357.18 | bwd_inner_microstep: 357.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:26:33,923] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.95 | bwd_microstep: 262.99 | bwd_inner_microstep: 262.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:26:34,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.20 | bwd_microstep: 257.85 | bwd_inner_microstep: 257.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:26:34,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.30 | bwd_microstep: 250.95 | bwd_inner_microstep: 250.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:35,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.79 | bwd_microstep: 248.18 | bwd_inner_microstep: 248.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:26:35,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.11 | bwd_microstep: 247.70 | bwd_inner_microstep: 247.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:36,158] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.28 | bwd_microstep: 247.25 | bwd_inner_microstep: 247.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:26:36,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.39 | bwd_microstep: 245.05 | bwd_inner_microstep: 245.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:37,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.90 | bwd_microstep: 244.75 | bwd_inner_microstep: 244.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:37,457] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.32 | bwd_microstep: 244.64 | bwd_inner_microstep: 244.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:37,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.19 | bwd_microstep: 243.38 | bwd_inner_microstep: 243.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:38,325] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.25 | bwd_microstep: 243.25 | bwd_inner_microstep: 243.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:38,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.68 | bwd_microstep: 251.89 | bwd_inner_microstep: 251.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:26:39,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.35 | bwd_microstep: 243.67 | bwd_inner_microstep: 243.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:39,669] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.06 | optimizer_gradients: 0.56 | optimizer_step: 3.12 +[2024-12-31 18:26:39,670] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 157.64 | bwd_microstep: 292.53 | bwd_inner_microstep: 227.29 | bwd_allreduce_microstep: 65.19 | step_microstep: 11.41 +[2024-12-31 18:26:39,671] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2910.70 | bwd: 4193.25 | bwd_inner: 4127.28 | bwd_allreduce: 65.44 | step: 14.28 + 72%|███████▏ | 543/759 [1:16:34<26:34, 7.38s/it] {'loss': 1.2189, 'learning_rate': 3.957636104789056e-06, 'epoch': 0.72} + 72%|███████▏ | 543/759 [1:16:34<26:34, 7.38s/it][2024-12-31 18:26:40,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 212.31 | bwd_microstep: 334.52 | bwd_inner_microstep: 334.18 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:26:40,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.21 | bwd_microstep: 309.25 | bwd_inner_microstep: 309.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:26:41,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.49 | bwd_microstep: 286.92 | bwd_inner_microstep: 286.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:41,727] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.44 | bwd_microstep: 268.65 | bwd_inner_microstep: 268.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:26:42,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.47 | bwd_microstep: 267.30 | bwd_inner_microstep: 267.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:26:42,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.76 | bwd_microstep: 258.08 | bwd_inner_microstep: 258.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:26:43,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 249.38 | bwd_inner_microstep: 249.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:43,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.97 | bwd_microstep: 251.71 | bwd_inner_microstep: 251.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:26:43,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.33 | bwd_microstep: 247.67 | bwd_inner_microstep: 247.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:26:44,400] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.10 | bwd_microstep: 246.62 | bwd_inner_microstep: 246.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:26:44,835] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.42 | bwd_microstep: 245.14 | bwd_inner_microstep: 245.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:26:45,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.83 | bwd_microstep: 243.71 | bwd_inner_microstep: 243.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:26:45,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.73 | bwd_microstep: 243.88 | bwd_inner_microstep: 243.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:26:46,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.31 | bwd_microstep: 243.09 | bwd_inner_microstep: 243.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:46,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.11 | bwd_microstep: 244.45 | bwd_inner_microstep: 244.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:26:47,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.05 | optimizer_gradients: 1.44 | optimizer_step: 3.10 +[2024-12-31 18:26:47,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.00 | bwd_microstep: 391.30 | bwd_inner_microstep: 245.02 | bwd_allreduce_microstep: 146.23 | step_microstep: 12.84 +[2024-12-31 18:26:47,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2867.40 | bwd: 4331.82 | bwd_inner: 4184.75 | bwd_allreduce: 146.47 | step: 15.88 + 72%|███████▏ | 544/759 [1:16:42<26:34, 7.42s/it] {'loss': 1.204, 'learning_rate': 3.9236798799523375e-06, 'epoch': 0.72} + 72%|███████▏ | 544/759 [1:16:42<26:34, 7.42s/it][2024-12-31 18:26:47,739] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 219.21 | bwd_microstep: 343.14 | bwd_inner_microstep: 342.77 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:26:48,237] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.89 | bwd_microstep: 287.37 | bwd_inner_microstep: 287.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:48,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.32 | bwd_microstep: 263.24 | bwd_inner_microstep: 263.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:26:49,187] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.31 | bwd_microstep: 295.39 | bwd_inner_microstep: 295.18 | bwd_allreduce_microstep: 0.11 | step_microstep: 0.19 +[2024-12-31 18:26:49,629] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.62 | bwd_microstep: 250.70 | bwd_inner_microstep: 250.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:26:50,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.64 | bwd_microstep: 249.23 | bwd_inner_microstep: 249.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:26:50,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.11 | bwd_microstep: 248.69 | bwd_inner_microstep: 248.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:50,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.82 | bwd_microstep: 246.43 | bwd_inner_microstep: 246.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:26:51,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.77 | bwd_microstep: 246.41 | bwd_inner_microstep: 246.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:51,811] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.44 | bwd_microstep: 247.05 | bwd_inner_microstep: 247.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:26:52,245] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.05 | bwd_microstep: 246.41 | bwd_inner_microstep: 246.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:26:52,686] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.88 | bwd_microstep: 245.42 | bwd_inner_microstep: 245.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:26:53,117] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.62 | bwd_microstep: 243.82 | bwd_inner_microstep: 243.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:26:53,552] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.02 | bwd_microstep: 247.53 | bwd_inner_microstep: 247.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:26:53,998] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 257.86 | bwd_inner_microstep: 257.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:26:54,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.63 | optimizer_gradients: 0.68 | optimizer_step: 3.33 +[2024-12-31 18:26:54,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 158.19 | bwd_microstep: 239.20 | bwd_inner_microstep: 225.52 | bwd_allreduce_microstep: 13.57 | step_microstep: 11.47 +[2024-12-31 18:26:54,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2815.07 | bwd: 4158.08 | bwd_inner: 4143.32 | bwd_allreduce: 13.99 | step: 14.52 + 72%|███████▏ | 545/759 [1:16:49<26:17, 7.37s/it] {'loss': 1.2195, 'learning_rate': 3.8898343644151945e-06, 'epoch': 0.72} + 72%|███████▏ | 545/759 [1:16:49<26:17, 7.37s/it][2024-12-31 18:26:55,017] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 220.64 | bwd_microstep: 352.13 | bwd_inner_microstep: 351.76 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 18:26:55,529] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 195.28 | bwd_microstep: 299.02 | bwd_inner_microstep: 298.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:26:56,015] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.65 | bwd_microstep: 292.09 | bwd_inner_microstep: 292.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:26:56,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.92 | bwd_microstep: 262.32 | bwd_inner_microstep: 262.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:26:56,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.22 | bwd_microstep: 259.27 | bwd_inner_microstep: 259.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:26:57,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.31 | bwd_microstep: 250.24 | bwd_inner_microstep: 250.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:57,796] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.30 | bwd_microstep: 249.29 | bwd_inner_microstep: 249.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:26:58,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.67 | bwd_microstep: 246.07 | bwd_inner_microstep: 246.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:26:58,666] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.50 | bwd_microstep: 246.17 | bwd_inner_microstep: 246.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:26:59,092] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.94 | bwd_microstep: 244.66 | bwd_inner_microstep: 244.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:26:59,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.29 | bwd_microstep: 245.32 | bwd_inner_microstep: 245.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:26:59,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 288.02 | bwd_inner_microstep: 288.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:00,431] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.91 | bwd_microstep: 244.80 | bwd_inner_microstep: 244.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:27:00,858] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.45 | bwd_microstep: 246.53 | bwd_inner_microstep: 246.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:27:01,285] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.26 | bwd_microstep: 241.75 | bwd_inner_microstep: 241.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:02,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.99 | optimizer_gradients: 0.79 | optimizer_step: 3.09 +[2024-12-31 18:27:02,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.24 | bwd_microstep: 519.62 | bwd_inner_microstep: 242.11 | bwd_allreduce_microstep: 277.47 | step_microstep: 11.18 +[2024-12-31 18:27:02,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2816.14 | bwd: 4487.38 | bwd_inner: 4208.97 | bwd_allreduce: 277.79 | step: 13.93 + 72%|███████▏ | 546/759 [1:16:56<26:22, 7.43s/it] {'loss': 1.2191, 'learning_rate': 3.856100174835945e-06, 'epoch': 0.72} + 72%|███████▏ | 546/759 [1:16:56<26:22, 7.43s/it][2024-12-31 18:27:02,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.75 | bwd_microstep: 291.38 | bwd_inner_microstep: 291.02 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.25 +[2024-12-31 18:27:02,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.91 | bwd_microstep: 282.95 | bwd_inner_microstep: 282.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:27:03,473] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.32 | bwd_microstep: 281.33 | bwd_inner_microstep: 281.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:04,252] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 274.35 | bwd_microstep: 487.16 | bwd_inner_microstep: 487.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:27:04,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.35 | bwd_microstep: 257.18 | bwd_inner_microstep: 257.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:27:05,146] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.09 | bwd_microstep: 250.92 | bwd_inner_microstep: 250.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:27:05,580] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 245.87 | bwd_inner_microstep: 245.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:27:06,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.77 | bwd_microstep: 276.65 | bwd_inner_microstep: 276.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:27:06,483] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.29 | bwd_microstep: 248.35 | bwd_inner_microstep: 248.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:06,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.60 | bwd_microstep: 244.37 | bwd_inner_microstep: 244.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:27:07,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.10 | bwd_microstep: 250.27 | bwd_inner_microstep: 249.24 | bwd_allreduce_microstep: 0.46 | step_microstep: 0.28 +[2024-12-31 18:27:07,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.67 | bwd_microstep: 281.71 | bwd_inner_microstep: 281.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:27:08,247] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.65 | bwd_microstep: 243.69 | bwd_inner_microstep: 243.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:08,680] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.73 | bwd_microstep: 245.32 | bwd_inner_microstep: 245.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:27:09,109] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.68 | bwd_microstep: 242.78 | bwd_inner_microstep: 242.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:27:09,563] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 1.52 | optimizer_step: 3.62 +[2024-12-31 18:27:09,564] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.00 | bwd_microstep: 255.67 | bwd_inner_microstep: 241.56 | bwd_allreduce_microstep: 14.00 | step_microstep: 14.21 +[2024-12-31 18:27:09,566] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2898.17 | bwd: 4385.91 | bwd_inner: 4370.03 | bwd_allreduce: 14.75 | step: 17.20 + 72%|███████▏ | 547/759 [1:17:04<26:24, 7.47s/it] {'loss': 1.2254, 'learning_rate': 3.822477925844564e-06, 'epoch': 0.72} + 72%|███████▏ | 547/759 [1:17:04<26:24, 7.47s/it][2024-12-31 18:27:10,147] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.88 | bwd_microstep: 343.30 | bwd_inner_microstep: 342.95 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:27:10,825] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 247.76 | bwd_microstep: 412.91 | bwd_inner_microstep: 412.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:27:11,320] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.49 | bwd_microstep: 286.63 | bwd_inner_microstep: 286.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:11,779] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.74 | bwd_microstep: 262.78 | bwd_inner_microstep: 262.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:27:12,257] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 199.28 | bwd_microstep: 261.09 | bwd_inner_microstep: 261.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:27:12,715] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.50 | bwd_microstep: 261.03 | bwd_inner_microstep: 261.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:27:13,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.27 | bwd_microstep: 250.03 | bwd_inner_microstep: 250.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:27:13,600] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.32 | bwd_microstep: 248.54 | bwd_inner_microstep: 248.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:27:14,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.15 | bwd_microstep: 248.06 | bwd_inner_microstep: 248.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:14,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.44 | bwd_microstep: 245.99 | bwd_inner_microstep: 245.84 | bwd_allreduce_microstep: 0.05 | step_microstep: 0.21 +[2024-12-31 18:27:14,901] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.39 | bwd_microstep: 243.52 | bwd_inner_microstep: 243.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:15,331] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.98 | bwd_microstep: 243.37 | bwd_inner_microstep: 243.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:15,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.31 | bwd_microstep: 242.96 | bwd_inner_microstep: 242.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:16,188] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.80 | bwd_microstep: 241.20 | bwd_inner_microstep: 241.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:16,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.01 | bwd_microstep: 241.36 | bwd_inner_microstep: 241.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:17,120] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.63 | optimizer_gradients: 0.58 | optimizer_step: 3.17 +[2024-12-31 18:27:17,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.36 | bwd_microstep: 309.35 | bwd_inner_microstep: 242.27 | bwd_allreduce_microstep: 67.02 | step_microstep: 11.82 +[2024-12-31 18:27:17,122] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2908.10 | bwd: 4342.29 | bwd_inner: 4274.26 | bwd_allreduce: 67.37 | step: 14.88 + 72%|███████▏ | 548/759 [1:17:12<26:21, 7.50s/it] {'loss': 1.2103, 'learning_rate': 3.7889682300315e-06, 'epoch': 0.72} + 72%|███████▏ | 548/759 [1:17:12<26:21, 7.50s/it][2024-12-31 18:27:17,976] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 290.61 | bwd_microstep: 548.50 | bwd_inner_microstep: 548.14 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:27:18,445] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.96 | bwd_microstep: 267.99 | bwd_inner_microstep: 267.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:18,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.80 | bwd_microstep: 263.36 | bwd_inner_microstep: 263.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:19,366] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.59 | bwd_microstep: 262.50 | bwd_inner_microstep: 262.37 | bwd_allreduce_microstep: 0.03 | step_microstep: 0.21 +[2024-12-31 18:27:19,809] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.89 | bwd_microstep: 252.16 | bwd_inner_microstep: 252.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:20,246] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.60 | bwd_microstep: 248.97 | bwd_inner_microstep: 248.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:27:20,685] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.04 | bwd_microstep: 249.83 | bwd_inner_microstep: 249.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:27:21,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.22 | bwd_microstep: 247.97 | bwd_inner_microstep: 247.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:27:21,552] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 244.80 | bwd_inner_microstep: 244.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:27:21,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.40 | bwd_microstep: 245.34 | bwd_inner_microstep: 245.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:22,419] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 244.87 | bwd_inner_microstep: 244.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:27:22,851] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.59 | bwd_microstep: 244.60 | bwd_inner_microstep: 244.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:23,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.24 | bwd_microstep: 244.71 | bwd_inner_microstep: 244.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:23,714] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.12 | bwd_microstep: 243.66 | bwd_inner_microstep: 243.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:24,449] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 269.98 | bwd_microstep: 447.71 | bwd_inner_microstep: 447.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:27:24,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.58 | optimizer_gradients: 0.71 | optimizer_step: 3.56 +[2024-12-31 18:27:24,901] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.82 | bwd_microstep: 256.86 | bwd_inner_microstep: 243.05 | bwd_allreduce_microstep: 13.69 | step_microstep: 11.33 +[2024-12-31 18:27:24,904] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2971.20 | bwd: 4513.99 | bwd_inner: 4499.24 | bwd_allreduce: 14.04 | step: 14.37 + 72%|███████▏ | 549/759 [1:17:19<26:32, 7.58s/it] {'loss': 1.2557, 'learning_rate': 3.755571697936493e-06, 'epoch': 0.72} + 72%|███████▏ | 549/759 [1:17:19<26:32, 7.58s/it][2024-12-31 18:27:25,476] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.81 | bwd_microstep: 342.87 | bwd_inner_microstep: 342.37 | bwd_allreduce_microstep: 0.22 | step_microstep: 0.29 +[2024-12-31 18:27:26,012] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.09 | bwd_microstep: 314.29 | bwd_inner_microstep: 314.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:26,482] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.89 | bwd_microstep: 269.08 | bwd_inner_microstep: 269.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:26,950] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.59 | bwd_microstep: 269.33 | bwd_inner_microstep: 269.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:27:27,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.07 | bwd_microstep: 256.96 | bwd_inner_microstep: 256.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:27:27,851] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.71 | bwd_microstep: 255.53 | bwd_inner_microstep: 255.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:27:28,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.71 | bwd_microstep: 249.88 | bwd_inner_microstep: 249.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:28,728] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.41 | bwd_microstep: 248.05 | bwd_inner_microstep: 248.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:29,160] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 243.77 | bwd_inner_microstep: 243.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:27:29,645] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.09 | bwd_microstep: 295.03 | bwd_inner_microstep: 294.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:27:30,081] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.85 | bwd_microstep: 246.43 | bwd_inner_microstep: 246.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:30,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.28 | bwd_microstep: 256.48 | bwd_inner_microstep: 256.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:27:30,944] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.52 | bwd_microstep: 243.16 | bwd_inner_microstep: 243.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:31,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.57 | bwd_microstep: 243.83 | bwd_inner_microstep: 243.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:27:31,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.97 | bwd_microstep: 241.29 | bwd_inner_microstep: 241.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:32,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.32 | optimizer_gradients: 0.55 | optimizer_step: 3.11 +[2024-12-31 18:27:32,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.23 | bwd_microstep: 334.22 | bwd_inner_microstep: 245.72 | bwd_allreduce_microstep: 88.46 | step_microstep: 10.78 +[2024-12-31 18:27:32,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2840.35 | bwd: 4310.34 | bwd_inner: 4220.84 | bwd_allreduce: 88.79 | step: 13.76 + 72%|███████▏ | 550/759 [1:17:27<26:15, 7.54s/it] {'loss': 1.2157, 'learning_rate': 3.722288938037478e-06, 'epoch': 0.72} + 72%|███████▏ | 550/759 [1:17:27<26:15, 7.54s/it][2024-12-31 18:27:32,941] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 226.42 | bwd_microstep: 363.80 | bwd_inner_microstep: 363.43 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.22 +[2024-12-31 18:27:33,433] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.20 | bwd_microstep: 282.88 | bwd_inner_microstep: 282.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:27:33,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.10 | bwd_microstep: 266.96 | bwd_inner_microstep: 266.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:27:34,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.62 | bwd_microstep: 262.52 | bwd_inner_microstep: 262.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:27:34,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.04 | bwd_microstep: 255.76 | bwd_inner_microstep: 255.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:27:35,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.34 | bwd_microstep: 256.56 | bwd_inner_microstep: 256.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:27:35,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.02 | bwd_microstep: 248.44 | bwd_inner_microstep: 248.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:27:36,144] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.86 | bwd_microstep: 261.45 | bwd_inner_microstep: 261.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:27:36,567] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.56 | bwd_microstep: 244.64 | bwd_inner_microstep: 244.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:27:36,998] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.82 | bwd_microstep: 243.58 | bwd_inner_microstep: 243.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:37,433] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.09 | bwd_microstep: 244.46 | bwd_inner_microstep: 244.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:27:37,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.82 | bwd_microstep: 250.76 | bwd_inner_microstep: 250.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:27:38,298] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.08 | bwd_microstep: 242.52 | bwd_inner_microstep: 242.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:38,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.65 | bwd_microstep: 242.75 | bwd_inner_microstep: 242.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:27:39,129] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 158.08 | bwd_microstep: 224.83 | bwd_inner_microstep: 224.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:39,624] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.39 | optimizer_gradients: 0.57 | optimizer_step: 3.11 +[2024-12-31 18:27:39,624] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.66 | bwd_microstep: 297.92 | bwd_inner_microstep: 242.09 | bwd_allreduce_microstep: 55.79 | step_microstep: 11.73 +[2024-12-31 18:27:39,625] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2819.80 | bwd: 4189.89 | bwd_inner: 4133.32 | bwd_allreduce: 56.03 | step: 14.59 + 73%|███████▎ | 551/759 [1:17:34<25:52, 7.46s/it] {'loss': 1.2242, 'learning_rate': 3.689120556739475e-06, 'epoch': 0.73} + 73%|███████▎ | 551/759 [1:17:34<25:52, 7.46s/it][2024-12-31 18:27:40,245] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 229.92 | bwd_microstep: 370.35 | bwd_inner_microstep: 370.01 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:27:40,748] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.06 | bwd_microstep: 292.54 | bwd_inner_microstep: 292.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:27:41,243] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.40 | bwd_microstep: 287.55 | bwd_inner_microstep: 287.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:27:41,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.21 | bwd_microstep: 263.06 | bwd_inner_microstep: 263.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:27:42,130] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.55 | bwd_microstep: 247.37 | bwd_inner_microstep: 247.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:27:42,558] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.88 | bwd_microstep: 249.00 | bwd_inner_microstep: 248.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:27:43,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.27 | bwd_microstep: 248.81 | bwd_inner_microstep: 248.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:27:43,425] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 245.54 | bwd_inner_microstep: 245.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:27:43,860] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.76 | bwd_microstep: 251.80 | bwd_inner_microstep: 251.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:27:44,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 245.89 | bwd_inner_microstep: 245.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:27:44,724] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.19 | bwd_microstep: 243.99 | bwd_inner_microstep: 243.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:45,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.57 | bwd_microstep: 243.62 | bwd_inner_microstep: 243.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:27:45,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.88 | bwd_microstep: 241.07 | bwd_inner_microstep: 241.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:27:46,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.77 | bwd_microstep: 244.15 | bwd_inner_microstep: 244.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:46,433] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.82 | bwd_microstep: 240.99 | bwd_inner_microstep: 240.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:46,858] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.97 | optimizer_gradients: 0.63 | optimizer_step: 3.30 +[2024-12-31 18:27:46,859] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 158.11 | bwd_microstep: 239.97 | bwd_inner_microstep: 226.26 | bwd_allreduce_microstep: 13.60 | step_microstep: 11.34 +[2024-12-31 18:27:46,861] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2831.40 | bwd: 4155.78 | bwd_inner: 4141.32 | bwd_allreduce: 13.88 | step: 13.75 + 73%|███████▎ | 552/759 [1:17:41<25:30, 7.40s/it] {'loss': 1.2322, 'learning_rate': 3.6560671583635467e-06, 'epoch': 0.73} + 73%|███████▎ | 552/759 [1:17:41<25:30, 7.40s/it][2024-12-31 18:27:47,455] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 222.06 | bwd_microstep: 358.15 | bwd_inner_microstep: 357.80 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:27:47,951] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.03 | bwd_microstep: 286.79 | bwd_inner_microstep: 286.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:27:48,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.36 | bwd_microstep: 283.45 | bwd_inner_microstep: 283.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:27:48,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 186.27 | bwd_microstep: 262.67 | bwd_inner_microstep: 262.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:27:49,350] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.47 | bwd_microstep: 256.51 | bwd_inner_microstep: 256.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:27:49,778] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.19 | bwd_microstep: 248.10 | bwd_inner_microstep: 248.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:27:50,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.59 | bwd_microstep: 247.42 | bwd_inner_microstep: 247.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:27:50,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.52 | bwd_microstep: 246.18 | bwd_inner_microstep: 246.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:27:51,081] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.43 | bwd_microstep: 244.32 | bwd_inner_microstep: 244.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:51,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 244.50 | bwd_inner_microstep: 244.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:51,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 244.09 | bwd_inner_microstep: 244.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:27:52,373] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.88 | bwd_microstep: 244.16 | bwd_inner_microstep: 244.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:52,807] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.37 | bwd_microstep: 244.91 | bwd_inner_microstep: 244.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.28 +[2024-12-31 18:27:53,233] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.60 | bwd_microstep: 241.24 | bwd_inner_microstep: 241.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.70 +[2024-12-31 18:27:53,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.53 | bwd_microstep: 262.26 | bwd_inner_microstep: 262.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:27:54,403] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.09 | optimizer_gradients: 0.86 | optimizer_step: 6.28 +[2024-12-31 18:27:54,403] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.81 | bwd_microstep: 522.19 | bwd_inner_microstep: 242.08 | bwd_allreduce_microstep: 280.06 | step_microstep: 15.17 +[2024-12-31 18:27:54,404] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2833.37 | bwd: 4436.98 | bwd_inner: 4156.14 | bwd_allreduce: 280.31 | step: 18.52 + 73%|███████▎ | 553/759 [1:17:49<25:32, 7.44s/it] {'loss': 1.1993, 'learning_rate': 3.6231293451357994e-06, 'epoch': 0.73} + 73%|███████▎ | 553/759 [1:17:49<25:32, 7.44s/it][2024-12-31 18:27:54,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.76 | bwd_microstep: 342.24 | bwd_inner_microstep: 341.90 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:27:55,494] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.71 | bwd_microstep: 290.82 | bwd_inner_microstep: 290.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:55,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.08 | bwd_microstep: 272.59 | bwd_inner_microstep: 271.99 | bwd_allreduce_microstep: 0.34 | step_microstep: 0.46 +[2024-12-31 18:27:56,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.73 | bwd_microstep: 258.10 | bwd_inner_microstep: 258.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:27:56,867] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 250.43 | bwd_inner_microstep: 250.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:27:57,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.65 | bwd_microstep: 260.25 | bwd_inner_microstep: 260.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:57,752] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.71 | bwd_microstep: 247.48 | bwd_inner_microstep: 247.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:27:58,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.00 | bwd_microstep: 244.17 | bwd_inner_microstep: 244.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:27:58,618] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.03 | bwd_microstep: 245.24 | bwd_inner_microstep: 245.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:27:59,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.04 | bwd_microstep: 245.28 | bwd_inner_microstep: 245.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:59,483] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.96 | bwd_microstep: 243.99 | bwd_inner_microstep: 243.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:27:59,923] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.36 | bwd_microstep: 247.90 | bwd_inner_microstep: 247.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:28:00,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.33 | bwd_microstep: 241.93 | bwd_inner_microstep: 241.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:28:00,779] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.15 | bwd_microstep: 240.94 | bwd_inner_microstep: 240.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:01,210] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 243.99 | bwd_inner_microstep: 243.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:01,670] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.77 | optimizer_gradients: 0.68 | optimizer_step: 7.52 +[2024-12-31 18:28:01,671] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.08 | bwd_microstep: 258.99 | bwd_inner_microstep: 245.23 | bwd_allreduce_microstep: 13.63 | step_microstep: 15.69 +[2024-12-31 18:28:01,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2809.84 | bwd: 4134.53 | bwd_inner: 4119.34 | bwd_allreduce: 14.31 | step: 19.00 + 73%|███████▎ | 554/759 [1:17:56<25:14, 7.39s/it] {'loss': 1.2278, 'learning_rate': 3.590307717176401e-06, 'epoch': 0.73} + 73%|███████▎ | 554/759 [1:17:56<25:14, 7.39s/it][2024-12-31 18:28:02,320] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 240.32 | bwd_microstep: 392.86 | bwd_inner_microstep: 392.50 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.22 +[2024-12-31 18:28:02,822] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 198.29 | bwd_microstep: 287.62 | bwd_inner_microstep: 287.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:28:03,291] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.07 | bwd_microstep: 267.95 | bwd_inner_microstep: 267.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:03,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.51 | bwd_microstep: 249.28 | bwd_inner_microstep: 249.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:04,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.30 | bwd_microstep: 251.38 | bwd_inner_microstep: 251.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:04,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.40 | bwd_microstep: 250.01 | bwd_inner_microstep: 249.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:05,046] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.56 | bwd_microstep: 247.12 | bwd_inner_microstep: 247.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:05,481] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.19 | bwd_microstep: 245.70 | bwd_inner_microstep: 245.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:28:05,916] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.56 | bwd_microstep: 245.31 | bwd_inner_microstep: 245.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:06,348] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.19 | bwd_microstep: 244.73 | bwd_inner_microstep: 244.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:06,782] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.68 | bwd_microstep: 243.65 | bwd_inner_microstep: 243.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:07,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.63 | bwd_microstep: 244.79 | bwd_inner_microstep: 244.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:07,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.78 | bwd_microstep: 243.62 | bwd_inner_microstep: 243.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:08,081] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 249.85 | bwd_inner_microstep: 249.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:08,513] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.92 | bwd_microstep: 242.97 | bwd_inner_microstep: 242.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:09,255] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.91 | optimizer_gradients: 0.56 | optimizer_step: 3.09 +[2024-12-31 18:28:09,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.83 | bwd_microstep: 548.13 | bwd_inner_microstep: 242.74 | bwd_allreduce_microstep: 305.35 | step_microstep: 10.60 +[2024-12-31 18:28:09,257] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2841.37 | bwd: 4455.05 | bwd_inner: 4148.89 | bwd_allreduce: 305.60 | step: 13.74 + 73%|███████▎ | 555/759 [1:18:04<25:19, 7.45s/it] {'loss': 1.2176, 'learning_rate': 3.557602872488638e-06, 'epoch': 0.73} + 73%|███████▎ | 555/759 [1:18:04<25:19, 7.45s/it][2024-12-31 18:28:09,896] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 280.84 | bwd_microstep: 335.84 | bwd_inner_microstep: 335.49 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:28:10,383] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.84 | bwd_microstep: 281.97 | bwd_inner_microstep: 281.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:28:10,850] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.46 | bwd_microstep: 267.02 | bwd_inner_microstep: 266.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:28:11,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.26 | bwd_microstep: 264.05 | bwd_inner_microstep: 264.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:11,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.67 | bwd_microstep: 256.26 | bwd_inner_microstep: 256.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:28:12,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.12 | bwd_microstep: 254.94 | bwd_inner_microstep: 254.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:28:12,658] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.89 | bwd_microstep: 248.77 | bwd_inner_microstep: 248.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:28:13,094] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.82 | bwd_microstep: 248.12 | bwd_inner_microstep: 248.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:28:13,528] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.87 | bwd_microstep: 244.71 | bwd_inner_microstep: 244.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:13,962] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.70 | bwd_microstep: 245.26 | bwd_inner_microstep: 245.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:28:14,389] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.97 | bwd_microstep: 243.86 | bwd_inner_microstep: 243.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:14,851] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 195.48 | bwd_microstep: 244.43 | bwd_inner_microstep: 244.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:15,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.14 | bwd_microstep: 244.05 | bwd_inner_microstep: 244.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:15,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.37 | bwd_microstep: 243.29 | bwd_inner_microstep: 243.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:16,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.27 | bwd_microstep: 242.79 | bwd_inner_microstep: 242.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:28:16,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.17 | optimizer_gradients: 0.57 | optimizer_step: 3.13 +[2024-12-31 18:28:16,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.63 | bwd_microstep: 352.93 | bwd_inner_microstep: 243.50 | bwd_allreduce_microstep: 109.39 | step_microstep: 11.79 +[2024-12-31 18:28:16,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2918.76 | bwd: 4218.35 | bwd_inner: 4108.17 | bwd_allreduce: 109.65 | step: 14.80 + 73%|███████▎ | 556/759 [1:18:11<25:11, 7.44s/it] {'loss': 1.2265, 'learning_rate': 3.525015406948039e-06, 'epoch': 0.73} + 73%|███████▎ | 556/759 [1:18:11<25:11, 7.44s/it][2024-12-31 18:28:17,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 249.36 | bwd_microstep: 446.81 | bwd_inner_microstep: 446.45 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:28:17,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.10 | bwd_microstep: 311.94 | bwd_inner_microstep: 311.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:18,431] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.39 | bwd_microstep: 283.81 | bwd_inner_microstep: 283.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:18,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.19 | bwd_microstep: 262.49 | bwd_inner_microstep: 262.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.09 +[2024-12-31 18:28:19,341] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.03 | bwd_microstep: 263.47 | bwd_inner_microstep: 263.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:19,792] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.02 | bwd_microstep: 256.69 | bwd_inner_microstep: 256.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:20,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.04 | bwd_microstep: 255.36 | bwd_inner_microstep: 255.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:28:20,789] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 223.32 | bwd_microstep: 248.38 | bwd_inner_microstep: 248.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:28:21,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.85 | bwd_microstep: 246.51 | bwd_inner_microstep: 246.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:21,663] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 250.65 | bwd_inner_microstep: 250.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:22,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.45 | bwd_microstep: 332.42 | bwd_inner_microstep: 331.29 | bwd_allreduce_microstep: 0.49 | step_microstep: 0.79 +[2024-12-31 18:28:22,619] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.40 | bwd_microstep: 244.16 | bwd_inner_microstep: 244.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:23,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.62 | bwd_microstep: 244.37 | bwd_inner_microstep: 244.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:23,482] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.38 | bwd_microstep: 243.58 | bwd_inner_microstep: 243.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:23,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.87 | bwd_microstep: 244.90 | bwd_inner_microstep: 244.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:24,476] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.28 | optimizer_gradients: 0.57 | optimizer_step: 3.09 +[2024-12-31 18:28:24,476] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 157.99 | bwd_microstep: 376.48 | bwd_inner_microstep: 226.55 | bwd_allreduce_microstep: 149.89 | step_microstep: 11.34 +[2024-12-31 18:28:24,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2914.59 | bwd: 4512.22 | bwd_inner: 4360.50 | bwd_allreduce: 150.56 | step: 14.89 + 73%|███████▎ | 557/759 [1:18:19<25:24, 7.55s/it] {'loss': 1.2085, 'learning_rate': 3.492545914291512e-06, 'epoch': 0.73} + 73%|███████▎ | 557/759 [1:18:19<25:24, 7.55s/it][2024-12-31 18:28:25,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 226.02 | bwd_microstep: 366.55 | bwd_inner_microstep: 366.19 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.21 +[2024-12-31 18:28:25,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 223.26 | bwd_microstep: 283.54 | bwd_inner_microstep: 283.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:26,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.86 | bwd_microstep: 274.18 | bwd_inner_microstep: 274.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:26,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.36 | bwd_microstep: 252.09 | bwd_inner_microstep: 252.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 18:28:26,951] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.41 | bwd_microstep: 247.31 | bwd_inner_microstep: 247.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:28:27,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 244.52 | bwd_inner_microstep: 244.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:28:27,811] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.07 | bwd_microstep: 245.32 | bwd_inner_microstep: 245.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:28,247] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.62 | bwd_microstep: 245.21 | bwd_inner_microstep: 245.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:28,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 255.23 | bwd_inner_microstep: 255.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:29,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.60 | bwd_microstep: 243.43 | bwd_inner_microstep: 243.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:28:29,554] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.77 | bwd_microstep: 243.19 | bwd_inner_microstep: 243.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:29,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 242.84 | bwd_inner_microstep: 242.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:30,412] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.29 | bwd_microstep: 240.93 | bwd_inner_microstep: 240.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:30,868] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 200.08 | bwd_microstep: 242.44 | bwd_inner_microstep: 242.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:31,298] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.26 | bwd_microstep: 243.20 | bwd_inner_microstep: 243.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:31,795] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.01 | optimizer_gradients: 0.56 | optimizer_step: 3.14 +[2024-12-31 18:28:31,795] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.35 | bwd_microstep: 301.39 | bwd_inner_microstep: 241.63 | bwd_allreduce_microstep: 59.71 | step_microstep: 10.77 +[2024-12-31 18:28:31,797] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2880.11 | bwd: 4171.43 | bwd_inner: 4110.93 | bwd_allreduce: 59.97 | step: 13.49 + 74%|███████▎ | 558/759 [1:18:26<25:03, 7.48s/it] {'loss': 1.2287, 'learning_rate': 3.4601949861065086e-06, 'epoch': 0.74} + 74%|███████▎ | 558/759 [1:18:26<25:03, 7.48s/it][2024-12-31 18:28:32,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 218.67 | bwd_microstep: 346.36 | bwd_inner_microstep: 346.02 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:28:32,887] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 196.68 | bwd_microstep: 297.91 | bwd_inner_microstep: 297.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:28:33,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.16 | bwd_microstep: 317.39 | bwd_inner_microstep: 317.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:33,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.32 | bwd_microstep: 265.21 | bwd_inner_microstep: 265.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:34,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.39 | bwd_microstep: 249.12 | bwd_inner_microstep: 249.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:28:34,754] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.12 | bwd_microstep: 248.20 | bwd_inner_microstep: 248.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:35,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.28 | bwd_microstep: 248.45 | bwd_inner_microstep: 248.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:28:35,622] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.56 | bwd_microstep: 246.41 | bwd_inner_microstep: 246.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:28:36,054] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.79 | bwd_microstep: 244.19 | bwd_inner_microstep: 244.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:28:36,523] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.33 | bwd_microstep: 280.35 | bwd_inner_microstep: 280.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:36,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.44 | bwd_microstep: 244.07 | bwd_inner_microstep: 244.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:37,387] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.56 | bwd_microstep: 243.63 | bwd_inner_microstep: 243.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:37,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.50 | bwd_microstep: 242.53 | bwd_inner_microstep: 242.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:38,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.90 | bwd_microstep: 243.33 | bwd_inner_microstep: 243.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:38,680] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.90 | bwd_microstep: 244.88 | bwd_inner_microstep: 244.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:28:39,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.45 | optimizer_gradients: 0.56 | optimizer_step: 3.16 +[2024-12-31 18:28:39,289] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.67 | bwd_microstep: 411.91 | bwd_inner_microstep: 252.79 | bwd_allreduce_microstep: 159.07 | step_microstep: 11.70 +[2024-12-31 18:28:39,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2834.72 | bwd: 4374.01 | bwd_inner: 4214.15 | bwd_allreduce: 159.32 | step: 14.57 + 74%|███████▎ | 559/759 [1:18:34<24:56, 7.48s/it] {'loss': 1.2073, 'learning_rate': 3.4279632118202744e-06, 'epoch': 0.74} + 74%|███████▎ | 559/759 [1:18:34<24:56, 7.48s/it][2024-12-31 18:28:39,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 247.16 | bwd_microstep: 333.12 | bwd_inner_microstep: 332.75 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:28:40,374] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.40 | bwd_microstep: 282.82 | bwd_inner_microstep: 282.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:28:40,837] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.31 | bwd_microstep: 265.73 | bwd_inner_microstep: 265.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:28:41,276] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 249.54 | bwd_inner_microstep: 249.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:41,714] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.32 | bwd_microstep: 248.29 | bwd_inner_microstep: 248.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:42,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.96 | bwd_microstep: 248.25 | bwd_inner_microstep: 248.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:28:42,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.94 | bwd_microstep: 245.39 | bwd_inner_microstep: 245.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:28:43,028] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.57 | bwd_microstep: 246.54 | bwd_inner_microstep: 246.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:43,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 245.36 | bwd_inner_microstep: 245.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:43,894] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.18 | bwd_microstep: 245.33 | bwd_inner_microstep: 245.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:28:44,328] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.67 | bwd_microstep: 244.45 | bwd_inner_microstep: 244.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:28:44,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.15 | bwd_microstep: 242.65 | bwd_inner_microstep: 242.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:28:45,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.11 | bwd_microstep: 241.73 | bwd_inner_microstep: 241.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:28:45,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.04 | bwd_microstep: 241.19 | bwd_inner_microstep: 241.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:28:46,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 242.31 | bwd_inner_microstep: 242.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:46,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.51 | optimizer_gradients: 0.83 | optimizer_step: 3.12 +[2024-12-31 18:28:46,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.22 | bwd_microstep: 480.13 | bwd_inner_microstep: 241.88 | bwd_allreduce_microstep: 238.20 | step_microstep: 11.67 +[2024-12-31 18:28:46,711] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2827.93 | bwd: 4302.87 | bwd_inner: 4063.90 | bwd_allreduce: 238.44 | step: 14.57 + 74%|███████▍ | 560/759 [1:18:41<24:45, 7.46s/it] {'loss': 1.2092, 'learning_rate': 3.3958511786890923e-06, 'epoch': 0.74} + 74%|███████▍ | 560/759 [1:18:41<24:45, 7.46s/it][2024-12-31 18:28:47,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 219.62 | bwd_microstep: 345.69 | bwd_inner_microstep: 345.36 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.17 +[2024-12-31 18:28:47,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.94 | bwd_microstep: 286.11 | bwd_inner_microstep: 286.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:48,301] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.36 | bwd_microstep: 304.16 | bwd_inner_microstep: 304.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.28 +[2024-12-31 18:28:48,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.80 | bwd_microstep: 262.98 | bwd_inner_microstep: 262.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:49,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.01 | bwd_microstep: 248.60 | bwd_inner_microstep: 248.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:49,653] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.96 | bwd_microstep: 257.63 | bwd_inner_microstep: 257.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:28:50,088] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.33 | bwd_microstep: 246.61 | bwd_inner_microstep: 246.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:28:50,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.48 | bwd_microstep: 247.99 | bwd_inner_microstep: 247.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:50,960] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.86 | bwd_microstep: 245.33 | bwd_inner_microstep: 245.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:51,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 244.90 | bwd_inner_microstep: 244.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:51,839] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 243.96 | bwd_inner_microstep: 243.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:52,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.20 | bwd_microstep: 245.40 | bwd_inner_microstep: 245.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:52,735] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.69 | bwd_microstep: 242.97 | bwd_inner_microstep: 242.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:53,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 244.95 | bwd_inner_microstep: 244.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:28:53,587] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.55 | bwd_microstep: 243.64 | bwd_inner_microstep: 243.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:54,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.76 | optimizer_gradients: 0.65 | optimizer_step: 3.33 +[2024-12-31 18:28:54,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.71 | bwd_microstep: 256.30 | bwd_inner_microstep: 242.68 | bwd_allreduce_microstep: 13.53 | step_microstep: 11.14 +[2024-12-31 18:28:54,042] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2859.80 | bwd: 4167.34 | bwd_inner: 4152.87 | bwd_allreduce: 13.82 | step: 14.07 + 74%|███████▍ | 561/759 [1:18:49<24:30, 7.42s/it] {'loss': 1.2181, 'learning_rate': 3.3638594717875807e-06, 'epoch': 0.74} + 74%|███████▍ | 561/759 [1:18:49<24:30, 7.42s/it][2024-12-31 18:28:54,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 205.62 | bwd_microstep: 312.52 | bwd_inner_microstep: 312.16 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:28:55,085] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 194.46 | bwd_microstep: 297.30 | bwd_inner_microstep: 297.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:28:55,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.94 | bwd_microstep: 284.04 | bwd_inner_microstep: 284.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:56,035] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.26 | bwd_microstep: 264.77 | bwd_inner_microstep: 264.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:28:56,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.18 | bwd_microstep: 273.14 | bwd_inner_microstep: 273.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:56,952] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.63 | bwd_microstep: 248.71 | bwd_inner_microstep: 248.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:28:57,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.28 | bwd_microstep: 259.19 | bwd_inner_microstep: 259.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:57,840] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.38 | bwd_microstep: 248.71 | bwd_inner_microstep: 248.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:28:58,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.42 | bwd_microstep: 245.77 | bwd_inner_microstep: 245.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:58,697] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.43 | bwd_microstep: 244.70 | bwd_inner_microstep: 244.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:28:59,127] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.87 | bwd_microstep: 244.69 | bwd_inner_microstep: 244.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:28:59,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.58 | bwd_microstep: 243.05 | bwd_inner_microstep: 243.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:28:59,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.88 | bwd_microstep: 243.82 | bwd_inner_microstep: 243.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:29:00,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.49 | bwd_microstep: 243.43 | bwd_inner_microstep: 243.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:00,849] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.60 | bwd_microstep: 242.96 | bwd_inner_microstep: 242.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:01,300] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.61 | optimizer_gradients: 0.68 | optimizer_step: 3.38 +[2024-12-31 18:29:01,300] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.86 | bwd_microstep: 255.40 | bwd_inner_microstep: 241.70 | bwd_allreduce_microstep: 13.58 | step_microstep: 10.91 +[2024-12-31 18:29:01,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2828.10 | bwd: 4152.29 | bwd_inner: 4137.84 | bwd_allreduce: 13.87 | step: 13.47 + 74%|███████▍ | 562/759 [1:18:56<24:12, 7.38s/it] {'loss': 1.2126, 'learning_rate': 3.33198867399804e-06, 'epoch': 0.74} + 74%|███████▍ | 562/759 [1:18:56<24:12, 7.38s/it][2024-12-31 18:29:01,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.23 | bwd_microstep: 343.70 | bwd_inner_microstep: 343.33 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:29:02,375] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.28 | bwd_microstep: 289.41 | bwd_inner_microstep: 289.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:29:02,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.67 | bwd_microstep: 286.79 | bwd_inner_microstep: 286.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:29:03,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.13 | bwd_microstep: 268.99 | bwd_inner_microstep: 268.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:03,788] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.22 | bwd_microstep: 255.27 | bwd_inner_microstep: 255.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:04,238] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.72 | bwd_microstep: 250.03 | bwd_inner_microstep: 250.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:29:04,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.70 | bwd_microstep: 246.54 | bwd_inner_microstep: 246.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:05,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.22 | bwd_microstep: 247.44 | bwd_inner_microstep: 247.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:29:05,553] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.83 | bwd_microstep: 247.22 | bwd_inner_microstep: 247.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:29:05,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.22 | bwd_microstep: 243.81 | bwd_inner_microstep: 243.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:06,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.42 | bwd_microstep: 246.48 | bwd_inner_microstep: 246.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:29:06,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.25 | bwd_microstep: 242.97 | bwd_inner_microstep: 242.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:29:07,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.68 | bwd_microstep: 243.43 | bwd_inner_microstep: 243.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:29:07,714] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.98 | bwd_microstep: 242.40 | bwd_inner_microstep: 242.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:08,139] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.62 | bwd_microstep: 241.46 | bwd_inner_microstep: 241.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:29:08,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.59 | optimizer_gradients: 0.76 | optimizer_step: 3.28 +[2024-12-31 18:29:08,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.91 | bwd_microstep: 255.50 | bwd_inner_microstep: 241.88 | bwd_allreduce_microstep: 13.53 | step_microstep: 10.90 +[2024-12-31 18:29:08,592] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2848.46 | bwd: 4151.57 | bwd_inner: 4137.04 | bwd_allreduce: 13.81 | step: 13.86 + 74%|███████▍ | 563/759 [1:19:03<24:00, 7.35s/it] {'loss': 1.2117, 'learning_rate': 3.3002393659998357e-06, 'epoch': 0.74} + 74%|███████▍ | 563/759 [1:19:03<24:00, 7.35s/it][2024-12-31 18:29:09,245] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 239.06 | bwd_microstep: 394.33 | bwd_inner_microstep: 393.99 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:29:09,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.04 | bwd_microstep: 294.48 | bwd_inner_microstep: 294.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:29:10,225] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.21 | bwd_microstep: 267.60 | bwd_inner_microstep: 267.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:10,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.95 | bwd_microstep: 254.45 | bwd_inner_microstep: 254.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:11,112] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.60 | bwd_microstep: 250.52 | bwd_inner_microstep: 250.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:29:11,547] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.39 | bwd_microstep: 246.70 | bwd_inner_microstep: 246.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:29:11,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 244.18 | bwd_inner_microstep: 244.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:12,412] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.31 | bwd_microstep: 244.11 | bwd_inner_microstep: 244.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:12,843] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.78 | bwd_microstep: 243.92 | bwd_inner_microstep: 243.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:29:13,291] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.18 | bwd_microstep: 262.07 | bwd_inner_microstep: 262.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:29:13,722] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.71 | bwd_microstep: 243.51 | bwd_inner_microstep: 243.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:29:14,157] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.70 | bwd_microstep: 247.39 | bwd_inner_microstep: 247.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:14,586] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.30 | bwd_microstep: 242.42 | bwd_inner_microstep: 242.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:29:15,012] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.67 | bwd_microstep: 241.15 | bwd_inner_microstep: 241.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:15,438] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.78 | bwd_microstep: 240.81 | bwd_inner_microstep: 240.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:29:15,982] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.16 | optimizer_gradients: 0.55 | optimizer_step: 3.09 +[2024-12-31 18:29:15,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.62 | bwd_microstep: 349.88 | bwd_inner_microstep: 241.69 | bwd_allreduce_microstep: 108.14 | step_microstep: 12.29 +[2024-12-31 18:29:15,984] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2832.42 | bwd: 4267.59 | bwd_inner: 4158.64 | bwd_allreduce: 108.38 | step: 15.24 + 74%|███████▍ | 564/759 [1:19:10<23:55, 7.36s/it] {'loss': 1.2126, 'learning_rate': 3.2686121262588165e-06, 'epoch': 0.74} + 74%|███████▍ | 564/759 [1:19:10<23:55, 7.36s/it][2024-12-31 18:29:16,587] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 225.20 | bwd_microstep: 357.96 | bwd_inner_microstep: 357.61 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:29:17,099] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.07 | bwd_microstep: 298.22 | bwd_inner_microstep: 298.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:17,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.21 | bwd_microstep: 283.23 | bwd_inner_microstep: 283.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:29:18,048] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.25 | bwd_microstep: 263.10 | bwd_inner_microstep: 263.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:29:18,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.85 | bwd_microstep: 264.03 | bwd_inner_microstep: 264.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:29:18,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.14 | bwd_microstep: 256.52 | bwd_inner_microstep: 256.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:29:19,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 184.93 | bwd_microstep: 253.21 | bwd_inner_microstep: 253.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:29:19,847] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.76 | bwd_microstep: 245.41 | bwd_inner_microstep: 245.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:20,281] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.89 | bwd_microstep: 246.09 | bwd_inner_microstep: 246.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:20,712] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.60 | bwd_microstep: 243.88 | bwd_inner_microstep: 243.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:21,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.15 | bwd_microstep: 244.88 | bwd_inner_microstep: 244.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:29:21,580] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.59 | bwd_microstep: 244.30 | bwd_inner_microstep: 244.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:22,009] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.58 | bwd_microstep: 243.30 | bwd_inner_microstep: 243.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:29:22,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.42 | bwd_microstep: 245.56 | bwd_inner_microstep: 245.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:22,868] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.75 | bwd_microstep: 241.38 | bwd_inner_microstep: 241.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:29:23,324] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.72 | optimizer_gradients: 0.85 | optimizer_step: 3.42 +[2024-12-31 18:29:23,325] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.00 | bwd_microstep: 257.23 | bwd_inner_microstep: 243.52 | bwd_allreduce_microstep: 13.60 | step_microstep: 11.64 +[2024-12-31 18:29:23,327] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2861.84 | bwd: 4188.39 | bwd_inner: 4173.91 | bwd_allreduce: 13.89 | step: 14.73 + 74%|███████▍ | 565/759 [1:19:18<23:47, 7.36s/it] {'loss': 1.226, 'learning_rate': 3.2371075310167634e-06, 'epoch': 0.74} + 74%|███████▍ | 565/759 [1:19:18<23:47, 7.36s/it][2024-12-31 18:29:23,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 212.67 | bwd_microstep: 334.90 | bwd_inner_microstep: 334.54 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:29:24,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.95 | bwd_microstep: 282.92 | bwd_inner_microstep: 282.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:29:24,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.33 | bwd_microstep: 285.06 | bwd_inner_microstep: 285.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:29:25,324] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.65 | bwd_microstep: 256.09 | bwd_inner_microstep: 256.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:25,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 250.20 | bwd_inner_microstep: 250.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:26,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 249.14 | bwd_inner_microstep: 249.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:29:26,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.43 | bwd_microstep: 245.44 | bwd_inner_microstep: 245.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:27,092] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.96 | bwd_microstep: 246.80 | bwd_inner_microstep: 246.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:29:27,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.17 | bwd_microstep: 244.91 | bwd_inner_microstep: 244.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:29:27,951] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.90 | bwd_microstep: 243.69 | bwd_inner_microstep: 243.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:29:28,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.00 | bwd_microstep: 244.54 | bwd_inner_microstep: 244.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:29:28,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.77 | bwd_microstep: 244.16 | bwd_inner_microstep: 244.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:29:29,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.44 | bwd_microstep: 243.09 | bwd_inner_microstep: 243.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:29:29,653] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.70 | bwd_microstep: 247.98 | bwd_inner_microstep: 247.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:29:30,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.50 | bwd_microstep: 243.18 | bwd_inner_microstep: 243.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:29:30,540] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.39 | optimizer_gradients: 0.57 | optimizer_step: 3.20 +[2024-12-31 18:29:30,541] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 157.63 | bwd_microstep: 283.71 | bwd_inner_microstep: 226.92 | bwd_allreduce_microstep: 56.74 | step_microstep: 12.14 +[2024-12-31 18:29:30,545] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2826.75 | bwd: 4145.85 | bwd_inner: 4088.33 | bwd_allreduce: 56.99 | step: 14.29 + 75%|███████▍ | 566/759 [1:19:25<23:32, 7.32s/it] {'loss': 1.1847, 'learning_rate': 3.205726154280905e-06, 'epoch': 0.75} + 75%|███████▍ | 566/759 [1:19:25<23:32, 7.32s/it][2024-12-31 18:29:31,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 287.15 | bwd_microstep: 490.43 | bwd_inner_microstep: 490.09 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:29:31,856] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 194.01 | bwd_microstep: 290.98 | bwd_inner_microstep: 290.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:29:32,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.71 | bwd_microstep: 314.18 | bwd_inner_microstep: 314.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:29:32,836] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.51 | bwd_microstep: 262.25 | bwd_inner_microstep: 262.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 18:29:33,287] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.69 | bwd_microstep: 255.65 | bwd_inner_microstep: 255.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:29:33,736] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.57 | bwd_microstep: 255.63 | bwd_inner_microstep: 255.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:34,176] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.00 | bwd_microstep: 248.97 | bwd_inner_microstep: 248.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:29:34,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.35 | bwd_microstep: 247.48 | bwd_inner_microstep: 247.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:35,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.41 | bwd_microstep: 245.99 | bwd_inner_microstep: 245.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:29:35,481] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.94 | bwd_microstep: 246.90 | bwd_inner_microstep: 246.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:35,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.47 | bwd_microstep: 246.91 | bwd_inner_microstep: 246.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:36,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.56 | bwd_microstep: 245.47 | bwd_inner_microstep: 245.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:29:36,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.54 | bwd_microstep: 243.81 | bwd_inner_microstep: 243.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:37,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.78 | bwd_microstep: 243.58 | bwd_inner_microstep: 243.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:29:37,662] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.62 | bwd_microstep: 243.51 | bwd_inner_microstep: 243.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:38,122] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.56 | optimizer_gradients: 0.62 | optimizer_step: 3.25 +[2024-12-31 18:29:38,123] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.68 | bwd_microstep: 257.29 | bwd_inner_microstep: 243.59 | bwd_allreduce_microstep: 13.56 | step_microstep: 16.74 +[2024-12-31 18:29:38,125] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2924.43 | bwd: 4339.14 | bwd_inner: 4324.66 | bwd_allreduce: 13.84 | step: 19.49 + 75%|███████▍ | 567/759 [1:19:33<23:39, 7.39s/it] {'loss': 1.2131, 'learning_rate': 3.174468567813461e-06, 'epoch': 0.75} + 75%|███████▍ | 567/759 [1:19:33<23:39, 7.39s/it][2024-12-31 18:29:38,707] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 222.89 | bwd_microstep: 345.60 | bwd_inner_microstep: 345.25 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:29:39,194] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.95 | bwd_microstep: 281.86 | bwd_inner_microstep: 281.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:39,688] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.98 | bwd_microstep: 294.07 | bwd_inner_microstep: 293.69 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.23 +[2024-12-31 18:29:40,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.18 | bwd_microstep: 251.01 | bwd_inner_microstep: 250.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:29:40,592] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.89 | bwd_microstep: 257.16 | bwd_inner_microstep: 257.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:29:41,018] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.72 | bwd_microstep: 248.36 | bwd_inner_microstep: 248.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:29:41,444] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.15 | bwd_microstep: 246.95 | bwd_inner_microstep: 246.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:29:41,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.32 | bwd_microstep: 245.11 | bwd_inner_microstep: 245.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:29:42,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.63 | bwd_microstep: 244.74 | bwd_inner_microstep: 244.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:42,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.39 | bwd_microstep: 250.10 | bwd_inner_microstep: 250.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:43,164] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.33 | bwd_microstep: 244.42 | bwd_inner_microstep: 244.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:29:43,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.72 | bwd_microstep: 244.34 | bwd_inner_microstep: 244.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:29:44,026] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.66 | bwd_microstep: 243.88 | bwd_inner_microstep: 243.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:29:44,454] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.52 | bwd_microstep: 241.36 | bwd_inner_microstep: 241.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:29:44,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.88 | bwd_microstep: 241.06 | bwd_inner_microstep: 241.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:29:45,459] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.61 | optimizer_gradients: 1.16 | optimizer_step: 3.20 +[2024-12-31 18:29:45,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.87 | bwd_microstep: 379.70 | bwd_inner_microstep: 244.08 | bwd_allreduce_microstep: 135.58 | step_microstep: 13.60 +[2024-12-31 18:29:45,461] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2808.61 | bwd: 4259.91 | bwd_inner: 4123.08 | bwd_allreduce: 135.98 | step: 15.92 + 75%|███████▍ | 568/759 [1:19:40<23:28, 7.38s/it] {'loss': 1.2046, 'learning_rate': 3.143335341121202e-06, 'epoch': 0.75} + 75%|███████▍ | 568/759 [1:19:40<23:28, 7.38s/it][2024-12-31 18:29:46,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 243.99 | bwd_microstep: 384.41 | bwd_inner_microstep: 384.07 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:29:46,646] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.27 | bwd_microstep: 310.81 | bwd_inner_microstep: 310.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:29:47,139] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.78 | bwd_microstep: 284.62 | bwd_inner_microstep: 284.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:29:47,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.49 | bwd_microstep: 262.82 | bwd_inner_microstep: 262.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:29:48,047] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.90 | bwd_microstep: 247.54 | bwd_inner_microstep: 247.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:29:48,482] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.04 | bwd_microstep: 246.99 | bwd_inner_microstep: 246.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:29:48,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.56 | bwd_microstep: 247.36 | bwd_inner_microstep: 247.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:29:49,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.34 | bwd_microstep: 244.89 | bwd_inner_microstep: 244.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:29:49,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.00 | bwd_microstep: 283.08 | bwd_inner_microstep: 283.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:29:50,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.42 | bwd_microstep: 246.28 | bwd_inner_microstep: 246.04 | bwd_allreduce_microstep: 0.06 | step_microstep: 0.31 +[2024-12-31 18:29:50,748] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.90 | bwd_microstep: 244.82 | bwd_inner_microstep: 244.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:29:51,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.14 | bwd_microstep: 244.47 | bwd_inner_microstep: 244.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:29:51,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.64 | bwd_microstep: 244.16 | bwd_inner_microstep: 244.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:29:52,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.52 | bwd_microstep: 257.29 | bwd_inner_microstep: 256.90 | bwd_allreduce_microstep: 0.20 | step_microstep: 0.36 +[2024-12-31 18:29:52,487] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.92 | bwd_microstep: 242.03 | bwd_inner_microstep: 241.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:29:52,938] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.70 | optimizer_gradients: 0.64 | optimizer_step: 3.30 +[2024-12-31 18:29:52,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.72 | bwd_microstep: 254.99 | bwd_inner_microstep: 241.29 | bwd_allreduce_microstep: 13.59 | step_microstep: 13.09 +[2024-12-31 18:29:52,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2868.05 | bwd: 4246.88 | bwd_inner: 4231.56 | bwd_allreduce: 14.25 | step: 16.43 + 75%|███████▍ | 569/759 [1:19:47<23:27, 7.41s/it] {'loss': 1.2495, 'learning_rate': 3.1123270414451035e-06, 'epoch': 0.75} + 75%|███████▍ | 569/759 [1:19:47<23:27, 7.41s/it][2024-12-31 18:29:53,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 239.91 | bwd_microstep: 394.72 | bwd_inner_microstep: 394.37 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.25 +[2024-12-31 18:29:54,118] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.99 | bwd_microstep: 304.25 | bwd_inner_microstep: 304.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:29:54,587] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.39 | bwd_microstep: 268.11 | bwd_inner_microstep: 268.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:29:55,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.94 | bwd_microstep: 266.17 | bwd_inner_microstep: 266.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:29:55,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.99 | bwd_microstep: 255.41 | bwd_inner_microstep: 255.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:29:55,936] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.30 | bwd_microstep: 247.76 | bwd_inner_microstep: 247.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:29:56,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.83 | bwd_microstep: 250.94 | bwd_inner_microstep: 250.63 | bwd_allreduce_microstep: 0.19 | step_microstep: 0.26 +[2024-12-31 18:29:56,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 207.56 | bwd_microstep: 247.59 | bwd_inner_microstep: 247.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:29:57,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.37 | bwd_microstep: 251.33 | bwd_inner_microstep: 251.17 | bwd_allreduce_microstep: 0.05 | step_microstep: 0.23 +[2024-12-31 18:29:57,724] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.10 | bwd_microstep: 243.17 | bwd_inner_microstep: 243.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.09 +[2024-12-31 18:29:58,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.61 | bwd_microstep: 248.03 | bwd_inner_microstep: 248.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:29:58,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.01 | bwd_microstep: 242.31 | bwd_inner_microstep: 242.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:29:59,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.01 | bwd_microstep: 242.82 | bwd_inner_microstep: 242.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:29:59,451] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.62 | bwd_microstep: 240.50 | bwd_inner_microstep: 240.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:29:59,883] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.44 | bwd_microstep: 242.83 | bwd_inner_microstep: 242.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:30:00,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.72 | optimizer_step: 3.57 +[2024-12-31 18:30:00,336] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.66 | bwd_microstep: 255.21 | bwd_inner_microstep: 241.29 | bwd_allreduce_microstep: 13.80 | step_microstep: 11.52 +[2024-12-31 18:30:00,339] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2884.12 | bwd: 4201.37 | bwd_inner: 4186.10 | bwd_allreduce: 14.41 | step: 14.73 + 75%|███████▌ | 570/759 [1:19:55<23:19, 7.40s/it] {'loss': 1.2398, 'learning_rate': 3.081444233749994e-06, 'epoch': 0.75} + 75%|███████▌ | 570/759 [1:19:55<23:19, 7.40s/it][2024-12-31 18:30:00,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 208.50 | bwd_microstep: 307.00 | bwd_inner_microstep: 306.66 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:30:01,485] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 228.53 | bwd_microstep: 368.22 | bwd_inner_microstep: 368.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:30:01,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.29 | bwd_microstep: 269.78 | bwd_inner_microstep: 269.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:02,396] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.83 | bwd_microstep: 255.27 | bwd_inner_microstep: 255.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:02,836] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.00 | bwd_microstep: 250.23 | bwd_inner_microstep: 250.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:30:03,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.88 | bwd_microstep: 246.81 | bwd_inner_microstep: 246.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:30:03,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.14 | bwd_microstep: 245.22 | bwd_inner_microstep: 245.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:04,139] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 245.76 | bwd_inner_microstep: 245.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:30:04,582] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.08 | bwd_microstep: 247.21 | bwd_inner_microstep: 247.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:05,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.37 | bwd_microstep: 243.81 | bwd_inner_microstep: 243.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:05,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.03 | bwd_microstep: 244.30 | bwd_inner_microstep: 244.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:30:05,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 242.73 | bwd_inner_microstep: 242.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:30:06,312] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.32 | bwd_microstep: 245.71 | bwd_inner_microstep: 245.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:30:06,735] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.59 | bwd_microstep: 244.21 | bwd_inner_microstep: 244.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.30 +[2024-12-31 18:30:07,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.61 | bwd_microstep: 241.72 | bwd_inner_microstep: 241.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.31 +[2024-12-31 18:30:07,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.85 | optimizer_gradients: 0.64 | optimizer_step: 3.30 +[2024-12-31 18:30:07,650] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.33 | bwd_microstep: 256.44 | bwd_inner_microstep: 242.76 | bwd_allreduce_microstep: 13.57 | step_microstep: 11.32 +[2024-12-31 18:30:07,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2880.58 | bwd: 4154.50 | bwd_inner: 4140.05 | bwd_allreduce: 13.85 | step: 14.22 + 75%|███████▌ | 571/759 [1:20:02<23:06, 7.38s/it] {'loss': 1.2245, 'learning_rate': 3.050687480714256e-06, 'epoch': 0.75} + 75%|███████▌ | 571/759 [1:20:02<23:06, 7.38s/it][2024-12-31 18:30:08,289] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 236.04 | bwd_microstep: 386.03 | bwd_inner_microstep: 385.52 | bwd_allreduce_microstep: 0.26 | step_microstep: 0.27 +[2024-12-31 18:30:08,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.19 | bwd_microstep: 297.58 | bwd_inner_microstep: 297.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:30:09,302] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.13 | bwd_microstep: 288.81 | bwd_inner_microstep: 288.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:09,770] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.88 | bwd_microstep: 269.72 | bwd_inner_microstep: 269.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:30:10,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.54 | bwd_microstep: 262.43 | bwd_inner_microstep: 262.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:10,674] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.45 | bwd_microstep: 256.22 | bwd_inner_microstep: 256.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:11,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.23 | bwd_microstep: 247.39 | bwd_inner_microstep: 247.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:30:11,553] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.02 | bwd_microstep: 253.91 | bwd_inner_microstep: 253.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:11,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.95 | bwd_microstep: 246.43 | bwd_inner_microstep: 246.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.29 +[2024-12-31 18:30:12,421] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.40 | bwd_microstep: 243.96 | bwd_inner_microstep: 243.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:30:12,850] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.48 | bwd_microstep: 242.95 | bwd_inner_microstep: 242.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:30:13,283] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.08 | bwd_microstep: 243.33 | bwd_inner_microstep: 243.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.31 +[2024-12-31 18:30:13,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.57 | bwd_microstep: 243.42 | bwd_inner_microstep: 243.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:30:14,144] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 247.95 | bwd_inner_microstep: 247.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:30:14,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.63 | bwd_microstep: 245.05 | bwd_inner_microstep: 245.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:15,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.36 | optimizer_gradients: 0.80 | optimizer_step: 3.39 +[2024-12-31 18:30:15,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.26 | bwd_microstep: 258.13 | bwd_inner_microstep: 244.18 | bwd_allreduce_microstep: 13.83 | step_microstep: 14.56 +[2024-12-31 18:30:15,039] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2859.03 | bwd: 4233.62 | bwd_inner: 4218.33 | bwd_allreduce: 14.25 | step: 17.60 + 75%|███████▌ | 572/759 [1:20:09<23:00, 7.38s/it] {'loss': 1.2042, 'learning_rate': 3.0200573427195877e-06, 'epoch': 0.75} + 75%|███████▌ | 572/759 [1:20:10<23:00, 7.38s/it][2024-12-31 18:30:15,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 204.10 | bwd_microstep: 314.99 | bwd_inner_microstep: 314.62 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:30:16,088] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.78 | bwd_microstep: 294.48 | bwd_inner_microstep: 294.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:30:16,548] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.84 | bwd_microstep: 263.87 | bwd_inner_microstep: 263.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:30:17,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.18 | bwd_microstep: 258.22 | bwd_inner_microstep: 258.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:17,438] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.67 | bwd_microstep: 249.16 | bwd_inner_microstep: 249.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:30:17,884] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.56 | bwd_microstep: 256.72 | bwd_inner_microstep: 256.50 | bwd_allreduce_microstep: 0.06 | step_microstep: 0.25 +[2024-12-31 18:30:18,316] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 245.87 | bwd_inner_microstep: 245.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:30:18,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.62 | bwd_microstep: 245.50 | bwd_inner_microstep: 245.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:30:19,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.95 | bwd_microstep: 246.26 | bwd_inner_microstep: 246.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:30:19,616] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.53 | bwd_microstep: 246.81 | bwd_inner_microstep: 246.33 | bwd_allreduce_microstep: 0.26 | step_microstep: 0.28 +[2024-12-31 18:30:20,055] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.37 | bwd_microstep: 243.51 | bwd_inner_microstep: 243.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:30:20,490] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.90 | bwd_microstep: 247.76 | bwd_inner_microstep: 247.59 | bwd_allreduce_microstep: 0.04 | step_microstep: 0.22 +[2024-12-31 18:30:20,936] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.45 | bwd_microstep: 255.80 | bwd_inner_microstep: 255.67 | bwd_allreduce_microstep: 0.03 | step_microstep: 0.22 +[2024-12-31 18:30:21,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.18 | bwd_microstep: 248.34 | bwd_inner_microstep: 248.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:21,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 242.82 | bwd_inner_microstep: 242.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:30:22,257] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.42 | optimizer_gradients: 0.78 | optimizer_step: 3.49 +[2024-12-31 18:30:22,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.86 | bwd_microstep: 257.63 | bwd_inner_microstep: 243.87 | bwd_allreduce_microstep: 13.65 | step_microstep: 11.94 +[2024-12-31 18:30:22,260] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2800.49 | bwd: 4118.03 | bwd_inner: 4102.43 | bwd_allreduce: 14.48 | step: 15.06 + 75%|███████▌ | 573/759 [1:20:17<22:43, 7.33s/it] {'loss': 1.2298, 'learning_rate': 2.9895543778407875e-06, 'epoch': 0.75} + 75%|███████▌ | 573/759 [1:20:17<22:43, 7.33s/it][2024-12-31 18:30:22,913] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 240.30 | bwd_microstep: 396.66 | bwd_inner_microstep: 396.31 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.22 +[2024-12-31 18:30:23,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.33 | bwd_microstep: 286.78 | bwd_inner_microstep: 286.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:30:23,884] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 194.13 | bwd_microstep: 263.35 | bwd_inner_microstep: 263.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:24,342] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.07 | bwd_microstep: 262.98 | bwd_inner_microstep: 262.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:24,779] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.41 | bwd_microstep: 248.35 | bwd_inner_microstep: 248.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:30:25,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 247.50 | bwd_inner_microstep: 247.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:25,648] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.96 | bwd_microstep: 246.26 | bwd_inner_microstep: 246.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:30:26,086] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.61 | bwd_microstep: 247.12 | bwd_inner_microstep: 247.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:30:26,519] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.38 | bwd_microstep: 246.00 | bwd_inner_microstep: 245.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:26,961] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 254.81 | bwd_inner_microstep: 254.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:27,397] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.06 | bwd_microstep: 245.40 | bwd_inner_microstep: 245.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:30:27,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.65 | bwd_microstep: 243.82 | bwd_inner_microstep: 243.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:28,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 254.35 | bwd_inner_microstep: 254.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:28,699] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.14 | bwd_microstep: 241.28 | bwd_inner_microstep: 241.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:30:29,128] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.60 | bwd_microstep: 241.99 | bwd_inner_microstep: 241.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:30:29,583] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.58 | optimizer_gradients: 1.22 | optimizer_step: 3.40 +[2024-12-31 18:30:29,583] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.77 | bwd_microstep: 256.49 | bwd_inner_microstep: 242.38 | bwd_allreduce_microstep: 13.95 | step_microstep: 13.52 +[2024-12-31 18:30:29,586] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2844.06 | bwd: 4183.29 | bwd_inner: 4168.31 | bwd_allreduce: 14.26 | step: 16.58 + 76%|███████▌ | 574/759 [1:20:24<22:36, 7.33s/it] {'loss': 1.2284, 'learning_rate': 2.959179141835591e-06, 'epoch': 0.76} + 76%|███████▌ | 574/759 [1:20:24<22:36, 7.33s/it][2024-12-31 18:30:30,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 270.48 | bwd_microstep: 357.12 | bwd_inner_microstep: 356.78 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:30:30,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.64 | bwd_microstep: 290.80 | bwd_inner_microstep: 290.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:30:31,188] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.65 | bwd_microstep: 261.53 | bwd_inner_microstep: 261.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:30:31,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 224.10 | bwd_microstep: 249.12 | bwd_inner_microstep: 249.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:32,119] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.56 | bwd_microstep: 251.22 | bwd_inner_microstep: 251.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:30:32,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.16 | bwd_microstep: 254.76 | bwd_inner_microstep: 254.47 | bwd_allreduce_microstep: 0.18 | step_microstep: 0.26 +[2024-12-31 18:30:33,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.01 | bwd_microstep: 249.10 | bwd_inner_microstep: 249.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:30:33,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.33 | bwd_microstep: 248.73 | bwd_inner_microstep: 248.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:30:33,894] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.95 | bwd_microstep: 260.04 | bwd_inner_microstep: 260.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:30:34,328] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.30 | bwd_microstep: 244.17 | bwd_inner_microstep: 244.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:30:34,755] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.20 | bwd_microstep: 244.25 | bwd_inner_microstep: 244.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:30:35,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.51 | bwd_microstep: 243.07 | bwd_inner_microstep: 243.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:30:35,616] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.28 | bwd_microstep: 244.12 | bwd_inner_microstep: 244.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:36,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.59 | bwd_microstep: 241.33 | bwd_inner_microstep: 241.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:36,474] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.24 | bwd_microstep: 241.20 | bwd_inner_microstep: 241.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:36,951] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 12.34 | optimizer_gradients: 9.53 | optimizer_step: 7.31 +[2024-12-31 18:30:36,951] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 258.04 | bwd_inner_microstep: 243.51 | bwd_allreduce_microstep: 14.33 | step_microstep: 32.79 +[2024-12-31 18:30:36,953] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2919.20 | bwd: 4138.83 | bwd_inner: 4123.14 | bwd_allreduce: 14.87 | step: 35.75 + 76%|███████▌ | 575/759 [1:20:31<22:31, 7.34s/it] {'loss': 1.2351, 'learning_rate': 2.9289321881345257e-06, 'epoch': 0.76} + 76%|███████▌ | 575/759 [1:20:31<22:31, 7.34s/it][2024-12-31 18:30:37,488] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 199.94 | bwd_microstep: 305.11 | bwd_inner_microstep: 304.78 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:30:38,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 204.33 | bwd_microstep: 320.16 | bwd_inner_microstep: 320.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:30:38,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.78 | bwd_microstep: 280.62 | bwd_inner_microstep: 280.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 18:30:38,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.88 | bwd_microstep: 261.03 | bwd_inner_microstep: 260.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:30:39,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.48 | bwd_microstep: 249.60 | bwd_inner_microstep: 249.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:39,864] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.37 | bwd_microstep: 256.19 | bwd_inner_microstep: 256.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:30:40,302] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.12 | bwd_microstep: 248.92 | bwd_inner_microstep: 248.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:30:40,736] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 245.16 | bwd_inner_microstep: 245.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:30:41,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.68 | bwd_microstep: 245.10 | bwd_inner_microstep: 245.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:30:41,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.07 | bwd_microstep: 244.76 | bwd_inner_microstep: 244.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:30:42,043] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.39 | bwd_microstep: 245.09 | bwd_inner_microstep: 245.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:30:42,475] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 242.99 | bwd_inner_microstep: 242.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:42,916] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.04 | bwd_microstep: 244.06 | bwd_inner_microstep: 244.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:30:43,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 243.58 | bwd_inner_microstep: 243.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:30:43,749] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 157.46 | bwd_microstep: 224.83 | bwd_inner_microstep: 224.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:30:44,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.61 | optimizer_gradients: 0.70 | optimizer_step: 3.18 +[2024-12-31 18:30:44,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 326.65 | bwd_inner_microstep: 241.64 | bwd_allreduce_microstep: 84.96 | step_microstep: 52.44 +[2024-12-31 18:30:44,320] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2821.45 | bwd: 4184.00 | bwd_inner: 4098.16 | bwd_allreduce: 85.21 | step: 55.64 + 76%|███████▌ | 576/759 [1:20:39<22:24, 7.35s/it] {'loss': 1.2263, 'learning_rate': 2.898814067830855e-06, 'epoch': 0.76} + 76%|███████▌ | 576/759 [1:20:39<22:24, 7.35s/it][2024-12-31 18:30:44,863] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 204.27 | bwd_microstep: 317.87 | bwd_inner_microstep: 317.51 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:30:45,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.56 | bwd_microstep: 286.07 | bwd_inner_microstep: 286.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:30:45,851] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.67 | bwd_microstep: 280.67 | bwd_inner_microstep: 280.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:46,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.63 | bwd_microstep: 262.84 | bwd_inner_microstep: 262.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:30:46,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.11 | bwd_microstep: 257.31 | bwd_inner_microstep: 256.98 | bwd_allreduce_microstep: 0.23 | step_microstep: 0.27 +[2024-12-31 18:30:47,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.48 | bwd_microstep: 247.35 | bwd_inner_microstep: 247.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:30:47,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.86 | bwd_microstep: 253.34 | bwd_inner_microstep: 249.25 | bwd_allreduce_microstep: 3.85 | step_microstep: 0.30 +[2024-12-31 18:30:48,156] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.69 | bwd_microstep: 245.44 | bwd_inner_microstep: 245.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:30:48,584] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.60 | bwd_microstep: 244.88 | bwd_inner_microstep: 244.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:30:49,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.71 | bwd_microstep: 259.46 | bwd_inner_microstep: 259.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:30:49,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.59 | bwd_microstep: 244.85 | bwd_inner_microstep: 244.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:30:49,898] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.08 | bwd_microstep: 243.85 | bwd_inner_microstep: 243.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:30:50,330] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.49 | bwd_microstep: 243.04 | bwd_inner_microstep: 243.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:30:50,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.21 | bwd_microstep: 244.53 | bwd_inner_microstep: 244.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:30:51,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.80 | bwd_microstep: 242.98 | bwd_inner_microstep: 242.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:30:51,786] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 20.38 | optimizer_gradients: 0.59 | optimizer_step: 8.51 +[2024-12-31 18:30:51,786] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.37 | bwd_microstep: 372.06 | bwd_inner_microstep: 253.74 | bwd_allreduce_microstep: 118.27 | step_microstep: 32.12 +[2024-12-31 18:30:51,788] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2819.53 | bwd: 4246.69 | bwd_inner: 4123.16 | bwd_allreduce: 122.73 | step: 35.34 + 76%|███████▌ | 577/759 [1:20:46<22:23, 7.38s/it] {'loss': 1.2225, 'learning_rate': 2.868825329670524e-06, 'epoch': 0.76} + 76%|███████▌ | 577/759 [1:20:46<22:23, 7.38s/it][2024-12-31 18:30:52,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 257.88 | bwd_microstep: 335.08 | bwd_inner_microstep: 334.74 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:30:53,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 247.90 | bwd_microstep: 413.70 | bwd_inner_microstep: 413.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:30:53,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.46 | bwd_microstep: 258.07 | bwd_inner_microstep: 258.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:30:54,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.48 | bwd_microstep: 263.28 | bwd_inner_microstep: 263.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:30:54,484] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.53 | bwd_microstep: 259.85 | bwd_inner_microstep: 259.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:30:54,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.93 | bwd_microstep: 247.13 | bwd_inner_microstep: 247.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:30:55,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.40 | bwd_microstep: 247.39 | bwd_inner_microstep: 247.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:55,789] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.12 | bwd_microstep: 246.86 | bwd_inner_microstep: 246.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:56,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 251.26 | bwd_inner_microstep: 250.94 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.21 +[2024-12-31 18:30:56,670] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 244.36 | bwd_inner_microstep: 244.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:57,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.13 | bwd_microstep: 244.61 | bwd_inner_microstep: 244.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:30:57,542] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.57 | bwd_microstep: 252.22 | bwd_inner_microstep: 252.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:30:57,992] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.78 | bwd_microstep: 251.47 | bwd_inner_microstep: 251.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:30:58,418] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.82 | bwd_microstep: 244.05 | bwd_inner_microstep: 244.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:30:58,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 158.14 | bwd_microstep: 224.93 | bwd_inner_microstep: 224.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:30:59,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.64 | optimizer_step: 3.33 +[2024-12-31 18:30:59,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.47 | bwd_microstep: 257.82 | bwd_inner_microstep: 243.64 | bwd_allreduce_microstep: 14.06 | step_microstep: 11.38 +[2024-12-31 18:30:59,275] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2915.34 | bwd: 4242.20 | bwd_inner: 4226.94 | bwd_allreduce: 14.47 | step: 13.93 + 76%|███████▌ | 578/759 [1:20:54<22:22, 7.42s/it] {'loss': 1.2288, 'learning_rate': 2.83896652004215e-06, 'epoch': 0.76} + 76%|███████▌ | 578/759 [1:20:54<22:22, 7.42s/it][2024-12-31 18:30:59,842] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.76 | bwd_microstep: 335.45 | bwd_inner_microstep: 335.08 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.21 +[2024-12-31 18:31:00,378] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 205.00 | bwd_microstep: 310.78 | bwd_inner_microstep: 310.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:31:00,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.15 | bwd_microstep: 288.26 | bwd_inner_microstep: 288.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:31:01,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.53 | bwd_microstep: 264.85 | bwd_inner_microstep: 264.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:31:01,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.34 | bwd_microstep: 285.49 | bwd_inner_microstep: 285.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:31:02,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.74 | bwd_microstep: 249.77 | bwd_inner_microstep: 249.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:31:02,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 245.39 | bwd_inner_microstep: 245.06 | bwd_allreduce_microstep: 0.23 | step_microstep: 0.27 +[2024-12-31 18:31:03,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.16 | bwd_microstep: 245.68 | bwd_inner_microstep: 245.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:31:03,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.74 | bwd_microstep: 247.73 | bwd_inner_microstep: 247.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:31:04,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.33 | bwd_microstep: 283.67 | bwd_inner_microstep: 283.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:31:04,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.15 | bwd_microstep: 242.44 | bwd_inner_microstep: 242.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:04,933] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.56 | bwd_microstep: 253.06 | bwd_inner_microstep: 253.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:31:05,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.76 | bwd_microstep: 244.09 | bwd_inner_microstep: 244.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:31:05,795] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.59 | bwd_microstep: 243.29 | bwd_inner_microstep: 243.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:31:06,227] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.10 | bwd_microstep: 243.32 | bwd_inner_microstep: 243.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:31:06,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.56 | optimizer_gradients: 0.64 | optimizer_step: 3.46 +[2024-12-31 18:31:06,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.37 | bwd_microstep: 257.43 | bwd_inner_microstep: 243.56 | bwd_allreduce_microstep: 13.75 | step_microstep: 11.31 +[2024-12-31 18:31:06,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2862.31 | bwd: 4240.97 | bwd_inner: 4225.74 | bwd_allreduce: 14.31 | step: 14.56 + 76%|███████▋ | 579/759 [1:21:01<22:14, 7.41s/it] {'loss': 1.2179, 'learning_rate': 2.809238182967092e-06, 'epoch': 0.76} + 76%|███████▋ | 579/759 [1:21:01<22:14, 7.41s/it][2024-12-31 18:31:07,237] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 205.16 | bwd_microstep: 316.08 | bwd_inner_microstep: 315.73 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.21 +[2024-12-31 18:31:07,740] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.06 | bwd_microstep: 287.63 | bwd_inner_microstep: 287.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:31:08,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.74 | bwd_microstep: 281.04 | bwd_inner_microstep: 281.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:08,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.77 | bwd_microstep: 261.66 | bwd_inner_microstep: 261.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:31:09,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.50 | bwd_microstep: 250.16 | bwd_inner_microstep: 250.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.30 +[2024-12-31 18:31:09,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.39 | bwd_microstep: 247.96 | bwd_inner_microstep: 247.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:31:10,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.20 | bwd_microstep: 247.88 | bwd_inner_microstep: 247.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:31:10,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.14 | bwd_microstep: 246.49 | bwd_inner_microstep: 246.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:31:10,882] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.70 | bwd_microstep: 249.25 | bwd_inner_microstep: 249.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:31:11,316] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.34 | bwd_microstep: 246.06 | bwd_inner_microstep: 246.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:31:11,750] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.33 | bwd_microstep: 245.79 | bwd_inner_microstep: 245.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:31:12,187] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.90 | bwd_microstep: 249.42 | bwd_inner_microstep: 249.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:12,618] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.04 | bwd_microstep: 243.61 | bwd_inner_microstep: 243.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:13,050] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.92 | bwd_microstep: 244.70 | bwd_inner_microstep: 244.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:31:13,488] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.83 | bwd_microstep: 251.47 | bwd_inner_microstep: 251.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:13,954] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.86 | optimizer_gradients: 0.64 | optimizer_step: 3.30 +[2024-12-31 18:31:13,954] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.60 | bwd_microstep: 270.95 | bwd_inner_microstep: 244.10 | bwd_allreduce_microstep: 26.76 | step_microstep: 10.71 +[2024-12-31 18:31:13,956] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2817.04 | bwd: 4140.33 | bwd_inner: 4112.48 | bwd_allreduce: 27.04 | step: 13.88 + 76%|███████▋ | 580/759 [1:21:08<21:59, 7.37s/it] {'loss': 1.2299, 'learning_rate': 2.779640860089523e-06, 'epoch': 0.76} + 76%|███████▋ | 580/759 [1:21:08<21:59, 7.37s/it][2024-12-31 18:31:14,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.66 | bwd_microstep: 339.99 | bwd_inner_microstep: 339.65 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:31:15,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 237.72 | bwd_microstep: 391.06 | bwd_inner_microstep: 391.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:31:15,719] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.19 | bwd_microstep: 341.58 | bwd_inner_microstep: 341.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:16,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.33 | bwd_microstep: 265.63 | bwd_inner_microstep: 265.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.11 +[2024-12-31 18:31:16,638] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.17 | bwd_microstep: 258.20 | bwd_inner_microstep: 258.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:31:17,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.74 | bwd_microstep: 256.25 | bwd_inner_microstep: 256.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:31:17,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.56 | bwd_microstep: 249.87 | bwd_inner_microstep: 249.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:18,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.37 | bwd_microstep: 247.14 | bwd_inner_microstep: 247.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:31:18,433] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.72 | bwd_microstep: 245.28 | bwd_inner_microstep: 245.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:18,864] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.37 | bwd_microstep: 244.12 | bwd_inner_microstep: 244.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:31:19,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.94 | bwd_microstep: 243.69 | bwd_inner_microstep: 243.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:31:19,728] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.13 | bwd_microstep: 244.59 | bwd_inner_microstep: 244.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:31:20,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.97 | bwd_microstep: 244.18 | bwd_inner_microstep: 244.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:20,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.33 | bwd_microstep: 243.59 | bwd_inner_microstep: 243.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:31:21,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.64 | bwd_microstep: 243.54 | bwd_inner_microstep: 243.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:21,646] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.34 | optimizer_gradients: 0.64 | optimizer_step: 3.28 +[2024-12-31 18:31:21,647] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.76 | bwd_microstep: 419.23 | bwd_inner_microstep: 242.46 | bwd_allreduce_microstep: 176.73 | step_microstep: 13.87 +[2024-12-31 18:31:21,648] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2918.02 | bwd: 4478.03 | bwd_inner: 4300.51 | bwd_allreduce: 176.97 | step: 16.84 + 77%|███████▋ | 581/759 [1:21:16<22:09, 7.47s/it] {'loss': 1.1786, 'learning_rate': 2.7501750906665603e-06, 'epoch': 0.77} + 77%|███████▋ | 581/759 [1:21:16<22:09, 7.47s/it][2024-12-31 18:31:22,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 222.17 | bwd_microstep: 346.44 | bwd_inner_microstep: 345.90 | bwd_allreduce_microstep: 0.26 | step_microstep: 0.27 +[2024-12-31 18:31:22,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.68 | bwd_microstep: 287.62 | bwd_inner_microstep: 287.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:23,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.45 | bwd_microstep: 262.99 | bwd_inner_microstep: 262.80 | bwd_allreduce_microstep: 0.07 | step_microstep: 0.25 +[2024-12-31 18:31:23,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.46 | bwd_microstep: 261.83 | bwd_inner_microstep: 261.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:31:24,089] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.22 | bwd_microstep: 251.11 | bwd_inner_microstep: 251.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:31:24,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.25 | bwd_microstep: 248.75 | bwd_inner_microstep: 248.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:31:24,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.23 | bwd_microstep: 244.94 | bwd_inner_microstep: 244.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:25,400] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.32 | bwd_microstep: 247.23 | bwd_inner_microstep: 247.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:25,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 245.05 | bwd_inner_microstep: 245.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:26,265] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.86 | bwd_microstep: 245.02 | bwd_inner_microstep: 245.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:31:26,697] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.59 | bwd_microstep: 246.05 | bwd_inner_microstep: 246.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:31:27,128] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.85 | bwd_microstep: 243.48 | bwd_inner_microstep: 243.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:31:27,559] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.08 | bwd_microstep: 244.50 | bwd_inner_microstep: 244.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:31:28,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.82 | bwd_microstep: 252.38 | bwd_inner_microstep: 252.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:31:28,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.56 | bwd_microstep: 241.50 | bwd_inner_microstep: 241.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:31:28,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.67 | optimizer_gradients: 0.63 | optimizer_step: 3.30 +[2024-12-31 18:31:28,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.98 | bwd_microstep: 305.97 | bwd_inner_microstep: 241.88 | bwd_allreduce_microstep: 64.04 | step_microstep: 10.63 +[2024-12-31 18:31:28,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2817.04 | bwd: 4174.94 | bwd_inner: 4109.77 | bwd_allreduce: 64.52 | step: 13.65 + 77%|███████▋ | 582/759 [1:21:23<21:51, 7.41s/it] {'loss': 1.229, 'learning_rate': 2.7208414115584436e-06, 'epoch': 0.77} + 77%|███████▋ | 582/759 [1:21:23<21:51, 7.41s/it][2024-12-31 18:31:29,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 220.77 | bwd_microstep: 354.49 | bwd_inner_microstep: 354.07 | bwd_allreduce_microstep: 0.17 | step_microstep: 0.23 +[2024-12-31 18:31:30,024] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.46 | bwd_microstep: 289.99 | bwd_inner_microstep: 289.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:30,519] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 198.09 | bwd_microstep: 280.41 | bwd_inner_microstep: 280.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:30,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.62 | bwd_microstep: 266.22 | bwd_inner_microstep: 266.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:31:31,428] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.07 | bwd_microstep: 248.37 | bwd_inner_microstep: 248.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:31:31,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.79 | bwd_microstep: 247.82 | bwd_inner_microstep: 247.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:31:32,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.66 | bwd_microstep: 249.16 | bwd_inner_microstep: 249.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:32,737] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.85 | bwd_microstep: 245.74 | bwd_inner_microstep: 245.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:31:33,175] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.88 | bwd_microstep: 245.96 | bwd_inner_microstep: 245.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:33,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.76 | bwd_microstep: 245.42 | bwd_inner_microstep: 245.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:34,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.21 | bwd_microstep: 243.73 | bwd_inner_microstep: 243.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:31:34,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.22 | bwd_microstep: 242.81 | bwd_inner_microstep: 242.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:34,905] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.82 | bwd_microstep: 244.34 | bwd_inner_microstep: 244.16 | bwd_allreduce_microstep: 0.07 | step_microstep: 0.25 +[2024-12-31 18:31:35,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.27 | bwd_microstep: 243.27 | bwd_inner_microstep: 243.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:35,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.51 | bwd_microstep: 243.76 | bwd_inner_microstep: 243.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:31:36,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.59 | optimizer_gradients: 0.70 | optimizer_step: 3.51 +[2024-12-31 18:31:36,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.63 | bwd_microstep: 258.01 | bwd_inner_microstep: 244.30 | bwd_allreduce_microstep: 13.58 | step_microstep: 11.51 +[2024-12-31 18:31:36,261] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2848.99 | bwd: 4149.64 | bwd_inner: 4134.89 | bwd_allreduce: 13.99 | step: 14.60 + 77%|███████▋ | 583/759 [1:21:31<21:40, 7.39s/it] {'loss': 1.2126, 'learning_rate': 2.691640357218759e-06, 'epoch': 0.77} + 77%|███████▋ | 583/759 [1:21:31<21:40, 7.39s/it][2024-12-31 18:31:36,847] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 221.75 | bwd_microstep: 349.93 | bwd_inner_microstep: 349.59 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.22 +[2024-12-31 18:31:37,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.94 | bwd_microstep: 289.11 | bwd_inner_microstep: 289.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:31:37,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.55 | bwd_microstep: 367.27 | bwd_inner_microstep: 367.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:31:38,378] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.62 | bwd_microstep: 260.52 | bwd_inner_microstep: 260.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:38,835] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.04 | bwd_microstep: 261.69 | bwd_inner_microstep: 261.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:31:39,275] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.76 | bwd_microstep: 248.90 | bwd_inner_microstep: 248.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:31:39,718] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.47 | bwd_microstep: 246.72 | bwd_inner_microstep: 246.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:31:40,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.53 | bwd_microstep: 246.37 | bwd_inner_microstep: 246.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:31:40,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.42 | bwd_microstep: 244.12 | bwd_inner_microstep: 244.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:31:41,020] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.80 | bwd_microstep: 247.01 | bwd_inner_microstep: 246.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:31:41,451] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 244.45 | bwd_inner_microstep: 244.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:31:41,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.05 | bwd_microstep: 242.79 | bwd_inner_microstep: 242.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:31:42,312] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.88 | bwd_microstep: 243.26 | bwd_inner_microstep: 243.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:42,739] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.44 | bwd_microstep: 241.48 | bwd_inner_microstep: 241.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:31:43,171] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.50 | bwd_microstep: 243.56 | bwd_inner_microstep: 243.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:31:43,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.03 | optimizer_gradients: 0.67 | optimizer_step: 3.09 +[2024-12-31 18:31:43,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 231.55 | bwd_microstep: 446.33 | bwd_inner_microstep: 243.31 | bwd_allreduce_microstep: 202.98 | step_microstep: 11.98 +[2024-12-31 18:31:43,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2897.99 | bwd: 4423.67 | bwd_inner: 4219.64 | bwd_allreduce: 203.28 | step: 15.19 + 77%|███████▋ | 584/759 [1:21:38<21:44, 7.46s/it] {'loss': 1.2048, 'learning_rate': 2.662572459684699e-06, 'epoch': 0.77} + 77%|███████▋ | 584/759 [1:21:38<21:44, 7.46s/it][2024-12-31 18:31:44,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.82 | bwd_microstep: 347.34 | bwd_inner_microstep: 346.93 | bwd_allreduce_microstep: 0.16 | step_microstep: 0.26 +[2024-12-31 18:31:44,950] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.16 | bwd_microstep: 284.12 | bwd_inner_microstep: 284.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:45,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.91 | bwd_microstep: 264.63 | bwd_inner_microstep: 264.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:31:45,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 198.56 | bwd_microstep: 256.14 | bwd_inner_microstep: 256.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:31:46,329] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.28 | bwd_microstep: 256.85 | bwd_inner_microstep: 256.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:31:46,766] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.19 | bwd_microstep: 248.54 | bwd_inner_microstep: 248.17 | bwd_allreduce_microstep: 0.26 | step_microstep: 0.26 +[2024-12-31 18:31:47,204] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.44 | bwd_microstep: 246.94 | bwd_inner_microstep: 246.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:47,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.01 | bwd_microstep: 246.69 | bwd_inner_microstep: 246.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:31:48,073] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.22 | bwd_microstep: 247.25 | bwd_inner_microstep: 247.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:48,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.69 | bwd_microstep: 247.31 | bwd_inner_microstep: 247.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:48,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.73 | bwd_microstep: 249.33 | bwd_inner_microstep: 249.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:49,378] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 243.95 | bwd_inner_microstep: 243.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:49,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.94 | bwd_microstep: 244.07 | bwd_inner_microstep: 244.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:31:50,246] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.79 | bwd_microstep: 245.05 | bwd_inner_microstep: 245.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:31:50,675] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.48 | bwd_microstep: 242.98 | bwd_inner_microstep: 242.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:31:51,341] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.56 | optimizer_gradients: 0.61 | optimizer_step: 3.14 +[2024-12-31 18:31:51,341] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.77 | bwd_microstep: 472.02 | bwd_inner_microstep: 251.50 | bwd_allreduce_microstep: 220.47 | step_microstep: 11.43 +[2024-12-31 18:31:51,343] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2840.93 | bwd: 4343.28 | bwd_inner: 4121.61 | bwd_allreduce: 221.02 | step: 14.16 + 77%|███████▋ | 585/759 [1:21:46<21:37, 7.46s/it] {'loss': 1.2251, 'learning_rate': 2.6336382485673574e-06, 'epoch': 0.77} + 77%|███████▋ | 585/759 [1:21:46<21:37, 7.46s/it][2024-12-31 18:31:51,867] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 200.86 | bwd_microstep: 309.05 | bwd_inner_microstep: 308.70 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:31:52,483] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 228.15 | bwd_microstep: 370.53 | bwd_inner_microstep: 370.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:31:52,936] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.09 | bwd_microstep: 256.22 | bwd_inner_microstep: 256.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:31:53,387] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.92 | bwd_microstep: 256.73 | bwd_inner_microstep: 256.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:31:53,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.10 | bwd_microstep: 247.50 | bwd_inner_microstep: 247.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:54,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.67 | bwd_microstep: 248.15 | bwd_inner_microstep: 248.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:31:54,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.94 | bwd_microstep: 248.64 | bwd_inner_microstep: 248.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:31:55,137] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.06 | bwd_microstep: 247.99 | bwd_inner_microstep: 247.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:55,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 245.76 | bwd_inner_microstep: 245.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:31:55,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.51 | bwd_microstep: 244.01 | bwd_inner_microstep: 243.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:31:56,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.96 | bwd_microstep: 244.48 | bwd_inner_microstep: 244.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:56,868] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.60 | bwd_microstep: 244.01 | bwd_inner_microstep: 243.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:31:57,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.47 | bwd_microstep: 253.39 | bwd_inner_microstep: 253.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:31:57,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 158.10 | bwd_microstep: 225.94 | bwd_inner_microstep: 225.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:31:58,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.72 | bwd_microstep: 245.73 | bwd_inner_microstep: 245.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:31:58,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 11.80 | optimizer_gradients: 0.65 | optimizer_step: 31.07 +[2024-12-31 18:31:58,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.00 | bwd_microstep: 612.09 | bwd_inner_microstep: 249.18 | bwd_allreduce_microstep: 362.86 | step_microstep: 49.90 +[2024-12-31 18:31:58,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2819.19 | bwd: 4500.32 | bwd_inner: 4136.63 | bwd_allreduce: 363.12 | step: 52.83 + 77%|███████▋ | 586/759 [1:21:53<21:40, 7.52s/it] {'loss': 1.2086, 'learning_rate': 2.6048382510420954e-06, 'epoch': 0.77} + 77%|███████▋ | 586/759 [1:21:54<21:40, 7.52s/it][2024-12-31 18:31:59,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 204.71 | bwd_microstep: 314.25 | bwd_inner_microstep: 313.89 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:32:00,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.50 | bwd_microstep: 286.45 | bwd_inner_microstep: 286.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:32:00,551] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.71 | bwd_microstep: 267.95 | bwd_inner_microstep: 267.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:32:01,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.64 | bwd_microstep: 255.64 | bwd_inner_microstep: 255.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:01,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.29 | bwd_microstep: 249.41 | bwd_inner_microstep: 249.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:32:01,863] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.32 | bwd_microstep: 246.34 | bwd_inner_microstep: 246.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:02,301] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.02 | bwd_microstep: 248.80 | bwd_inner_microstep: 248.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:02,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.10 | bwd_microstep: 244.58 | bwd_inner_microstep: 244.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:32:03,168] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 246.61 | bwd_inner_microstep: 246.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:03,603] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.72 | bwd_microstep: 247.62 | bwd_inner_microstep: 247.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:04,066] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.69 | bwd_microstep: 276.22 | bwd_inner_microstep: 276.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:32:04,500] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.77 | bwd_microstep: 244.88 | bwd_inner_microstep: 244.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:32:04,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.39 | bwd_microstep: 243.08 | bwd_inner_microstep: 243.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:05,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.99 | bwd_microstep: 246.63 | bwd_inner_microstep: 246.38 | bwd_allreduce_microstep: 0.09 | step_microstep: 0.29 +[2024-12-31 18:32:05,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.34 | bwd_microstep: 242.46 | bwd_inner_microstep: 242.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:32:06,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.86 | optimizer_gradients: 0.56 | optimizer_step: 3.18 +[2024-12-31 18:32:06,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.15 | bwd_microstep: 296.91 | bwd_inner_microstep: 241.86 | bwd_allreduce_microstep: 55.01 | step_microstep: 10.62 +[2024-12-31 18:32:06,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2803.39 | bwd: 4157.99 | bwd_inner: 4101.81 | bwd_allreduce: 55.44 | step: 13.73 + 77%|███████▋ | 587/759 [1:22:01<21:21, 7.45s/it] {'loss': 1.2318, 'learning_rate': 2.576172991838933e-06, 'epoch': 0.77} + 77%|███████▋ | 587/759 [1:22:01<21:21, 7.45s/it][2024-12-31 18:32:06,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.69 | bwd_microstep: 350.98 | bwd_inner_microstep: 350.64 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.18 +[2024-12-31 18:32:07,371] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.63 | bwd_microstep: 287.30 | bwd_inner_microstep: 287.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:07,841] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.20 | bwd_microstep: 268.83 | bwd_inner_microstep: 268.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:08,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.33 | bwd_microstep: 257.96 | bwd_inner_microstep: 257.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:32:08,737] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.45 | bwd_microstep: 248.74 | bwd_inner_microstep: 248.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:09,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.33 | bwd_microstep: 248.08 | bwd_inner_microstep: 248.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:09,606] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 244.97 | bwd_inner_microstep: 244.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.09 +[2024-12-31 18:32:10,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 246.31 | bwd_inner_microstep: 246.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:32:10,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.64 | bwd_microstep: 243.56 | bwd_inner_microstep: 243.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:32:10,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 246.04 | bwd_inner_microstep: 246.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:11,342] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.34 | bwd_microstep: 247.17 | bwd_inner_microstep: 247.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:11,773] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.28 | bwd_microstep: 243.65 | bwd_inner_microstep: 243.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:32:12,199] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.00 | bwd_microstep: 241.43 | bwd_inner_microstep: 241.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:32:12,625] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.82 | bwd_microstep: 242.68 | bwd_inner_microstep: 242.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:13,043] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.97 | bwd_microstep: 241.48 | bwd_inner_microstep: 241.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:13,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.69 | optimizer_step: 9.31 +[2024-12-31 18:32:13,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.60 | bwd_microstep: 268.69 | bwd_inner_microstep: 242.58 | bwd_allreduce_microstep: 26.04 | step_microstep: 17.44 +[2024-12-31 18:32:13,506] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2811.60 | bwd: 4127.97 | bwd_inner: 4101.12 | bwd_allreduce: 26.30 | step: 20.10 + 77%|███████▋ | 588/759 [1:22:08<21:01, 7.38s/it] {'loss': 1.2496, 'learning_rate': 2.547642993232976e-06, 'epoch': 0.77} + 77%|███████▋ | 588/759 [1:22:08<21:01, 7.38s/it][2024-12-31 18:32:14,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 228.73 | bwd_microstep: 362.95 | bwd_inner_microstep: 362.84 | bwd_allreduce_microstep: 0.04 | step_microstep: 0.07 +[2024-12-31 18:32:14,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.74 | bwd_microstep: 285.64 | bwd_inner_microstep: 285.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:15,051] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.86 | bwd_microstep: 266.81 | bwd_inner_microstep: 266.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:15,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 186.87 | bwd_microstep: 263.26 | bwd_inner_microstep: 263.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:15,937] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.34 | bwd_microstep: 249.50 | bwd_inner_microstep: 249.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:16,378] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.21 | bwd_microstep: 257.35 | bwd_inner_microstep: 257.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:16,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.40 | bwd_microstep: 247.68 | bwd_inner_microstep: 247.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:17,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.33 | bwd_microstep: 247.44 | bwd_inner_microstep: 247.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:17,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.51 | bwd_microstep: 245.18 | bwd_inner_microstep: 245.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:18,089] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.28 | bwd_microstep: 257.39 | bwd_inner_microstep: 257.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:18,512] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.32 | bwd_microstep: 245.84 | bwd_inner_microstep: 245.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:18,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.35 | bwd_microstep: 250.88 | bwd_inner_microstep: 250.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:19,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 243.67 | bwd_inner_microstep: 243.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:19,780] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.81 | bwd_microstep: 243.92 | bwd_inner_microstep: 243.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:20,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.04 | bwd_microstep: 240.92 | bwd_inner_microstep: 240.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:20,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.69 | optimizer_gradients: 0.63 | optimizer_step: 3.30 +[2024-12-31 18:32:20,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.78 | bwd_microstep: 567.74 | bwd_inner_microstep: 241.83 | bwd_allreduce_microstep: 325.86 | step_microstep: 12.56 +[2024-12-31 18:32:20,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2852.47 | bwd: 4476.18 | bwd_inner: 4149.78 | bwd_allreduce: 326.01 | step: 13.19 + 78%|███████▊ | 589/759 [1:22:15<20:58, 7.40s/it] {'loss': 1.2442, 'learning_rate': 2.519248775034918e-06, 'epoch': 0.78} + 78%|███████▊ | 589/759 [1:22:15<20:58, 7.40s/it][2024-12-31 18:32:21,546] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 218.99 | bwd_microstep: 347.49 | bwd_inner_microstep: 347.15 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:32:22,074] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.00 | bwd_microstep: 308.84 | bwd_inner_microstep: 308.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:32:22,545] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.36 | bwd_microstep: 268.53 | bwd_inner_microstep: 268.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:32:22,998] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.67 | bwd_microstep: 257.83 | bwd_inner_microstep: 257.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:32:23,441] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.77 | bwd_microstep: 249.86 | bwd_inner_microstep: 249.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:23,882] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.94 | bwd_microstep: 250.92 | bwd_inner_microstep: 250.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:32:24,334] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.91 | bwd_microstep: 263.78 | bwd_inner_microstep: 263.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:32:24,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 245.60 | bwd_inner_microstep: 245.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:32:25,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.57 | bwd_microstep: 254.05 | bwd_inner_microstep: 254.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:32:25,634] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.75 | bwd_microstep: 245.66 | bwd_inner_microstep: 245.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:32:26,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.64 | bwd_microstep: 242.94 | bwd_inner_microstep: 242.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:32:26,493] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.72 | bwd_microstep: 242.68 | bwd_inner_microstep: 242.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:32:26,941] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.30 | bwd_microstep: 261.94 | bwd_inner_microstep: 261.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:32:27,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.94 | bwd_microstep: 241.69 | bwd_inner_microstep: 241.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:27,794] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.64 | bwd_microstep: 241.06 | bwd_inner_microstep: 241.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:28,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.56 | optimizer_gradients: 0.72 | optimizer_step: 3.14 +[2024-12-31 18:32:28,400] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.73 | bwd_microstep: 408.27 | bwd_inner_microstep: 253.80 | bwd_allreduce_microstep: 154.43 | step_microstep: 10.30 +[2024-12-31 18:32:28,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2822.85 | bwd: 4331.23 | bwd_inner: 4175.97 | bwd_allreduce: 154.67 | step: 13.18 + 78%|███████▊ | 590/759 [1:22:23<20:52, 7.41s/it] {'loss': 1.2059, 'learning_rate': 2.490990854581563e-06, 'epoch': 0.78} + 78%|███████▊ | 590/759 [1:22:23<20:52, 7.41s/it][2024-12-31 18:32:29,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 224.80 | bwd_microstep: 365.83 | bwd_inner_microstep: 365.49 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:32:29,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 264.96 | bwd_microstep: 444.29 | bwd_inner_microstep: 444.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:32:30,206] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 194.53 | bwd_microstep: 262.55 | bwd_inner_microstep: 262.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:30,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.20 | bwd_microstep: 249.13 | bwd_inner_microstep: 249.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:31,081] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.23 | bwd_microstep: 248.55 | bwd_inner_microstep: 248.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:31,519] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.81 | bwd_microstep: 249.70 | bwd_inner_microstep: 249.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:32:31,949] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 251.48 | bwd_inner_microstep: 251.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:32,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.61 | bwd_microstep: 272.63 | bwd_inner_microstep: 272.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:32,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.45 | bwd_microstep: 245.69 | bwd_inner_microstep: 245.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:33,281] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.69 | bwd_microstep: 247.90 | bwd_inner_microstep: 247.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:32:33,714] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.35 | bwd_microstep: 249.01 | bwd_inner_microstep: 248.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:32:34,147] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.89 | bwd_microstep: 243.90 | bwd_inner_microstep: 243.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:34,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.38 | bwd_microstep: 243.89 | bwd_inner_microstep: 243.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:32:35,010] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 243.98 | bwd_inner_microstep: 243.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:32:35,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.35 | bwd_microstep: 241.19 | bwd_inner_microstep: 241.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:35,888] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.61 | optimizer_gradients: 0.69 | optimizer_step: 3.51 +[2024-12-31 18:32:35,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.12 | bwd_microstep: 255.35 | bwd_inner_microstep: 241.62 | bwd_allreduce_microstep: 13.60 | step_microstep: 11.25 +[2024-12-31 18:32:35,892] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2908.60 | bwd: 4315.15 | bwd_inner: 4300.68 | bwd_allreduce: 13.89 | step: 13.83 + 78%|███████▊ | 591/759 [1:22:30<20:49, 7.44s/it] {'loss': 1.2276, 'learning_rate': 2.4628697467263916e-06, 'epoch': 0.78} + 78%|███████▊ | 591/759 [1:22:30<20:49, 7.44s/it][2024-12-31 18:32:36,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 283.16 | bwd_microstep: 491.58 | bwd_inner_microstep: 491.21 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.25 +[2024-12-31 18:32:37,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.58 | bwd_microstep: 291.70 | bwd_inner_microstep: 291.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:37,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.10 | bwd_microstep: 260.97 | bwd_inner_microstep: 260.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:32:38,102] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.96 | bwd_microstep: 262.80 | bwd_inner_microstep: 262.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:32:38,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.91 | bwd_microstep: 272.11 | bwd_inner_microstep: 272.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:32:39,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.98 | bwd_microstep: 248.22 | bwd_inner_microstep: 248.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:32:39,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.22 | bwd_microstep: 250.57 | bwd_inner_microstep: 250.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:32:39,888] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.58 | bwd_microstep: 252.24 | bwd_inner_microstep: 251.90 | bwd_allreduce_microstep: 0.24 | step_microstep: 0.28 +[2024-12-31 18:32:40,326] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.68 | bwd_microstep: 247.58 | bwd_inner_microstep: 247.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:40,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.81 | bwd_microstep: 244.14 | bwd_inner_microstep: 244.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:32:41,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.26 | bwd_microstep: 244.28 | bwd_inner_microstep: 244.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:32:41,635] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.67 | bwd_microstep: 245.26 | bwd_inner_microstep: 245.14 | bwd_allreduce_microstep: 0.02 | step_microstep: 0.23 +[2024-12-31 18:32:42,067] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.17 | bwd_microstep: 243.78 | bwd_inner_microstep: 243.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:42,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.25 | bwd_microstep: 268.06 | bwd_inner_microstep: 268.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:42,951] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.05 | bwd_microstep: 242.93 | bwd_inner_microstep: 242.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:43,406] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.58 | optimizer_gradients: 0.81 | optimizer_step: 3.42 +[2024-12-31 18:32:43,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.76 | bwd_microstep: 258.10 | bwd_inner_microstep: 244.47 | bwd_allreduce_microstep: 13.54 | step_microstep: 11.23 +[2024-12-31 18:32:43,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2896.56 | bwd: 4324.48 | bwd_inner: 4309.61 | bwd_allreduce: 14.10 | step: 14.42 + 78%|███████▊ | 592/759 [1:22:38<20:45, 7.46s/it] {'loss': 1.2529, 'learning_rate': 2.4348859638301857e-06, 'epoch': 0.78} + 78%|███████▊ | 592/759 [1:22:38<20:45, 7.46s/it][2024-12-31 18:32:43,979] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.91 | bwd_microstep: 336.79 | bwd_inner_microstep: 336.43 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:32:44,506] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 207.41 | bwd_microstep: 302.90 | bwd_inner_microstep: 302.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:44,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.85 | bwd_microstep: 281.55 | bwd_inner_microstep: 281.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:45,484] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.49 | bwd_microstep: 292.00 | bwd_inner_microstep: 291.82 | bwd_allreduce_microstep: 0.07 | step_microstep: 0.24 +[2024-12-31 18:32:45,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.79 | bwd_microstep: 250.23 | bwd_inner_microstep: 250.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:32:46,375] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.17 | bwd_microstep: 256.31 | bwd_inner_microstep: 256.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:46,809] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.31 | bwd_microstep: 246.48 | bwd_inner_microstep: 246.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:32:47,243] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.40 | bwd_microstep: 244.92 | bwd_inner_microstep: 244.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:32:47,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.83 | bwd_microstep: 246.37 | bwd_inner_microstep: 246.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:32:48,112] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 244.65 | bwd_inner_microstep: 244.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:32:48,543] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.22 | bwd_microstep: 242.71 | bwd_inner_microstep: 242.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:48,973] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.23 | bwd_microstep: 242.69 | bwd_inner_microstep: 242.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:32:49,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.54 | bwd_microstep: 241.14 | bwd_inner_microstep: 241.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:49,842] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.56 | bwd_microstep: 243.84 | bwd_inner_microstep: 243.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:50,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.17 | bwd_microstep: 244.07 | bwd_inner_microstep: 244.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:32:50,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.83 | optimizer_step: 3.48 +[2024-12-31 18:32:50,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.00 | bwd_microstep: 256.50 | bwd_inner_microstep: 242.82 | bwd_allreduce_microstep: 13.56 | step_microstep: 11.36 +[2024-12-31 18:32:50,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2848.58 | bwd: 4173.39 | bwd_inner: 4158.51 | bwd_allreduce: 13.96 | step: 14.48 + 78%|███████▊ | 593/759 [1:22:45<20:31, 7.42s/it] {'loss': 1.209, 'learning_rate': 2.4070400157517036e-06, 'epoch': 0.78} + 78%|███████▊ | 593/759 [1:22:45<20:31, 7.42s/it][2024-12-31 18:32:51,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.35 | bwd_microstep: 339.06 | bwd_inner_microstep: 338.94 | bwd_allreduce_microstep: 0.05 | step_microstep: 0.07 +[2024-12-31 18:32:51,873] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.38 | bwd_microstep: 343.13 | bwd_inner_microstep: 343.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:52,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.97 | bwd_microstep: 271.34 | bwd_inner_microstep: 271.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:52,779] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.18 | bwd_microstep: 261.79 | bwd_inner_microstep: 261.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:32:53,218] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.78 | bwd_microstep: 255.72 | bwd_inner_microstep: 255.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:53,643] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.68 | bwd_microstep: 247.46 | bwd_inner_microstep: 247.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:54,069] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.60 | bwd_microstep: 247.51 | bwd_inner_microstep: 247.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:54,493] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.77 | bwd_microstep: 246.26 | bwd_inner_microstep: 246.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:54,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.27 | bwd_microstep: 245.12 | bwd_inner_microstep: 245.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:55,336] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.27 | bwd_microstep: 244.68 | bwd_inner_microstep: 244.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:55,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.30 | bwd_microstep: 243.49 | bwd_inner_microstep: 243.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:56,175] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.72 | bwd_microstep: 242.82 | bwd_inner_microstep: 242.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:56,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.67 | bwd_microstep: 241.02 | bwd_inner_microstep: 240.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:57,029] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.46 | bwd_microstep: 242.96 | bwd_inner_microstep: 242.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:57,448] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.43 | bwd_microstep: 243.08 | bwd_inner_microstep: 243.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:58,142] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.05 | optimizer_gradients: 0.71 | optimizer_step: 3.13 +[2024-12-31 18:32:58,143] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.35 | bwd_microstep: 508.02 | bwd_inner_microstep: 243.83 | bwd_allreduce_microstep: 264.14 | step_microstep: 11.14 +[2024-12-31 18:32:58,144] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2852.99 | bwd: 4423.47 | bwd_inner: 4158.78 | bwd_allreduce: 264.30 | step: 11.79 + 78%|███████▊ | 594/759 [1:22:53<20:23, 7.42s/it] {'loss': 1.2104, 'learning_rate': 2.3793324098383796e-06, 'epoch': 0.78} + 78%|███████▊ | 594/759 [1:22:53<20:23, 7.42s/it][2024-12-31 18:32:58,812] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 247.22 | bwd_microstep: 407.29 | bwd_inner_microstep: 407.18 | bwd_allreduce_microstep: 0.04 | step_microstep: 0.07 +[2024-12-31 18:32:59,318] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.24 | bwd_microstep: 298.54 | bwd_inner_microstep: 298.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:32:59,815] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 199.42 | bwd_microstep: 289.46 | bwd_inner_microstep: 289.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:33:00,289] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.11 | bwd_microstep: 276.35 | bwd_inner_microstep: 276.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.28 +[2024-12-31 18:33:00,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.30 | bwd_microstep: 258.96 | bwd_inner_microstep: 258.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:33:01,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.14 | bwd_microstep: 248.17 | bwd_inner_microstep: 248.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.10 +[2024-12-31 18:33:01,619] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.62 | bwd_microstep: 247.34 | bwd_inner_microstep: 247.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:33:02,053] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.45 | bwd_microstep: 244.55 | bwd_inner_microstep: 244.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:33:02,485] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.71 | bwd_microstep: 245.31 | bwd_inner_microstep: 245.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:33:02,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.13 | bwd_microstep: 253.14 | bwd_inner_microstep: 253.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:33:03,350] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.43 | bwd_microstep: 243.70 | bwd_inner_microstep: 243.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:33:03,790] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.18 | bwd_microstep: 252.37 | bwd_inner_microstep: 252.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:33:04,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 246.70 | bwd_inner_microstep: 246.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:33:04,665] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.96 | bwd_microstep: 251.96 | bwd_inner_microstep: 251.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:33:05,092] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.68 | bwd_microstep: 241.00 | bwd_inner_microstep: 240.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:33:05,545] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.58 | optimizer_gradients: 5.60 | optimizer_step: 3.34 +[2024-12-31 18:33:05,546] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 166.47 | bwd_microstep: 256.02 | bwd_inner_microstep: 242.17 | bwd_allreduce_microstep: 13.75 | step_microstep: 15.59 +[2024-12-31 18:33:05,548] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2877.07 | bwd: 4260.99 | bwd_inner: 4246.46 | bwd_allreduce: 13.94 | step: 18.31 + 78%|███████▊ | 595/759 [1:23:00<20:15, 7.41s/it] {'loss': 1.2115, 'learning_rate': 2.351763650917074e-06, 'epoch': 0.78} + 78%|███████▊ | 595/759 [1:23:00<20:15, 7.41s/it][2024-12-31 18:33:06,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.49 | bwd_microstep: 310.22 | bwd_inner_microstep: 309.86 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.19 +[2024-12-31 18:33:06,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.85 | bwd_microstep: 291.24 | bwd_inner_microstep: 291.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:33:07,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.97 | bwd_microstep: 281.64 | bwd_inner_microstep: 281.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:33:07,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.75 | bwd_microstep: 267.35 | bwd_inner_microstep: 267.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:33:07,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.74 | bwd_microstep: 258.03 | bwd_inner_microstep: 258.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:33:08,430] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.59 | bwd_microstep: 250.68 | bwd_inner_microstep: 250.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:33:08,867] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.17 | bwd_microstep: 246.95 | bwd_inner_microstep: 246.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:33:09,299] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.22 | bwd_microstep: 245.34 | bwd_inner_microstep: 245.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:33:09,739] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.85 | bwd_microstep: 246.16 | bwd_inner_microstep: 246.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:33:10,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 219.47 | bwd_microstep: 244.97 | bwd_inner_microstep: 244.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:33:10,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.81 | bwd_microstep: 247.03 | bwd_inner_microstep: 246.66 | bwd_allreduce_microstep: 0.23 | step_microstep: 0.27 +[2024-12-31 18:33:11,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 243.08 | bwd_inner_microstep: 243.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:33:11,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.14 | bwd_microstep: 252.21 | bwd_inner_microstep: 252.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.33 +[2024-12-31 18:33:11,956] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.52 | bwd_microstep: 244.82 | bwd_inner_microstep: 244.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:33:12,385] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.13 | bwd_microstep: 242.85 | bwd_inner_microstep: 242.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:33:13,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.68 | optimizer_gradients: 0.56 | optimizer_step: 11.56 +[2024-12-31 18:33:13,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.66 | bwd_microstep: 438.04 | bwd_inner_microstep: 244.96 | bwd_allreduce_microstep: 193.03 | step_microstep: 24.21 +[2024-12-31 18:33:13,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2882.92 | bwd: 4310.80 | bwd_inner: 4116.38 | bwd_allreduce: 193.58 | step: 27.32 + 79%|███████▊ | 596/759 [1:23:08<20:12, 7.44s/it] {'loss': 1.1976, 'learning_rate': 2.3243342412848923e-06, 'epoch': 0.79} + 79%|███████▊ | 596/759 [1:23:08<20:12, 7.44s/it][2024-12-31 18:33:13,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 198.28 | bwd_microstep: 303.08 | bwd_inner_microstep: 302.73 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:33:14,057] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.99 | bwd_microstep: 280.60 | bwd_inner_microstep: 280.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:33:14,510] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.16 | bwd_microstep: 263.55 | bwd_inner_microstep: 263.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:33:14,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.83 | bwd_microstep: 257.82 | bwd_inner_microstep: 257.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:33:15,412] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.56 | bwd_microstep: 255.39 | bwd_inner_microstep: 255.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:33:15,862] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.22 | bwd_microstep: 260.01 | bwd_inner_microstep: 259.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:33:16,300] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.56 | bwd_microstep: 247.83 | bwd_inner_microstep: 247.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:33:16,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 245.58 | bwd_inner_microstep: 245.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:33:17,177] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.01 | bwd_microstep: 255.94 | bwd_inner_microstep: 255.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:33:17,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.32 | bwd_microstep: 243.42 | bwd_inner_microstep: 243.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:33:18,042] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.52 | bwd_microstep: 243.31 | bwd_inner_microstep: 243.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:33:18,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.01 | bwd_microstep: 243.29 | bwd_inner_microstep: 243.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:33:18,904] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 245.05 | bwd_inner_microstep: 245.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:33:19,394] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 231.70 | bwd_microstep: 243.89 | bwd_inner_microstep: 243.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:33:19,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.36 | bwd_microstep: 267.31 | bwd_inner_microstep: 267.19 | bwd_allreduce_microstep: 0.02 | step_microstep: 0.22 +[2024-12-31 18:33:20,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.63 | optimizer_step: 3.35 +[2024-12-31 18:33:20,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 257.51 | bwd_inner_microstep: 243.59 | bwd_allreduce_microstep: 13.81 | step_microstep: 10.83 +[2024-12-31 18:33:20,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2854.29 | bwd: 4113.73 | bwd_inner: 4098.88 | bwd_allreduce: 14.14 | step: 13.55 + 79%|███████▊ | 597/759 [1:23:15<19:56, 7.39s/it] {'loss': 1.225, 'learning_rate': 2.2970446807000237e-06, 'epoch': 0.79} + 79%|███████▊ | 597/759 [1:23:15<19:56, 7.39s/it][2024-12-31 18:33:20,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 218.55 | bwd_microstep: 342.75 | bwd_inner_microstep: 342.40 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.21 +[2024-12-31 18:33:21,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.76 | bwd_microstep: 313.79 | bwd_inner_microstep: 313.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 18:33:21,895] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.65 | bwd_microstep: 281.20 | bwd_inner_microstep: 281.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:33:22,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 185.62 | bwd_microstep: 264.85 | bwd_inner_microstep: 264.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:33:22,813] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.74 | bwd_microstep: 256.63 | bwd_inner_microstep: 256.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:33:23,255] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.34 | bwd_microstep: 250.16 | bwd_inner_microstep: 250.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:33:23,697] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.16 | bwd_microstep: 247.44 | bwd_inner_microstep: 247.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:33:24,132] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.77 | bwd_microstep: 246.00 | bwd_inner_microstep: 245.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:33:24,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.50 | bwd_microstep: 251.32 | bwd_inner_microstep: 251.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:33:25,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.56 | bwd_microstep: 245.64 | bwd_inner_microstep: 245.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:33:25,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 243.40 | bwd_inner_microstep: 243.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:33:25,867] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.44 | bwd_microstep: 243.24 | bwd_inner_microstep: 243.07 | bwd_allreduce_microstep: 0.02 | step_microstep: 0.26 +[2024-12-31 18:33:26,310] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.88 | bwd_microstep: 243.04 | bwd_inner_microstep: 243.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.29 +[2024-12-31 18:33:26,744] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.73 | bwd_microstep: 243.77 | bwd_inner_microstep: 243.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:33:27,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.89 | bwd_microstep: 249.59 | bwd_inner_microstep: 249.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:33:28,064] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 11.25 | optimizer_gradients: 11.50 | optimizer_step: 4.93 +[2024-12-31 18:33:28,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.85 | bwd_microstep: 666.91 | bwd_inner_microstep: 248.54 | bwd_allreduce_microstep: 418.32 | step_microstep: 30.29 +[2024-12-31 18:33:28,066] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2854.58 | bwd: 4589.87 | bwd_inner: 4170.50 | bwd_allreduce: 418.65 | step: 33.47 + 79%|███████▉ | 598/759 [1:23:23<20:06, 7.50s/it] {'loss': 1.226, 'learning_rate': 2.26989546637263e-06, 'epoch': 0.79} + 79%|███████▉ | 598/759 [1:23:23<20:06, 7.50s/it][2024-12-31 18:33:28,582] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 199.28 | bwd_microstep: 303.88 | bwd_inner_microstep: 303.73 | bwd_allreduce_microstep: 0.06 | step_microstep: 0.09 +[2024-12-31 18:33:29,043] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.92 | bwd_microstep: 269.70 | bwd_inner_microstep: 269.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:33:29,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.86 | bwd_microstep: 269.59 | bwd_inner_microstep: 269.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:33:29,956] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.49 | bwd_microstep: 261.67 | bwd_inner_microstep: 261.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:33:30,406] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.64 | bwd_microstep: 255.82 | bwd_inner_microstep: 255.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:33:30,851] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.85 | bwd_microstep: 249.86 | bwd_inner_microstep: 249.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:33:31,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.80 | bwd_microstep: 268.50 | bwd_inner_microstep: 268.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:33:31,744] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 246.02 | bwd_inner_microstep: 245.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:33:32,177] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.94 | bwd_microstep: 245.16 | bwd_inner_microstep: 245.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:33:32,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.26 | bwd_microstep: 243.64 | bwd_inner_microstep: 243.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:33:33,043] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.85 | bwd_microstep: 244.22 | bwd_inner_microstep: 244.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:33:33,485] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 185.57 | bwd_microstep: 243.72 | bwd_inner_microstep: 243.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:33:33,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.94 | bwd_microstep: 242.88 | bwd_inner_microstep: 242.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:33:34,370] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.98 | bwd_microstep: 253.34 | bwd_inner_microstep: 253.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:33:34,807] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.08 | bwd_microstep: 242.39 | bwd_inner_microstep: 242.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:33:35,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.25 | optimizer_gradients: 0.66 | optimizer_step: 3.09 +[2024-12-31 18:33:35,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.65 | bwd_microstep: 353.59 | bwd_inner_microstep: 243.99 | bwd_allreduce_microstep: 109.55 | step_microstep: 13.24 +[2024-12-31 18:33:35,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2818.27 | bwd: 4194.11 | bwd_inner: 4083.78 | bwd_allreduce: 109.74 | step: 15.84 + 79%|███████▉ | 599/759 [1:23:30<19:49, 7.44s/it] {'loss': 1.2215, 'learning_rate': 2.2428870929558012e-06, 'epoch': 0.79} + 79%|███████▉ | 599/759 [1:23:30<19:49, 7.44s/it][2024-12-31 18:33:35,958] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 222.81 | bwd_microstep: 359.48 | bwd_inner_microstep: 359.14 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:33:36,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.95 | bwd_microstep: 297.34 | bwd_inner_microstep: 297.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:33:36,970] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.55 | bwd_microstep: 300.67 | bwd_inner_microstep: 300.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:33:37,419] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.49 | bwd_microstep: 255.60 | bwd_inner_microstep: 255.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:33:37,859] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.60 | bwd_microstep: 249.99 | bwd_inner_microstep: 249.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:33:38,298] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.86 | bwd_microstep: 249.89 | bwd_inner_microstep: 249.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:33:38,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.79 | bwd_microstep: 245.84 | bwd_inner_microstep: 245.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:33:39,164] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.89 | bwd_microstep: 244.83 | bwd_inner_microstep: 244.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:33:39,597] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.69 | bwd_microstep: 243.44 | bwd_inner_microstep: 243.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:33:40,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.38 | bwd_microstep: 253.93 | bwd_inner_microstep: 253.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:33:40,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.02 | bwd_microstep: 243.87 | bwd_inner_microstep: 243.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:33:40,905] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 242.82 | bwd_inner_microstep: 242.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:33:41,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.74 | bwd_microstep: 241.36 | bwd_inner_microstep: 241.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:33:41,770] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.40 | bwd_microstep: 251.34 | bwd_inner_microstep: 251.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:33:42,198] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.74 | bwd_microstep: 242.50 | bwd_inner_microstep: 242.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:33:42,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.87 | optimizer_gradients: 0.77 | optimizer_step: 3.37 +[2024-12-31 18:33:42,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.66 | bwd_microstep: 256.39 | bwd_inner_microstep: 241.64 | bwd_allreduce_microstep: 14.62 | step_microstep: 13.65 +[2024-12-31 18:33:42,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2812.94 | bwd: 4179.38 | bwd_inner: 4163.88 | bwd_allreduce: 14.91 | step: 16.69 + 79%|███████▉ | 600/759 [1:23:37<19:35, 7.39s/it] {'loss': 1.2065, 'learning_rate': 2.2160200525365326e-06, 'epoch': 0.79} + 79%|███████▉ | 600/759 [1:23:37<19:35, 7.39s/it][INFO|trainer.py:2936] 2024-12-31 18:33:43,955 >> Saving model checkpoint to work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-600 +[INFO|configuration_utils.py:473] 2024-12-31 18:33:43,974 >> Configuration saved in work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-600/config.json +[INFO|configuration_utils.py:594] 2024-12-31 18:33:43,987 >> Configuration saved in work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-600/generation_config.json +[INFO|modeling_utils.py:2493] 2024-12-31 18:35:55,507 >> Model weights saved in work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-600/model.safetensors +[INFO|tokenization_utils_base.py:2433] 2024-12-31 18:35:55,751 >> tokenizer config file saved in work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-600/tokenizer_config.json +[INFO|tokenization_utils_base.py:2442] 2024-12-31 18:35:55,853 >> Special tokens file saved in work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-600/special_tokens_map.json +[INFO|tokenization_utils_base.py:2493] 2024-12-31 18:35:55,858 >> added tokens file saved in work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-600/added_tokens.json +[2024-12-31 18:36:14,316] [INFO] [logging.py:96:log_dist] [Rank 0] [Torch] Checkpoint global_step600 is about to be saved! +[2024-12-31 18:36:14,393] [INFO] [logging.py:96:log_dist] [Rank 0] Saving model checkpoint: work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-600/global_step600/mp_rank_00_model_states.pt +[2024-12-31 18:36:14,393] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-600/global_step600/mp_rank_00_model_states.pt... +[2024-12-31 18:36:40,501] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-600/global_step600/mp_rank_00_model_states.pt. +[2024-12-31 18:36:40,513] [INFO] [torch_checkpoint_engine.py:21:save] [Torch] Saving work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-600/global_step600/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... +[2024-12-31 18:36:42,074] [INFO] [torch_checkpoint_engine.py:23:save] [Torch] Saved work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-600/global_step600/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. +[2024-12-31 18:36:42,113] [INFO] [engine.py:3488:_save_zero_checkpoint] zero checkpoint saved work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tmp-checkpoint-600/global_step600/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +[2024-12-31 18:36:42,113] [INFO] [torch_checkpoint_engine.py:33:commit] [Torch] Checkpoint global_step600 is ready now! +[INFO|trainer.py:3028] 2024-12-31 18:36:42,389 >> Deleting older checkpoint [work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/checkpoint-400] due to args.save_total_limit +[2024-12-31 18:36:43,276] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 305.01 | bwd_microstep: 363.04 | bwd_inner_microstep: 362.69 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.22 +[2024-12-31 18:36:43,954] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 246.43 | bwd_microstep: 412.41 | bwd_inner_microstep: 412.14 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.36 +[2024-12-31 18:36:44,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.60 | bwd_microstep: 262.05 | bwd_inner_microstep: 262.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:36:44,862] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.50 | bwd_microstep: 254.32 | bwd_inner_microstep: 254.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:36:45,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.38 | bwd_microstep: 256.66 | bwd_inner_microstep: 256.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:36:45,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.32 | bwd_microstep: 249.44 | bwd_inner_microstep: 249.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:36:46,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.20 | bwd_microstep: 245.24 | bwd_inner_microstep: 245.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:36:46,617] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.02 | bwd_microstep: 247.04 | bwd_inner_microstep: 247.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:36:58,483] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.58 | bwd_microstep: 244.80 | bwd_inner_microstep: 244.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:37:11,151] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.15 | bwd_microstep: 242.86 | bwd_inner_microstep: 242.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:37:11,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.77 | bwd_microstep: 253.94 | bwd_inner_microstep: 253.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:37:19,567] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 195.68 | bwd_microstep: 242.82 | bwd_inner_microstep: 242.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:37:19,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.32 | bwd_microstep: 243.01 | bwd_inner_microstep: 242.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:37:43,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 200.88 | bwd_microstep: 243.00 | bwd_inner_microstep: 242.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:37:43,728] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 166.65 | bwd_microstep: 240.46 | bwd_inner_microstep: 240.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:37:53,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.12 | optimizer_gradients: 0.84 | optimizer_step: 3.78 +[2024-12-31 18:37:53,653] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 194.66 | bwd_microstep: 267.66 | bwd_inner_microstep: 253.69 | bwd_allreduce_microstep: 13.88 | step_microstep: 14.29 +[2024-12-31 18:37:53,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3070.42 | bwd: 4268.86 | bwd_inner: 4253.85 | bwd_allreduce: 14.34 | step: 17.30 + 79%|███████▉ | 601/759 [1:27:48<3:31:55, 80.48s/it] {'loss': 1.244, 'learning_rate': 2.1892948346267583e-06, 'epoch': 0.79} + 79%|███████▉ | 601/759 [1:27:48<3:31:55, 80.48s/it][2024-12-31 18:37:54,252] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 222.80 | bwd_microstep: 345.64 | bwd_inner_microstep: 345.28 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.23 +[2024-12-31 18:37:54,722] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.41 | bwd_microstep: 265.27 | bwd_inner_microstep: 265.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:37:55,189] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.71 | bwd_microstep: 267.29 | bwd_inner_microstep: 267.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:37:55,638] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.39 | bwd_microstep: 255.31 | bwd_inner_microstep: 255.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:37:56,078] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 251.21 | bwd_inner_microstep: 251.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:37:56,512] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.39 | bwd_microstep: 247.14 | bwd_inner_microstep: 247.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:37:56,952] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.29 | bwd_microstep: 251.73 | bwd_inner_microstep: 251.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:37:57,390] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.72 | bwd_microstep: 250.07 | bwd_inner_microstep: 249.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:37:57,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.62 | bwd_microstep: 245.16 | bwd_inner_microstep: 245.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:37:58,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 243.23 | bwd_inner_microstep: 243.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:37:58,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.07 | bwd_microstep: 243.34 | bwd_inner_microstep: 243.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:37:59,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.69 | bwd_microstep: 256.61 | bwd_inner_microstep: 256.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:37:59,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.92 | bwd_microstep: 243.02 | bwd_inner_microstep: 242.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:38:00,011] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.58 | bwd_microstep: 246.83 | bwd_inner_microstep: 246.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:38:00,438] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.74 | bwd_microstep: 241.28 | bwd_inner_microstep: 241.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:38:01,208] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.32 | optimizer_gradients: 0.57 | optimizer_step: 3.13 +[2024-12-31 18:38:01,208] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.71 | bwd_microstep: 572.39 | bwd_inner_microstep: 245.08 | bwd_allreduce_microstep: 327.26 | step_microstep: 11.35 +[2024-12-31 18:38:01,210] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2803.97 | bwd: 4425.66 | bwd_inner: 4097.43 | bwd_allreduce: 327.55 | step: 14.51 + 79%|███████▉ | 602/759 [1:27:56<2:33:20, 58.60s/it] {'loss': 1.2354, 'learning_rate': 2.1627119261544348e-06, 'epoch': 0.79} + 79%|███████▉ | 602/759 [1:27:56<2:33:20, 58.60s/it][2024-12-31 18:38:01,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.00 | bwd_microstep: 308.30 | bwd_inner_microstep: 307.95 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:38:02,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.12 | bwd_microstep: 287.08 | bwd_inner_microstep: 286.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:38:02,716] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.38 | bwd_microstep: 265.74 | bwd_inner_microstep: 265.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:03,180] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.46 | bwd_microstep: 266.79 | bwd_inner_microstep: 266.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:38:03,618] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.34 | bwd_microstep: 249.81 | bwd_inner_microstep: 249.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:38:04,075] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.81 | bwd_microstep: 266.96 | bwd_inner_microstep: 266.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:04,510] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.56 | bwd_microstep: 247.28 | bwd_inner_microstep: 247.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:04,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.05 | bwd_microstep: 247.99 | bwd_inner_microstep: 247.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:38:05,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.64 | bwd_microstep: 244.19 | bwd_inner_microstep: 244.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:38:05,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.60 | bwd_microstep: 243.41 | bwd_inner_microstep: 243.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:38:06,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.67 | bwd_microstep: 243.20 | bwd_inner_microstep: 243.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:38:06,661] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.98 | bwd_microstep: 243.53 | bwd_inner_microstep: 243.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:07,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.33 | bwd_microstep: 241.18 | bwd_inner_microstep: 241.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:07,514] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.72 | bwd_microstep: 242.13 | bwd_inner_microstep: 242.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:38:07,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.98 | bwd_microstep: 243.37 | bwd_inner_microstep: 243.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:38:08,468] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.82 | optimizer_gradients: 0.57 | optimizer_step: 3.12 +[2024-12-31 18:38:08,469] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.41 | bwd_microstep: 321.89 | bwd_inner_microstep: 241.40 | bwd_allreduce_microstep: 80.45 | step_microstep: 19.25 +[2024-12-31 18:38:08,470] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2790.50 | bwd: 4162.98 | bwd_inner: 4081.58 | bwd_allreduce: 80.74 | step: 22.07 + 79%|███████▉ | 603/759 [1:28:03<1:52:18, 43.20s/it] {'loss': 1.257, 'learning_rate': 2.1362718114546777e-06, 'epoch': 0.79} + 79%|███████▉ | 603/759 [1:28:03<1:52:18, 43.20s/it][2024-12-31 18:38:08,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.57 | bwd_microstep: 310.60 | bwd_inner_microstep: 310.26 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:38:09,481] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.00 | bwd_microstep: 280.81 | bwd_inner_microstep: 280.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:38:09,941] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.98 | bwd_microstep: 263.50 | bwd_inner_microstep: 263.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:38:10,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.32 | bwd_microstep: 257.32 | bwd_inner_microstep: 257.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:10,828] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.68 | bwd_microstep: 247.52 | bwd_inner_microstep: 247.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:38:11,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.37 | bwd_microstep: 246.06 | bwd_inner_microstep: 246.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:11,698] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.61 | bwd_microstep: 250.03 | bwd_inner_microstep: 250.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:38:12,123] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.78 | bwd_microstep: 244.62 | bwd_inner_microstep: 244.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:38:12,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.48 | bwd_microstep: 245.12 | bwd_inner_microstep: 245.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:38:12,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.53 | bwd_microstep: 242.82 | bwd_inner_microstep: 242.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:13,420] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 242.94 | bwd_inner_microstep: 242.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:13,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.18 | bwd_microstep: 243.02 | bwd_inner_microstep: 243.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:14,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.40 | bwd_microstep: 242.87 | bwd_inner_microstep: 242.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:38:14,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.15 | bwd_microstep: 242.58 | bwd_inner_microstep: 242.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:15,144] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.06 | bwd_microstep: 243.06 | bwd_inner_microstep: 243.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:38:15,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.35 | optimizer_gradients: 0.66 | optimizer_step: 3.06 +[2024-12-31 18:38:15,734] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.09 | bwd_microstep: 392.85 | bwd_inner_microstep: 258.99 | bwd_allreduce_microstep: 133.82 | step_microstep: 11.25 +[2024-12-31 18:38:15,735] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2783.58 | bwd: 4195.79 | bwd_inner: 4061.19 | bwd_allreduce: 134.07 | step: 14.13 + 80%|███████▉ | 604/759 [1:28:10<1:23:44, 32.42s/it] {'loss': 1.2592, 'learning_rate': 2.109974972260921e-06, 'epoch': 0.8} + 80%|███████▉ | 604/759 [1:28:10<1:23:44, 32.42s/it][2024-12-31 18:38:16,491] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 273.68 | bwd_microstep: 468.97 | bwd_inner_microstep: 468.59 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:38:16,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.99 | bwd_microstep: 289.16 | bwd_inner_microstep: 289.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:17,474] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.20 | bwd_microstep: 280.72 | bwd_inner_microstep: 280.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:17,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.22 | bwd_microstep: 257.62 | bwd_inner_microstep: 257.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:38:18,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.64 | bwd_microstep: 256.91 | bwd_inner_microstep: 256.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:18,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.97 | bwd_microstep: 248.23 | bwd_inner_microstep: 248.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:38:19,252] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.21 | bwd_microstep: 246.24 | bwd_inner_microstep: 246.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:38:19,688] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.70 | bwd_microstep: 247.06 | bwd_inner_microstep: 247.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:38:20,123] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.01 | bwd_microstep: 245.55 | bwd_inner_microstep: 245.34 | bwd_allreduce_microstep: 0.05 | step_microstep: 0.26 +[2024-12-31 18:38:20,564] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.18 | bwd_microstep: 244.37 | bwd_inner_microstep: 244.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:38:20,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.50 | bwd_microstep: 244.32 | bwd_inner_microstep: 244.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:21,427] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 243.96 | bwd_inner_microstep: 243.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:21,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.64 | bwd_microstep: 243.62 | bwd_inner_microstep: 243.34 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.25 +[2024-12-31 18:38:22,302] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.51 | bwd_microstep: 243.52 | bwd_inner_microstep: 243.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:38:22,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.25 | bwd_microstep: 285.69 | bwd_inner_microstep: 285.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:38:23,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 1.41 | optimizer_step: 3.58 +[2024-12-31 18:38:23,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.21 | bwd_microstep: 257.42 | bwd_inner_microstep: 243.15 | bwd_allreduce_microstep: 14.15 | step_microstep: 13.20 +[2024-12-31 18:38:23,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2888.13 | bwd: 4303.54 | bwd_inner: 4287.98 | bwd_allreduce: 14.69 | step: 16.33 + 80%|███████▉ | 605/759 [1:28:18<1:04:01, 24.94s/it] {'loss': 1.2424, 'learning_rate': 2.0838218876961524e-06, 'epoch': 0.8} + 80%|███████▉ | 605/759 [1:28:18<1:04:01, 24.94s/it][2024-12-31 18:38:23,830] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 224.68 | bwd_microstep: 356.81 | bwd_inner_microstep: 356.47 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:38:24,336] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 207.49 | bwd_microstep: 282.01 | bwd_inner_microstep: 281.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.09 +[2024-12-31 18:38:24,796] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.42 | bwd_microstep: 263.02 | bwd_inner_microstep: 262.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:38:25,245] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.87 | bwd_microstep: 255.57 | bwd_inner_microstep: 255.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:25,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.84 | bwd_microstep: 263.92 | bwd_inner_microstep: 263.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:26,151] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.29 | bwd_microstep: 254.62 | bwd_inner_microstep: 254.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:26,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.20 | bwd_microstep: 250.21 | bwd_inner_microstep: 250.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:38:27,026] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.84 | bwd_microstep: 247.29 | bwd_inner_microstep: 247.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.11 +[2024-12-31 18:38:27,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.51 | bwd_microstep: 244.48 | bwd_inner_microstep: 244.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:38:27,890] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.35 | bwd_microstep: 243.74 | bwd_inner_microstep: 243.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:28,318] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.93 | bwd_microstep: 243.45 | bwd_inner_microstep: 243.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:38:28,739] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.00 | bwd_microstep: 244.33 | bwd_inner_microstep: 244.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:38:29,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.10 | bwd_microstep: 243.80 | bwd_inner_microstep: 243.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:29,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.68 | bwd_microstep: 242.35 | bwd_inner_microstep: 242.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:30,033] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.96 | bwd_microstep: 243.07 | bwd_inner_microstep: 243.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:38:30,486] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.78 | optimizer_gradients: 0.82 | optimizer_step: 3.32 +[2024-12-31 18:38:30,486] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.17 | bwd_microstep: 268.40 | bwd_inner_microstep: 241.99 | bwd_allreduce_microstep: 26.34 | step_microstep: 11.40 +[2024-12-31 18:38:30,488] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2833.78 | bwd: 4147.15 | bwd_inner: 4119.99 | bwd_allreduce: 26.59 | step: 13.93 + 80%|███████▉ | 606/759 [1:28:25<50:04, 19.64s/it] {'loss': 1.2211, 'learning_rate': 2.057813034264181e-06, 'epoch': 0.8} + 80%|███████▉ | 606/759 [1:28:25<50:04, 19.64s/it][2024-12-31 18:38:31,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 205.41 | bwd_microstep: 306.87 | bwd_inner_microstep: 306.53 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:38:31,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.93 | bwd_microstep: 266.22 | bwd_inner_microstep: 266.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:38:31,938] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.53 | bwd_microstep: 257.08 | bwd_inner_microstep: 257.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:38:32,398] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.83 | bwd_microstep: 264.72 | bwd_inner_microstep: 264.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:38:32,837] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.74 | bwd_microstep: 250.17 | bwd_inner_microstep: 250.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:38:33,271] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.85 | bwd_microstep: 246.46 | bwd_inner_microstep: 246.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:38:33,707] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.96 | bwd_microstep: 247.82 | bwd_inner_microstep: 247.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:34,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.02 | bwd_microstep: 247.21 | bwd_inner_microstep: 247.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:38:34,575] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.34 | bwd_microstep: 243.50 | bwd_inner_microstep: 243.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:38:35,007] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.21 | bwd_microstep: 243.74 | bwd_inner_microstep: 243.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:35,433] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.54 | bwd_microstep: 241.84 | bwd_inner_microstep: 241.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:38:35,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.05 | bwd_microstep: 243.28 | bwd_inner_microstep: 243.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:36,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.91 | bwd_microstep: 244.55 | bwd_inner_microstep: 244.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:38:36,727] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.10 | bwd_microstep: 242.98 | bwd_inner_microstep: 242.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:38:37,156] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.12 | bwd_microstep: 241.61 | bwd_inner_microstep: 241.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:38:37,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.78 | optimizer_gradients: 0.60 | optimizer_step: 3.08 +[2024-12-31 18:38:37,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.37 | bwd_microstep: 299.88 | bwd_inner_microstep: 243.86 | bwd_allreduce_microstep: 55.97 | step_microstep: 10.26 +[2024-12-31 18:38:37,653] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2782.29 | bwd: 4088.01 | bwd_inner: 4031.25 | bwd_allreduce: 56.22 | step: 13.36 + 80%|███████▉ | 607/759 [1:28:32<40:15, 15.89s/it] {'loss': 1.2308, 'learning_rate': 2.0319488858409552e-06, 'epoch': 0.8} + 80%|███████▉ | 607/759 [1:28:32<40:15, 15.89s/it][2024-12-31 18:38:38,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 229.34 | bwd_microstep: 354.52 | bwd_inner_microstep: 354.15 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:38:38,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.79 | bwd_microstep: 286.75 | bwd_inner_microstep: 286.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:38:39,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.63 | bwd_microstep: 262.80 | bwd_inner_microstep: 262.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:39,657] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.03 | bwd_microstep: 258.96 | bwd_inner_microstep: 258.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:40,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.63 | bwd_microstep: 256.66 | bwd_inner_microstep: 256.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:38:40,544] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.24 | bwd_microstep: 247.24 | bwd_inner_microstep: 247.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:38:40,977] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.96 | bwd_microstep: 245.94 | bwd_inner_microstep: 245.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:41,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.17 | bwd_microstep: 247.30 | bwd_inner_microstep: 247.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:38:41,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.87 | bwd_microstep: 244.43 | bwd_inner_microstep: 244.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:42,278] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.72 | bwd_microstep: 245.08 | bwd_inner_microstep: 245.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:42,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 242.99 | bwd_inner_microstep: 242.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:38:43,143] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.87 | bwd_microstep: 244.97 | bwd_inner_microstep: 244.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:43,577] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.31 | bwd_microstep: 244.77 | bwd_inner_microstep: 244.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:38:43,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.32 | bwd_microstep: 243.48 | bwd_inner_microstep: 243.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:38:44,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.17 | bwd_microstep: 241.98 | bwd_inner_microstep: 241.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:38:45,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.78 | optimizer_gradients: 0.82 | optimizer_step: 3.11 +[2024-12-31 18:38:45,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 157.76 | bwd_microstep: 502.41 | bwd_inner_microstep: 226.09 | bwd_allreduce_microstep: 276.27 | step_microstep: 11.04 +[2024-12-31 18:38:45,114] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2811.21 | bwd: 4370.33 | bwd_inner: 4093.29 | bwd_allreduce: 276.52 | step: 13.94 + 80%|████████ | 608/759 [1:28:40<33:38, 13.36s/it] {'loss': 1.2381, 'learning_rate': 2.0062299136659203e-06, 'epoch': 0.8} + 80%|████████ | 608/759 [1:28:40<33:38, 13.36s/it][2024-12-31 18:38:45,647] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 206.73 | bwd_microstep: 311.79 | bwd_inner_microstep: 311.42 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:38:46,280] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 231.17 | bwd_microstep: 384.17 | bwd_inner_microstep: 384.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:38:46,766] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.96 | bwd_microstep: 280.63 | bwd_inner_microstep: 280.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:47,217] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.29 | bwd_microstep: 257.25 | bwd_inner_microstep: 257.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:38:47,663] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.89 | bwd_microstep: 249.00 | bwd_inner_microstep: 248.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:48,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.00 | bwd_microstep: 249.43 | bwd_inner_microstep: 249.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:38:48,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.06 | bwd_microstep: 246.09 | bwd_inner_microstep: 246.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:38:48,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.02 | bwd_microstep: 244.97 | bwd_inner_microstep: 244.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:38:49,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.85 | bwd_microstep: 244.72 | bwd_inner_microstep: 244.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:49,835] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.25 | bwd_microstep: 244.37 | bwd_inner_microstep: 244.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:38:50,269] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 243.42 | bwd_inner_microstep: 243.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:38:50,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.11 | bwd_microstep: 243.11 | bwd_inner_microstep: 243.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:38:51,132] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.28 | bwd_microstep: 243.52 | bwd_inner_microstep: 243.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:38:51,558] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.45 | bwd_microstep: 240.56 | bwd_inner_microstep: 240.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:51,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.63 | bwd_microstep: 242.71 | bwd_inner_microstep: 242.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:38:52,405] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.74 | optimizer_step: 3.27 +[2024-12-31 18:38:52,406] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 157.84 | bwd_microstep: 239.74 | bwd_inner_microstep: 226.08 | bwd_allreduce_microstep: 13.56 | step_microstep: 10.74 +[2024-12-31 18:38:52,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2839.52 | bwd: 4165.57 | bwd_inner: 4151.14 | bwd_allreduce: 13.84 | step: 13.57 + 80%|████████ | 609/759 [1:28:47<28:51, 11.54s/it] {'loss': 1.2114, 'learning_rate': 1.980656586333449e-06, 'epoch': 0.8} + 80%|████████ | 609/759 [1:28:47<28:51, 11.54s/it][2024-12-31 18:38:52,936] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.78 | bwd_microstep: 308.85 | bwd_inner_microstep: 308.50 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.22 +[2024-12-31 18:38:53,544] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 227.03 | bwd_microstep: 363.34 | bwd_inner_microstep: 363.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:54,031] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.44 | bwd_microstep: 281.12 | bwd_inner_microstep: 281.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:38:54,485] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.91 | bwd_microstep: 262.00 | bwd_inner_microstep: 261.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:38:54,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.33 | bwd_microstep: 249.19 | bwd_inner_microstep: 249.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:38:55,364] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.96 | bwd_microstep: 247.71 | bwd_inner_microstep: 247.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:38:55,798] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 244.77 | bwd_inner_microstep: 244.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:56,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.06 | bwd_microstep: 248.73 | bwd_inner_microstep: 248.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:38:56,667] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 245.12 | bwd_inner_microstep: 245.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:57,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.42 | bwd_microstep: 245.08 | bwd_inner_microstep: 245.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:38:57,531] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.14 | bwd_microstep: 243.46 | bwd_inner_microstep: 243.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:57,962] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.54 | bwd_microstep: 243.78 | bwd_inner_microstep: 243.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:38:58,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.14 | bwd_microstep: 244.08 | bwd_inner_microstep: 244.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:38:58,825] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.66 | bwd_microstep: 244.13 | bwd_inner_microstep: 244.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:38:59,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.13 | bwd_microstep: 243.18 | bwd_inner_microstep: 243.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:38:59,786] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.66 | optimizer_gradients: 0.61 | optimizer_step: 3.10 +[2024-12-31 18:38:59,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.44 | bwd_microstep: 336.89 | bwd_inner_microstep: 242.08 | bwd_allreduce_microstep: 94.77 | step_microstep: 11.79 +[2024-12-31 18:38:59,788] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2845.60 | bwd: 4251.48 | bwd_inner: 4155.95 | bwd_allreduce: 95.02 | step: 14.64 + 80%|████████ | 610/759 [1:28:54<25:33, 10.29s/it] {'loss': 1.2086, 'learning_rate': 1.955229369784295e-06, 'epoch': 0.8} + 80%|████████ | 610/759 [1:28:54<25:33, 10.29s/it][2024-12-31 18:39:00,320] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.87 | bwd_microstep: 314.72 | bwd_inner_microstep: 314.35 | bwd_allreduce_microstep: 0.16 | step_microstep: 0.23 +[2024-12-31 18:39:00,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 216.84 | bwd_microstep: 346.53 | bwd_inner_microstep: 346.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:01,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.80 | bwd_microstep: 267.43 | bwd_inner_microstep: 267.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:39:01,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.58 | bwd_microstep: 255.82 | bwd_inner_microstep: 255.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:39:02,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 195.15 | bwd_microstep: 250.47 | bwd_inner_microstep: 250.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:39:02,719] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.46 | bwd_microstep: 244.91 | bwd_inner_microstep: 244.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:39:03,168] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.57 | bwd_microstep: 257.16 | bwd_inner_microstep: 257.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:39:03,603] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.90 | bwd_microstep: 244.74 | bwd_inner_microstep: 244.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:39:04,035] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.95 | bwd_microstep: 244.07 | bwd_inner_microstep: 244.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:39:04,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.45 | bwd_microstep: 248.67 | bwd_inner_microstep: 248.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:39:04,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.80 | bwd_microstep: 246.14 | bwd_inner_microstep: 246.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:05,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 243.97 | bwd_inner_microstep: 243.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:39:05,779] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.68 | bwd_microstep: 243.78 | bwd_inner_microstep: 243.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:39:06,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 257.53 | bwd_inner_microstep: 257.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:39:06,657] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 245.12 | bwd_inner_microstep: 245.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:39:07,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.58 | optimizer_gradients: 0.87 | optimizer_step: 4.97 +[2024-12-31 18:39:07,287] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.17 | bwd_microstep: 427.67 | bwd_inner_microstep: 413.70 | bwd_allreduce_microstep: 13.84 | step_microstep: 14.26 +[2024-12-31 18:39:07,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2866.11 | bwd: 4338.82 | bwd_inner: 4324.08 | bwd_allreduce: 14.13 | step: 17.51 + 81%|████████ | 611/759 [1:29:02<23:19, 9.46s/it] {'loss': 1.2207, 'learning_rate': 1.929948727297096e-06, 'epoch': 0.8} + 81%|████████ | 611/759 [1:29:02<23:19, 9.46s/it][2024-12-31 18:39:07,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 219.51 | bwd_microstep: 345.36 | bwd_inner_microstep: 345.00 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:39:08,387] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 196.58 | bwd_microstep: 296.96 | bwd_inner_microstep: 296.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:08,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.02 | bwd_microstep: 284.52 | bwd_inner_microstep: 284.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:39:09,329] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.27 | bwd_microstep: 257.02 | bwd_inner_microstep: 257.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:39:09,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.55 | bwd_microstep: 255.40 | bwd_inner_microstep: 255.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:39:10,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.43 | bwd_microstep: 248.15 | bwd_inner_microstep: 248.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:39:10,645] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.00 | bwd_microstep: 250.72 | bwd_inner_microstep: 250.52 | bwd_allreduce_microstep: 0.04 | step_microstep: 0.29 +[2024-12-31 18:39:11,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 246.89 | bwd_inner_microstep: 246.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:39:11,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.04 | bwd_microstep: 243.94 | bwd_inner_microstep: 243.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:11,940] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.88 | bwd_microstep: 243.89 | bwd_inner_microstep: 243.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:39:12,375] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.82 | bwd_microstep: 245.62 | bwd_inner_microstep: 245.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:39:12,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.26 | bwd_microstep: 244.01 | bwd_inner_microstep: 243.82 | bwd_allreduce_microstep: 0.02 | step_microstep: 0.31 +[2024-12-31 18:39:13,246] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.48 | bwd_microstep: 243.37 | bwd_inner_microstep: 243.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:39:13,674] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.06 | bwd_microstep: 240.75 | bwd_inner_microstep: 240.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:14,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.37 | bwd_microstep: 243.27 | bwd_inner_microstep: 243.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:14,621] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.26 | optimizer_gradients: 0.69 | optimizer_step: 3.14 +[2024-12-31 18:39:14,621] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.30 | bwd_microstep: 322.12 | bwd_inner_microstep: 241.69 | bwd_allreduce_microstep: 80.38 | step_microstep: 11.34 +[2024-12-31 18:39:14,622] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2829.36 | bwd: 4212.20 | bwd_inner: 4130.54 | bwd_allreduce: 80.82 | step: 14.20 + 81%|████████ | 612/759 [1:29:09<21:36, 8.82s/it] {'loss': 1.1851, 'learning_rate': 1.9048151194799435e-06, 'epoch': 0.81} + 81%|████████ | 612/759 [1:29:09<21:36, 8.82s/it][2024-12-31 18:39:15,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 224.82 | bwd_microstep: 356.60 | bwd_inner_microstep: 356.27 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:39:15,772] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 198.44 | bwd_microstep: 307.06 | bwd_inner_microstep: 307.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:16,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.00 | bwd_microstep: 291.10 | bwd_inner_microstep: 291.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:39:16,734] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 184.59 | bwd_microstep: 268.54 | bwd_inner_microstep: 268.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:39:17,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.67 | bwd_microstep: 267.39 | bwd_inner_microstep: 267.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:39:17,647] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.57 | bwd_microstep: 256.73 | bwd_inner_microstep: 256.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:18,083] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.65 | bwd_microstep: 248.03 | bwd_inner_microstep: 248.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:39:18,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.04 | bwd_microstep: 249.73 | bwd_inner_microstep: 249.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:39:18,962] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.24 | bwd_microstep: 247.53 | bwd_inner_microstep: 247.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:19,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.85 | bwd_microstep: 244.62 | bwd_inner_microstep: 244.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:19,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.82 | bwd_microstep: 247.21 | bwd_inner_microstep: 247.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:20,261] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 244.73 | bwd_inner_microstep: 244.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:20,697] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.01 | bwd_microstep: 247.56 | bwd_inner_microstep: 247.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:39:21,130] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 244.81 | bwd_inner_microstep: 244.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:21,560] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.85 | bwd_microstep: 243.45 | bwd_inner_microstep: 243.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:22,010] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.66 | optimizer_step: 3.54 +[2024-12-31 18:39:22,011] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.63 | bwd_microstep: 256.29 | bwd_inner_microstep: 242.58 | bwd_allreduce_microstep: 13.59 | step_microstep: 11.27 +[2024-12-31 18:39:22,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2866.12 | bwd: 4221.49 | bwd_inner: 4207.01 | bwd_allreduce: 13.87 | step: 14.05 + 81%|████████ | 613/759 [1:29:16<20:25, 8.39s/it] {'loss': 1.2128, 'learning_rate': 1.8798290042619949e-06, 'epoch': 0.81} + 81%|████████ | 613/759 [1:29:16<20:25, 8.39s/it][2024-12-31 18:39:22,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 200.64 | bwd_microstep: 305.64 | bwd_inner_microstep: 305.27 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:39:23,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.03 | bwd_microstep: 267.55 | bwd_inner_microstep: 267.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:23,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.56 | bwd_microstep: 254.59 | bwd_inner_microstep: 254.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:39:23,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.53 | bwd_microstep: 259.45 | bwd_inner_microstep: 259.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:39:24,332] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.45 | bwd_microstep: 245.62 | bwd_inner_microstep: 245.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:24,772] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.35 | bwd_microstep: 248.52 | bwd_inner_microstep: 248.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.28 +[2024-12-31 18:39:25,207] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.91 | bwd_microstep: 247.14 | bwd_inner_microstep: 247.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:25,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.51 | bwd_microstep: 246.48 | bwd_inner_microstep: 246.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:39:26,073] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 243.45 | bwd_inner_microstep: 243.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:26,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 243.78 | bwd_inner_microstep: 243.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:39:26,935] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 242.79 | bwd_inner_microstep: 242.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:27,366] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.94 | bwd_microstep: 243.16 | bwd_inner_microstep: 243.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:39:27,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.53 | bwd_microstep: 247.21 | bwd_inner_microstep: 247.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:39:28,225] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.24 | bwd_microstep: 242.69 | bwd_inner_microstep: 242.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:39:28,653] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.07 | bwd_microstep: 241.07 | bwd_inner_microstep: 241.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:39:29,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.16 | optimizer_gradients: 0.93 | optimizer_step: 3.27 +[2024-12-31 18:39:29,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.38 | bwd_microstep: 361.69 | bwd_inner_microstep: 242.46 | bwd_allreduce_microstep: 119.18 | step_microstep: 13.77 +[2024-12-31 18:39:29,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2772.46 | bwd: 4140.93 | bwd_inner: 4020.91 | bwd_allreduce: 119.43 | step: 16.80 + 81%|████████ | 614/759 [1:29:24<19:24, 8.03s/it] {'loss': 1.2371, 'learning_rate': 1.8549908368851099e-06, 'epoch': 0.81} + 81%|████████ | 614/759 [1:29:24<19:24, 8.03s/it][2024-12-31 18:39:29,749] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.62 | bwd_microstep: 314.35 | bwd_inner_microstep: 314.00 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:39:30,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.69 | bwd_microstep: 292.47 | bwd_inner_microstep: 292.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:39:30,717] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.54 | bwd_microstep: 266.63 | bwd_inner_microstep: 266.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:39:31,175] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.25 | bwd_microstep: 263.22 | bwd_inner_microstep: 263.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:39:31,625] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.45 | bwd_microstep: 255.66 | bwd_inner_microstep: 255.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:39:32,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.95 | bwd_microstep: 248.62 | bwd_inner_microstep: 248.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:39:32,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.30 | bwd_microstep: 249.90 | bwd_inner_microstep: 249.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:32,947] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.30 | bwd_microstep: 248.28 | bwd_inner_microstep: 248.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:39:33,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 244.94 | bwd_inner_microstep: 244.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:39:33,815] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.65 | bwd_microstep: 243.66 | bwd_inner_microstep: 243.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:34,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.36 | bwd_microstep: 247.48 | bwd_inner_microstep: 247.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:39:34,680] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.20 | bwd_microstep: 243.69 | bwd_inner_microstep: 243.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:39:35,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.96 | bwd_microstep: 241.33 | bwd_inner_microstep: 241.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:35,544] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.45 | bwd_microstep: 243.38 | bwd_inner_microstep: 243.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:39:35,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.46 | bwd_microstep: 240.36 | bwd_inner_microstep: 240.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:36,432] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.42 | optimizer_gradients: 0.67 | optimizer_step: 3.32 +[2024-12-31 18:39:36,433] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.22 | bwd_microstep: 272.98 | bwd_inner_microstep: 244.08 | bwd_allreduce_microstep: 28.83 | step_microstep: 11.79 +[2024-12-31 18:39:36,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2807.50 | bwd: 4117.07 | bwd_inner: 4087.32 | bwd_allreduce: 29.12 | step: 14.62 + 81%|████████ | 615/759 [1:29:31<18:41, 7.79s/it] {'loss': 1.208, 'learning_rate': 1.8303010698955803e-06, 'epoch': 0.81} + 81%|████████ | 615/759 [1:29:31<18:41, 7.79s/it][2024-12-31 18:39:37,060] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.95 | bwd_microstep: 408.11 | bwd_inner_microstep: 407.75 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.23 +[2024-12-31 18:39:37,551] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.38 | bwd_microstep: 283.49 | bwd_inner_microstep: 283.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:39:38,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.19 | bwd_microstep: 258.03 | bwd_inner_microstep: 258.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:39:38,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.86 | bwd_microstep: 250.78 | bwd_inner_microstep: 250.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:39:38,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 246.83 | bwd_inner_microstep: 246.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:39:39,319] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.97 | bwd_microstep: 248.14 | bwd_inner_microstep: 248.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:39:39,754] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.22 | bwd_microstep: 245.59 | bwd_inner_microstep: 245.49 | bwd_allreduce_microstep: 0.04 | step_microstep: 0.11 +[2024-12-31 18:39:40,189] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.48 | bwd_microstep: 244.51 | bwd_inner_microstep: 244.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:39:40,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.10 | bwd_microstep: 251.69 | bwd_inner_microstep: 251.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:41,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.39 | bwd_microstep: 244.48 | bwd_inner_microstep: 244.07 | bwd_allreduce_microstep: 0.21 | step_microstep: 0.22 +[2024-12-31 18:39:41,494] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.74 | bwd_microstep: 243.99 | bwd_inner_microstep: 243.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:41,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.73 | bwd_microstep: 245.19 | bwd_inner_microstep: 245.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:39:42,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.27 | bwd_microstep: 240.61 | bwd_inner_microstep: 240.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:39:42,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.52 | bwd_microstep: 242.08 | bwd_inner_microstep: 242.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:43,207] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.88 | bwd_microstep: 240.94 | bwd_inner_microstep: 240.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:43,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.71 | optimizer_gradients: 0.60 | optimizer_step: 3.16 +[2024-12-31 18:39:43,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.19 | bwd_microstep: 543.51 | bwd_inner_microstep: 242.10 | bwd_allreduce_microstep: 301.37 | step_microstep: 11.53 +[2024-12-31 18:39:43,947] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2788.50 | bwd: 4438.10 | bwd_inner: 4135.43 | bwd_allreduce: 301.96 | step: 14.38 + 81%|████████ | 616/759 [1:29:38<18:22, 7.71s/it] {'loss': 1.2582, 'learning_rate': 1.8057601531358693e-06, 'epoch': 0.81} + 81%|████████ | 616/759 [1:29:38<18:22, 7.71s/it][2024-12-31 18:39:44,556] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 228.81 | bwd_microstep: 365.35 | bwd_inner_microstep: 365.00 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.19 +[2024-12-31 18:39:45,069] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.15 | bwd_microstep: 297.89 | bwd_inner_microstep: 297.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:39:45,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.40 | bwd_microstep: 268.16 | bwd_inner_microstep: 268.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:39:45,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.02 | bwd_microstep: 263.02 | bwd_inner_microstep: 262.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:39:46,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.64 | bwd_microstep: 261.74 | bwd_inner_microstep: 261.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:46,906] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.25 | bwd_microstep: 249.23 | bwd_inner_microstep: 249.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:39:47,342] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.62 | bwd_microstep: 247.22 | bwd_inner_microstep: 247.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:39:47,777] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.71 | bwd_microstep: 247.64 | bwd_inner_microstep: 247.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:39:48,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.41 | bwd_microstep: 248.20 | bwd_inner_microstep: 248.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:48,656] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.37 | bwd_microstep: 246.72 | bwd_inner_microstep: 246.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:49,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.71 | bwd_microstep: 244.06 | bwd_inner_microstep: 244.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:49,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.99 | bwd_microstep: 242.76 | bwd_inner_microstep: 242.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:39:49,950] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.49 | bwd_microstep: 244.19 | bwd_inner_microstep: 244.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:50,382] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.89 | bwd_microstep: 244.03 | bwd_inner_microstep: 244.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:50,815] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 243.32 | bwd_inner_microstep: 243.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:39:51,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.63 | optimizer_step: 3.37 +[2024-12-31 18:39:51,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.83 | bwd_microstep: 257.85 | bwd_inner_microstep: 243.69 | bwd_allreduce_microstep: 14.04 | step_microstep: 11.13 +[2024-12-31 18:39:51,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2864.43 | bwd: 4171.51 | bwd_inner: 4156.46 | bwd_allreduce: 14.37 | step: 14.03 + 81%|████████▏ | 617/759 [1:29:46<17:58, 7.59s/it] {'loss': 1.2378, 'learning_rate': 1.7813685337364205e-06, 'epoch': 0.81} + 81%|████████▏ | 617/759 [1:29:46<17:58, 7.59s/it][2024-12-31 18:39:51,798] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.09 | bwd_microstep: 308.08 | bwd_inner_microstep: 307.72 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.22 +[2024-12-31 18:39:52,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.42 | bwd_microstep: 287.18 | bwd_inner_microstep: 287.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:39:52,746] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.01 | bwd_microstep: 258.98 | bwd_inner_microstep: 258.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:39:53,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.42 | bwd_microstep: 255.57 | bwd_inner_microstep: 255.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:39:53,632] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.56 | bwd_microstep: 248.40 | bwd_inner_microstep: 248.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:39:54,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.85 | bwd_microstep: 247.49 | bwd_inner_microstep: 247.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:39:54,503] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.37 | bwd_microstep: 245.55 | bwd_inner_microstep: 245.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:54,937] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.37 | bwd_microstep: 243.70 | bwd_inner_microstep: 243.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:39:55,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.79 | bwd_microstep: 244.19 | bwd_inner_microstep: 244.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.11 +[2024-12-31 18:39:55,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.37 | bwd_microstep: 244.74 | bwd_inner_microstep: 244.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:56,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 243.91 | bwd_inner_microstep: 243.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.10 +[2024-12-31 18:39:56,661] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.46 | bwd_microstep: 243.91 | bwd_inner_microstep: 243.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:57,091] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.22 | bwd_microstep: 243.03 | bwd_inner_microstep: 243.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:57,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.66 | bwd_microstep: 241.17 | bwd_inner_microstep: 241.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:57,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.21 | bwd_microstep: 240.42 | bwd_inner_microstep: 240.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:39:58,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.02 | optimizer_gradients: 0.60 | optimizer_step: 3.10 +[2024-12-31 18:39:58,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.61 | bwd_microstep: 613.47 | bwd_inner_microstep: 250.97 | bwd_allreduce_microstep: 362.44 | step_microstep: 10.94 +[2024-12-31 18:39:58,753] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2790.51 | bwd: 4409.83 | bwd_inner: 4046.61 | bwd_allreduce: 362.70 | step: 13.78 + 81%|████████▏ | 618/759 [1:29:53<17:45, 7.56s/it] {'loss': 1.2273, 'learning_rate': 1.7571266561075073e-06, 'epoch': 0.81} + 81%|████████▏ | 618/759 [1:29:53<17:45, 7.56s/it][2024-12-31 18:39:59,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 212.74 | bwd_microstep: 334.93 | bwd_inner_microstep: 334.57 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:39:59,897] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 216.63 | bwd_microstep: 348.10 | bwd_inner_microstep: 348.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:00,364] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.15 | bwd_microstep: 267.84 | bwd_inner_microstep: 267.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:00,826] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.16 | bwd_microstep: 264.23 | bwd_inner_microstep: 264.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:40:01,275] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.95 | bwd_microstep: 255.22 | bwd_inner_microstep: 255.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:40:01,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.61 | bwd_microstep: 247.82 | bwd_inner_microstep: 247.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:40:02,159] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.07 | bwd_microstep: 248.61 | bwd_inner_microstep: 248.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:02,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.77 | bwd_microstep: 246.75 | bwd_inner_microstep: 246.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:40:03,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.06 | bwd_microstep: 246.01 | bwd_inner_microstep: 245.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:40:03,484] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.97 | bwd_microstep: 267.38 | bwd_inner_microstep: 267.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:03,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.04 | bwd_microstep: 244.77 | bwd_inner_microstep: 244.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:40:04,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.39 | bwd_microstep: 243.20 | bwd_inner_microstep: 243.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:40:04,794] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.52 | bwd_microstep: 246.47 | bwd_inner_microstep: 246.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:05,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.27 | bwd_microstep: 242.33 | bwd_inner_microstep: 242.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:05,656] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.72 | bwd_microstep: 243.36 | bwd_inner_microstep: 243.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:06,109] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.32 | optimizer_gradients: 0.67 | optimizer_step: 3.38 +[2024-12-31 18:40:06,110] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.95 | bwd_microstep: 256.65 | bwd_inner_microstep: 242.77 | bwd_allreduce_microstep: 13.75 | step_microstep: 12.03 +[2024-12-31 18:40:06,112] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2859.44 | bwd: 4203.88 | bwd_inner: 4189.06 | bwd_allreduce: 14.03 | step: 15.15 + 82%|████████▏ | 619/759 [1:30:01<17:29, 7.50s/it] {'loss': 1.2149, 'learning_rate': 1.7330349619311415e-06, 'epoch': 0.82} + 82%|████████▏ | 619/759 [1:30:01<17:29, 7.50s/it][2024-12-31 18:40:06,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.01 | bwd_microstep: 314.96 | bwd_inner_microstep: 314.61 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:40:07,134] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.81 | bwd_microstep: 283.28 | bwd_inner_microstep: 283.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:40:07,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.05 | bwd_microstep: 263.54 | bwd_inner_microstep: 263.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:08,051] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.66 | bwd_microstep: 261.62 | bwd_inner_microstep: 261.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:40:08,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.69 | bwd_microstep: 265.48 | bwd_inner_microstep: 265.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:40:08,948] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.86 | bwd_microstep: 248.88 | bwd_inner_microstep: 248.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:09,385] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.59 | bwd_microstep: 247.48 | bwd_inner_microstep: 247.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:09,820] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.24 | bwd_microstep: 245.05 | bwd_inner_microstep: 245.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:10,253] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.54 | bwd_microstep: 245.64 | bwd_inner_microstep: 245.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:10,685] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.41 | bwd_microstep: 243.17 | bwd_inner_microstep: 243.02 | bwd_allreduce_microstep: 0.02 | step_microstep: 0.21 +[2024-12-31 18:40:11,119] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 243.20 | bwd_inner_microstep: 243.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:40:11,549] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.74 | bwd_microstep: 242.97 | bwd_inner_microstep: 242.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:11,982] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.11 | bwd_microstep: 244.46 | bwd_inner_microstep: 244.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:12,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.30 | bwd_microstep: 243.31 | bwd_inner_microstep: 243.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:12,855] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.02 | bwd_microstep: 242.34 | bwd_inner_microstep: 242.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:40:13,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.29 | optimizer_gradients: 0.57 | optimizer_step: 3.09 +[2024-12-31 18:40:13,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.81 | bwd_microstep: 349.73 | bwd_inner_microstep: 241.68 | bwd_allreduce_microstep: 108.00 | step_microstep: 10.90 +[2024-12-31 18:40:13,403] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2815.03 | bwd: 4185.25 | bwd_inner: 4076.25 | bwd_allreduce: 108.31 | step: 13.67 + 82%|████████▏ | 620/759 [1:30:08<17:13, 7.44s/it] {'loss': 1.2558, 'learning_rate': 1.7090938901530264e-06, 'epoch': 0.82} + 82%|████████▏ | 620/759 [1:30:08<17:13, 7.44s/it][2024-12-31 18:40:13,937] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.98 | bwd_microstep: 313.68 | bwd_inner_microstep: 313.34 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:40:14,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.18 | bwd_microstep: 290.73 | bwd_inner_microstep: 290.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:14,906] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.87 | bwd_microstep: 268.44 | bwd_inner_microstep: 268.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:15,375] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 186.70 | bwd_microstep: 265.63 | bwd_inner_microstep: 265.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:40:15,813] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 250.82 | bwd_inner_microstep: 250.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.37 +[2024-12-31 18:40:16,280] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.72 | bwd_microstep: 265.60 | bwd_inner_microstep: 265.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:40:16,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.70 | bwd_microstep: 248.35 | bwd_inner_microstep: 248.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:40:17,153] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 244.74 | bwd_inner_microstep: 244.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:40:17,587] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 244.78 | bwd_inner_microstep: 244.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:40:18,019] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.76 | bwd_microstep: 243.95 | bwd_inner_microstep: 243.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:18,451] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 243.85 | bwd_inner_microstep: 243.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:18,882] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.05 | bwd_microstep: 243.69 | bwd_inner_microstep: 243.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:19,324] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.37 | bwd_microstep: 244.49 | bwd_inner_microstep: 244.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:19,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.17 | bwd_microstep: 241.37 | bwd_inner_microstep: 241.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:40:20,182] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.43 | bwd_microstep: 244.57 | bwd_inner_microstep: 244.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:20,725] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.55 | optimizer_gradients: 0.56 | optimizer_step: 3.09 +[2024-12-31 18:40:20,725] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.46 | bwd_microstep: 345.70 | bwd_inner_microstep: 243.89 | bwd_allreduce_microstep: 101.76 | step_microstep: 10.95 +[2024-12-31 18:40:20,727] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2823.70 | bwd: 4200.58 | bwd_inner: 4097.68 | bwd_allreduce: 102.08 | step: 13.96 + 82%|████████▏ | 621/759 [1:30:15<17:01, 7.40s/it] {'loss': 1.1991, 'learning_rate': 1.6853038769745466e-06, 'epoch': 0.82} + 82%|████████▏ | 621/759 [1:30:15<17:01, 7.40s/it][2024-12-31 18:40:21,453] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 263.97 | bwd_microstep: 448.15 | bwd_inner_microstep: 447.72 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:40:21,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 198.03 | bwd_microstep: 298.20 | bwd_inner_microstep: 298.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:22,431] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.52 | bwd_microstep: 266.53 | bwd_inner_microstep: 266.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:22,917] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.25 | bwd_microstep: 288.32 | bwd_inner_microstep: 288.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:40:23,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.40 | bwd_microstep: 255.35 | bwd_inner_microstep: 255.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:40:23,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.39 | bwd_microstep: 248.76 | bwd_inner_microstep: 248.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:40:24,243] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.36 | bwd_microstep: 247.60 | bwd_inner_microstep: 247.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:40:24,677] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.65 | bwd_microstep: 245.81 | bwd_inner_microstep: 245.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:40:25,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 255.66 | bwd_inner_microstep: 255.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:40:25,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 246.67 | bwd_inner_microstep: 246.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:40:26,060] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 317.26 | bwd_inner_microstep: 317.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:40:26,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 245.23 | bwd_inner_microstep: 245.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:26,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 245.62 | bwd_inner_microstep: 245.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:27,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.64 | bwd_microstep: 244.75 | bwd_inner_microstep: 244.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:40:27,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.85 | bwd_microstep: 311.55 | bwd_inner_microstep: 311.37 | bwd_allreduce_microstep: 0.07 | step_microstep: 0.19 +[2024-12-31 18:40:28,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.50 | optimizer_gradients: 0.66 | optimizer_step: 3.46 +[2024-12-31 18:40:28,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.88 | bwd_microstep: 311.65 | bwd_inner_microstep: 263.36 | bwd_allreduce_microstep: 48.23 | step_microstep: 11.90 +[2024-12-31 18:40:28,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2868.77 | bwd: 4477.27 | bwd_inner: 4427.95 | bwd_allreduce: 48.63 | step: 14.92 + 82%|████████▏ | 622/759 [1:30:23<17:03, 7.47s/it] {'loss': 1.2226, 'learning_rate': 1.6616653558448437e-06, 'epoch': 0.82} + 82%|████████▏ | 622/759 [1:30:23<17:03, 7.47s/it][2024-12-31 18:40:28,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.46 | bwd_microstep: 312.03 | bwd_inner_microstep: 311.69 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:40:29,387] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.17 | bwd_microstep: 295.49 | bwd_inner_microstep: 295.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:29,844] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.49 | bwd_microstep: 257.79 | bwd_inner_microstep: 257.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:40:30,301] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.28 | bwd_microstep: 262.90 | bwd_inner_microstep: 262.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:40:30,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.48 | bwd_microstep: 250.22 | bwd_inner_microstep: 250.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:40:31,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.13 | bwd_microstep: 245.83 | bwd_inner_microstep: 245.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:40:31,587] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 245.95 | bwd_inner_microstep: 245.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:40:32,019] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.94 | bwd_microstep: 245.33 | bwd_inner_microstep: 245.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:40:32,457] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.64 | bwd_microstep: 244.66 | bwd_inner_microstep: 244.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:32,890] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.56 | bwd_microstep: 244.99 | bwd_inner_microstep: 244.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:33,321] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 244.68 | bwd_inner_microstep: 244.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:40:33,752] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.13 | bwd_microstep: 244.21 | bwd_inner_microstep: 244.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:34,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.38 | bwd_microstep: 243.87 | bwd_inner_microstep: 243.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:40:34,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.12 | bwd_microstep: 245.51 | bwd_inner_microstep: 245.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:40:35,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.98 | bwd_microstep: 243.98 | bwd_inner_microstep: 243.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:35,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.68 | optimizer_gradients: 0.73 | optimizer_step: 3.16 +[2024-12-31 18:40:35,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.05 | bwd_microstep: 602.83 | bwd_inner_microstep: 244.98 | bwd_allreduce_microstep: 357.80 | step_microstep: 11.42 +[2024-12-31 18:40:35,847] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2791.37 | bwd: 4430.32 | bwd_inner: 4071.74 | bwd_allreduce: 358.04 | step: 14.07 + 82%|████████▏ | 623/759 [1:30:30<16:56, 7.48s/it] {'loss': 1.2173, 'learning_rate': 1.638178757452894e-06, 'epoch': 0.82} + 82%|████████▏ | 623/759 [1:30:30<16:56, 7.48s/it][2024-12-31 18:40:36,384] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.92 | bwd_microstep: 315.72 | bwd_inner_microstep: 315.36 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:40:36,888] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.41 | bwd_microstep: 293.03 | bwd_inner_microstep: 293.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:40:37,362] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.77 | bwd_microstep: 275.04 | bwd_inner_microstep: 275.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:40:37,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.79 | bwd_microstep: 267.24 | bwd_inner_microstep: 267.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:38,285] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.29 | bwd_microstep: 261.68 | bwd_inner_microstep: 261.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:40:38,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.19 | bwd_microstep: 254.06 | bwd_inner_microstep: 254.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:40:39,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.91 | bwd_microstep: 262.41 | bwd_inner_microstep: 261.14 | bwd_allreduce_microstep: 1.16 | step_microstep: 0.22 +[2024-12-31 18:40:39,616] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.24 | bwd_microstep: 244.35 | bwd_inner_microstep: 244.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:40:40,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.61 | bwd_microstep: 245.14 | bwd_inner_microstep: 245.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:40:40,482] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.23 | bwd_microstep: 244.65 | bwd_inner_microstep: 244.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:40:40,916] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.91 | bwd_microstep: 244.71 | bwd_inner_microstep: 244.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:40:41,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 243.76 | bwd_inner_microstep: 243.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:41,788] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.42 | bwd_microstep: 252.12 | bwd_inner_microstep: 252.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:42,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.35 | bwd_microstep: 243.40 | bwd_inner_microstep: 243.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:40:42,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.54 | bwd_microstep: 242.89 | bwd_inner_microstep: 242.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:40:43,122] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.59 | optimizer_gradients: 0.56 | optimizer_step: 3.11 +[2024-12-31 18:40:43,123] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 157.64 | bwd_microstep: 282.95 | bwd_inner_microstep: 226.96 | bwd_allreduce_microstep: 55.94 | step_microstep: 12.49 +[2024-12-31 18:40:43,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2807.96 | bwd: 4173.24 | bwd_inner: 4115.26 | bwd_allreduce: 57.38 | step: 15.49 + 82%|████████▏ | 624/759 [1:30:38<16:41, 7.42s/it] {'loss': 1.2116, 'learning_rate': 1.614844509719674e-06, 'epoch': 0.82} + 82%|████████▏ | 624/759 [1:30:38<16:41, 7.42s/it][2024-12-31 18:40:43,638] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 196.92 | bwd_microstep: 298.00 | bwd_inner_microstep: 297.67 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:40:44,125] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.67 | bwd_microstep: 281.13 | bwd_inner_microstep: 281.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:40:44,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.96 | bwd_microstep: 258.44 | bwd_inner_microstep: 258.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:40:45,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.03 | bwd_microstep: 260.16 | bwd_inner_microstep: 260.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:45,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.73 | bwd_microstep: 254.48 | bwd_inner_microstep: 254.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:40:45,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.56 | bwd_microstep: 253.10 | bwd_inner_microstep: 253.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:46,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.86 | bwd_microstep: 247.62 | bwd_inner_microstep: 247.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:40:46,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.75 | bwd_microstep: 246.73 | bwd_inner_microstep: 246.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:40:47,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.55 | bwd_microstep: 245.72 | bwd_inner_microstep: 245.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:47,660] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.72 | bwd_microstep: 245.04 | bwd_inner_microstep: 245.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:40:48,097] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.90 | bwd_microstep: 249.84 | bwd_inner_microstep: 249.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:40:48,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.36 | bwd_microstep: 245.27 | bwd_inner_microstep: 245.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:40:48,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.96 | bwd_microstep: 244.95 | bwd_inner_microstep: 244.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:40:49,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.33 | bwd_microstep: 244.32 | bwd_inner_microstep: 244.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:49,836] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.73 | bwd_microstep: 247.00 | bwd_inner_microstep: 246.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:40:50,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.35 | optimizer_gradients: 0.60 | optimizer_step: 3.14 +[2024-12-31 18:40:50,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.38 | bwd_microstep: 284.83 | bwd_inner_microstep: 244.48 | bwd_allreduce_microstep: 40.30 | step_microstep: 11.43 +[2024-12-31 18:40:50,312] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2788.84 | bwd: 4106.67 | bwd_inner: 4065.60 | bwd_allreduce: 40.55 | step: 14.42 + 82%|████████▏ | 625/759 [1:30:45<16:24, 7.35s/it] {'loss': 1.2148, 'learning_rate': 1.5916630377903696e-06, 'epoch': 0.82} + 82%|████████▏ | 625/759 [1:30:45<16:24, 7.35s/it][2024-12-31 18:40:50,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 221.47 | bwd_microstep: 351.55 | bwd_inner_microstep: 351.16 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:40:51,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.92 | bwd_microstep: 289.98 | bwd_inner_microstep: 289.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:51,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.08 | bwd_microstep: 267.56 | bwd_inner_microstep: 267.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:40:52,342] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.41 | bwd_microstep: 268.56 | bwd_inner_microstep: 268.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:40:52,795] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.17 | bwd_microstep: 258.35 | bwd_inner_microstep: 258.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:53,237] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.95 | bwd_microstep: 249.58 | bwd_inner_microstep: 249.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.09 +[2024-12-31 18:40:53,674] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.41 | bwd_microstep: 249.33 | bwd_inner_microstep: 249.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:40:54,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.43 | bwd_microstep: 248.59 | bwd_inner_microstep: 248.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:54,549] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.79 | bwd_microstep: 246.53 | bwd_inner_microstep: 246.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:54,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.83 | bwd_microstep: 247.79 | bwd_inner_microstep: 247.42 | bwd_allreduce_microstep: 0.23 | step_microstep: 0.28 +[2024-12-31 18:40:55,430] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 245.39 | bwd_inner_microstep: 245.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:40:55,933] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 245.95 | bwd_microstep: 247.64 | bwd_inner_microstep: 247.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:56,364] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.60 | bwd_microstep: 243.23 | bwd_inner_microstep: 243.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:40:56,796] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 243.36 | bwd_inner_microstep: 243.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:57,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.82 | bwd_microstep: 244.65 | bwd_inner_microstep: 244.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:57,699] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.65 | optimizer_gradients: 0.64 | optimizer_step: 3.44 +[2024-12-31 18:40:57,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.33 | bwd_microstep: 272.44 | bwd_inner_microstep: 248.51 | bwd_allreduce_microstep: 23.86 | step_microstep: 10.96 +[2024-12-31 18:40:57,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2923.85 | bwd: 4174.70 | bwd_inner: 4149.53 | bwd_allreduce: 24.36 | step: 13.83 + 82%|████████▏ | 626/759 [1:30:52<16:18, 7.36s/it] {'loss': 1.1965, 'learning_rate': 1.5686347640266208e-06, 'epoch': 0.82} + 82%|████████▏ | 626/759 [1:30:52<16:18, 7.36s/it][2024-12-31 18:40:58,277] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 221.08 | bwd_microstep: 340.86 | bwd_inner_microstep: 340.47 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:40:58,747] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.05 | bwd_microstep: 270.19 | bwd_inner_microstep: 270.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:40:59,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.33 | bwd_microstep: 258.69 | bwd_inner_microstep: 258.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:40:59,645] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.33 | bwd_microstep: 250.85 | bwd_inner_microstep: 250.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:41:00,093] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.52 | bwd_microstep: 254.32 | bwd_inner_microstep: 254.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:41:00,528] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.79 | bwd_microstep: 246.33 | bwd_inner_microstep: 246.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:00,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.38 | bwd_microstep: 245.82 | bwd_inner_microstep: 245.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:41:01,396] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.79 | bwd_microstep: 244.91 | bwd_inner_microstep: 244.76 | bwd_allreduce_microstep: 0.05 | step_microstep: 0.23 +[2024-12-31 18:41:01,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.90 | bwd_microstep: 244.62 | bwd_inner_microstep: 244.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:02,267] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.19 | bwd_microstep: 246.35 | bwd_inner_microstep: 246.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:41:02,697] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.35 | bwd_microstep: 243.59 | bwd_inner_microstep: 243.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:41:03,134] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 255.20 | bwd_inner_microstep: 255.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:03,567] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.19 | bwd_microstep: 244.66 | bwd_inner_microstep: 244.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:41:04,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.83 | bwd_microstep: 246.95 | bwd_inner_microstep: 246.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:04,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 157.93 | bwd_microstep: 225.52 | bwd_inner_microstep: 225.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:41:05,053] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.84 | optimizer_gradients: 0.58 | optimizer_step: 3.09 +[2024-12-31 18:41:05,054] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.35 | bwd_microstep: 443.02 | bwd_inner_microstep: 242.55 | bwd_allreduce_microstep: 200.42 | step_microstep: 18.16 +[2024-12-31 18:41:05,055] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2789.30 | bwd: 4261.98 | bwd_inner: 4060.60 | bwd_allreduce: 200.75 | step: 21.04 + 83%|████████▎ | 627/759 [1:31:00<16:11, 7.36s/it] {'loss': 1.2602, 'learning_rate': 1.5457601079988226e-06, 'epoch': 0.83} + 83%|████████▎ | 627/759 [1:31:00<16:11, 7.36s/it][2024-12-31 18:41:05,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 229.52 | bwd_microstep: 374.44 | bwd_inner_microstep: 374.06 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:41:06,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 198.06 | bwd_microstep: 297.39 | bwd_inner_microstep: 297.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:41:06,634] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.80 | bwd_microstep: 263.13 | bwd_inner_microstep: 263.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:41:07,088] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.46 | bwd_microstep: 258.24 | bwd_inner_microstep: 258.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:41:07,527] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.71 | bwd_microstep: 249.11 | bwd_inner_microstep: 249.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:41:07,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.22 | bwd_microstep: 249.61 | bwd_inner_microstep: 249.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:41:08,400] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.91 | bwd_microstep: 246.07 | bwd_inner_microstep: 246.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:41:08,841] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.92 | bwd_microstep: 251.24 | bwd_inner_microstep: 251.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:41:09,276] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.23 | bwd_microstep: 245.75 | bwd_inner_microstep: 245.55 | bwd_allreduce_microstep: 0.02 | step_microstep: 0.28 +[2024-12-31 18:41:09,714] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.36 | bwd_microstep: 245.14 | bwd_inner_microstep: 245.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:10,145] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 243.16 | bwd_inner_microstep: 243.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:10,577] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.02 | bwd_microstep: 244.64 | bwd_inner_microstep: 244.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:11,010] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.78 | bwd_microstep: 244.40 | bwd_inner_microstep: 244.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:11,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.76 | bwd_microstep: 243.50 | bwd_inner_microstep: 243.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:11,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.79 | bwd_microstep: 241.50 | bwd_inner_microstep: 241.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:41:12,325] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.55 | optimizer_gradients: 0.67 | optimizer_step: 3.29 +[2024-12-31 18:41:12,326] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.59 | bwd_microstep: 258.03 | bwd_inner_microstep: 244.28 | bwd_allreduce_microstep: 13.64 | step_microstep: 10.56 +[2024-12-31 18:41:12,329] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2829.48 | bwd: 4155.49 | bwd_inner: 4140.73 | bwd_allreduce: 14.01 | step: 13.58 + 83%|████████▎ | 628/759 [1:31:07<16:00, 7.33s/it] {'loss': 1.1897, 'learning_rate': 1.5230394864784925e-06, 'epoch': 0.83} + 83%|████████▎ | 628/759 [1:31:07<16:00, 7.33s/it][2024-12-31 18:41:12,859] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.01 | bwd_microstep: 311.60 | bwd_inner_microstep: 311.24 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:41:13,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.61 | bwd_microstep: 299.61 | bwd_inner_microstep: 299.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:13,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.19 | bwd_microstep: 263.71 | bwd_inner_microstep: 263.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:41:14,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.06 | bwd_microstep: 261.99 | bwd_inner_microstep: 261.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:41:14,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.67 | bwd_microstep: 254.99 | bwd_inner_microstep: 254.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:15,198] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.58 | bwd_microstep: 256.50 | bwd_inner_microstep: 256.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:41:15,634] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.29 | bwd_microstep: 246.54 | bwd_inner_microstep: 246.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.29 +[2024-12-31 18:41:16,075] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.08 | bwd_microstep: 247.61 | bwd_inner_microstep: 247.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:16,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.59 | bwd_microstep: 256.23 | bwd_inner_microstep: 256.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:41:16,950] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.61 | bwd_microstep: 245.27 | bwd_inner_microstep: 245.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:17,382] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.69 | bwd_microstep: 244.51 | bwd_inner_microstep: 244.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:41:17,807] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.53 | bwd_microstep: 243.16 | bwd_inner_microstep: 243.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:41:18,237] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.78 | bwd_microstep: 242.73 | bwd_inner_microstep: 242.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:41:18,664] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.76 | bwd_microstep: 241.68 | bwd_inner_microstep: 241.50 | bwd_allreduce_microstep: 0.08 | step_microstep: 0.19 +[2024-12-31 18:41:19,095] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.01 | bwd_microstep: 242.74 | bwd_inner_microstep: 242.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:41:19,719] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.29 | optimizer_gradients: 0.99 | optimizer_step: 3.15 +[2024-12-31 18:41:19,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.50 | bwd_microstep: 420.20 | bwd_inner_microstep: 243.73 | bwd_allreduce_microstep: 176.43 | step_microstep: 11.59 +[2024-12-31 18:41:19,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2804.39 | bwd: 4279.28 | bwd_inner: 4101.54 | bwd_allreduce: 176.86 | step: 14.55 + 83%|████████▎ | 629/759 [1:31:14<15:55, 7.35s/it] {'loss': 1.2226, 'learning_rate': 1.5004733134306692e-06, 'epoch': 0.83} + 83%|████████▎ | 629/759 [1:31:14<15:55, 7.35s/it][2024-12-31 18:41:20,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 228.49 | bwd_microstep: 369.75 | bwd_inner_microstep: 369.40 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:41:20,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.22 | bwd_microstep: 290.46 | bwd_inner_microstep: 290.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:21,300] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.84 | bwd_microstep: 267.86 | bwd_inner_microstep: 267.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:41:21,752] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.91 | bwd_microstep: 257.72 | bwd_inner_microstep: 257.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:22,510] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 273.43 | bwd_microstep: 466.72 | bwd_inner_microstep: 466.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:22,950] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.30 | bwd_microstep: 249.86 | bwd_inner_microstep: 249.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:23,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.13 | bwd_microstep: 253.51 | bwd_inner_microstep: 253.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:41:23,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.28 | bwd_microstep: 248.11 | bwd_inner_microstep: 248.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:24,263] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.25 | bwd_microstep: 245.79 | bwd_inner_microstep: 245.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:24,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.38 | bwd_microstep: 243.77 | bwd_inner_microstep: 243.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:25,129] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.36 | bwd_microstep: 244.37 | bwd_inner_microstep: 244.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:41:25,563] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.23 | bwd_microstep: 245.46 | bwd_inner_microstep: 245.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:41:25,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.76 | bwd_microstep: 244.04 | bwd_inner_microstep: 244.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:41:26,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.90 | bwd_microstep: 243.41 | bwd_inner_microstep: 243.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:41:26,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.51 | bwd_microstep: 244.12 | bwd_inner_microstep: 244.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:27,343] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.70 | optimizer_gradients: 0.64 | optimizer_step: 4.19 +[2024-12-31 18:41:27,344] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.66 | bwd_microstep: 273.39 | bwd_inner_microstep: 241.79 | bwd_allreduce_microstep: 31.54 | step_microstep: 11.99 +[2024-12-31 18:41:27,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2935.09 | bwd: 4388.38 | bwd_inner: 4356.06 | bwd_allreduce: 31.79 | step: 15.05 + 83%|████████▎ | 630/759 [1:31:22<15:58, 7.43s/it] {'loss': 1.2064, 'learning_rate': 1.478062000006375e-06, 'epoch': 0.83} + 83%|████████▎ | 630/759 [1:31:22<15:58, 7.43s/it][2024-12-31 18:41:27,878] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.01 | bwd_microstep: 316.11 | bwd_inner_microstep: 315.76 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:41:28,374] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.87 | bwd_microstep: 287.76 | bwd_inner_microstep: 287.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:28,863] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.31 | bwd_microstep: 282.23 | bwd_inner_microstep: 282.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:41:29,330] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.41 | bwd_microstep: 267.66 | bwd_inner_microstep: 267.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:41:29,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.87 | bwd_microstep: 261.83 | bwd_inner_microstep: 261.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:30,239] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.55 | bwd_microstep: 257.61 | bwd_inner_microstep: 257.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:30,678] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.93 | bwd_microstep: 248.69 | bwd_inner_microstep: 248.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:41:31,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.49 | bwd_microstep: 245.52 | bwd_inner_microstep: 245.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:41:31,551] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.63 | bwd_microstep: 251.68 | bwd_inner_microstep: 251.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:41:31,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.36 | bwd_microstep: 246.37 | bwd_inner_microstep: 246.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:41:32,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.64 | bwd_microstep: 243.75 | bwd_inner_microstep: 243.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:32,843] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.85 | bwd_microstep: 243.38 | bwd_inner_microstep: 243.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:41:33,269] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.62 | bwd_microstep: 245.64 | bwd_inner_microstep: 245.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:33,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.94 | bwd_microstep: 243.54 | bwd_inner_microstep: 243.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:41:34,120] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.19 | bwd_microstep: 242.21 | bwd_inner_microstep: 242.02 | bwd_allreduce_microstep: 0.03 | step_microstep: 0.26 +[2024-12-31 18:41:34,631] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.27 | optimizer_gradients: 0.55 | optimizer_step: 3.09 +[2024-12-31 18:41:34,632] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.00 | bwd_microstep: 314.94 | bwd_inner_microstep: 269.26 | bwd_allreduce_microstep: 45.64 | step_microstep: 11.14 +[2024-12-31 18:41:34,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2825.12 | bwd: 4199.05 | bwd_inner: 4152.39 | bwd_allreduce: 45.99 | step: 13.72 + 83%|████████▎ | 631/759 [1:31:29<15:45, 7.39s/it] {'loss': 1.217, 'learning_rate': 1.4558059545351144e-06, 'epoch': 0.83} + 83%|████████▎ | 631/759 [1:31:29<15:45, 7.39s/it][2024-12-31 18:41:35,233] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 225.44 | bwd_microstep: 360.37 | bwd_inner_microstep: 360.00 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:41:35,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.19 | bwd_microstep: 309.93 | bwd_inner_microstep: 309.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:41:36,252] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.64 | bwd_microstep: 283.58 | bwd_inner_microstep: 283.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:36,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.25 | bwd_microstep: 262.05 | bwd_inner_microstep: 262.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:41:37,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.93 | bwd_microstep: 251.35 | bwd_inner_microstep: 251.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:41:37,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.58 | bwd_microstep: 250.43 | bwd_inner_microstep: 250.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:38,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.16 | bwd_microstep: 247.86 | bwd_inner_microstep: 247.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:38,463] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.14 | bwd_microstep: 249.80 | bwd_inner_microstep: 249.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:41:38,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.90 | bwd_microstep: 249.32 | bwd_inner_microstep: 249.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.31 +[2024-12-31 18:41:39,332] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.97 | bwd_microstep: 244.08 | bwd_inner_microstep: 244.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:39,766] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.47 | bwd_microstep: 245.83 | bwd_inner_microstep: 245.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:41:40,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.18 | bwd_microstep: 242.98 | bwd_inner_microstep: 242.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:41:40,625] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.18 | bwd_microstep: 242.93 | bwd_inner_microstep: 242.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:41:41,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 243.08 | bwd_inner_microstep: 243.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:41:41,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.35 | bwd_microstep: 242.98 | bwd_inner_microstep: 242.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:42,132] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 7.12 | optimizer_gradients: 0.56 | optimizer_step: 3.12 +[2024-12-31 18:41:42,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.90 | bwd_microstep: 446.09 | bwd_inner_microstep: 242.96 | bwd_allreduce_microstep: 203.08 | step_microstep: 13.97 +[2024-12-31 18:41:42,134] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2839.48 | bwd: 4372.76 | bwd_inner: 4168.80 | bwd_allreduce: 203.34 | step: 16.97 + 83%|████████▎ | 632/759 [1:31:37<15:42, 7.42s/it] {'loss': 1.1877, 'learning_rate': 1.4337055825174506e-06, 'epoch': 0.83} + 83%|████████▎ | 632/759 [1:31:37<15:42, 7.42s/it][2024-12-31 18:41:42,814] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 293.41 | bwd_microstep: 367.64 | bwd_inner_microstep: 367.26 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 18:41:43,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.68 | bwd_microstep: 291.27 | bwd_inner_microstep: 291.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:43,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.45 | bwd_microstep: 280.72 | bwd_inner_microstep: 280.48 | bwd_allreduce_microstep: 0.09 | step_microstep: 0.29 +[2024-12-31 18:41:44,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.51 | bwd_microstep: 261.03 | bwd_inner_microstep: 261.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:41:44,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.52 | bwd_microstep: 256.11 | bwd_inner_microstep: 256.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:41:45,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.77 | bwd_microstep: 261.70 | bwd_inner_microstep: 261.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:41:45,604] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.60 | bwd_microstep: 246.12 | bwd_inner_microstep: 246.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:41:46,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.90 | bwd_microstep: 253.51 | bwd_inner_microstep: 253.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:41:46,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 245.01 | bwd_inner_microstep: 244.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:46,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.71 | bwd_microstep: 245.38 | bwd_inner_microstep: 245.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:41:47,342] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.79 | bwd_microstep: 244.07 | bwd_inner_microstep: 244.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:41:47,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 244.25 | bwd_inner_microstep: 244.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:48,219] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 256.42 | bwd_inner_microstep: 256.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:48,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.86 | bwd_microstep: 243.08 | bwd_inner_microstep: 243.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:41:49,093] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.52 | bwd_microstep: 244.02 | bwd_inner_microstep: 243.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:49,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.12 | optimizer_gradients: 0.57 | optimizer_step: 3.27 +[2024-12-31 18:41:49,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.57 | bwd_microstep: 401.76 | bwd_inner_microstep: 243.69 | bwd_allreduce_microstep: 158.02 | step_microstep: 10.96 +[2024-12-31 18:41:49,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2912.22 | bwd: 4342.21 | bwd_inner: 4183.11 | bwd_allreduce: 158.43 | step: 14.03 + 83%|████████▎ | 633/759 [1:31:44<15:40, 7.46s/it] {'loss': 1.2036, 'learning_rate': 1.4117612866176022e-06, 'epoch': 0.83} + 83%|████████▎ | 633/759 [1:31:44<15:40, 7.46s/it][2024-12-31 18:41:50,343] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 239.36 | bwd_microstep: 393.87 | bwd_inner_microstep: 393.52 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:41:50,812] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.45 | bwd_microstep: 268.70 | bwd_inner_microstep: 268.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:41:51,269] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.71 | bwd_microstep: 260.67 | bwd_inner_microstep: 260.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:51,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.96 | bwd_microstep: 256.53 | bwd_inner_microstep: 256.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:41:52,168] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.36 | bwd_microstep: 247.76 | bwd_inner_microstep: 247.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:41:52,604] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.23 | bwd_microstep: 247.29 | bwd_inner_microstep: 247.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:53,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 244.88 | bwd_inner_microstep: 244.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:41:53,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 245.17 | bwd_inner_microstep: 245.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:41:53,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.66 | bwd_microstep: 246.01 | bwd_inner_microstep: 245.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:41:54,371] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.28 | bwd_microstep: 251.82 | bwd_inner_microstep: 251.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:54,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.34 | bwd_microstep: 243.15 | bwd_inner_microstep: 243.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:41:55,233] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 245.50 | bwd_inner_microstep: 245.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:55,664] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 243.78 | bwd_inner_microstep: 243.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:56,097] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.12 | bwd_microstep: 243.22 | bwd_inner_microstep: 243.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:56,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.69 | bwd_microstep: 241.01 | bwd_inner_microstep: 240.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:41:56,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.59 | optimizer_gradients: 0.69 | optimizer_step: 3.51 +[2024-12-31 18:41:56,990] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.59 | bwd_microstep: 257.81 | bwd_inner_microstep: 243.84 | bwd_allreduce_microstep: 13.88 | step_microstep: 11.71 +[2024-12-31 18:41:56,992] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2857.70 | bwd: 4137.27 | bwd_inner: 4122.52 | bwd_allreduce: 14.17 | step: 14.77 + 84%|████████▎ | 634/759 [1:31:51<15:26, 7.41s/it] {'loss': 1.2212, 'learning_rate': 1.3899734666561138e-06, 'epoch': 0.84} + 84%|████████▎ | 634/759 [1:31:51<15:26, 7.41s/it][2024-12-31 18:41:57,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 218.57 | bwd_microstep: 344.21 | bwd_inner_microstep: 343.87 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:41:58,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 194.49 | bwd_microstep: 296.09 | bwd_inner_microstep: 296.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:58,563] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.25 | bwd_microstep: 281.52 | bwd_inner_microstep: 281.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:41:59,019] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.01 | bwd_microstep: 261.71 | bwd_inner_microstep: 261.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:41:59,483] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 186.46 | bwd_microstep: 258.86 | bwd_inner_microstep: 258.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:41:59,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.35 | bwd_microstep: 248.11 | bwd_inner_microstep: 248.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:42:00,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.16 | bwd_microstep: 247.07 | bwd_inner_microstep: 247.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:42:00,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 257.19 | bwd_inner_microstep: 257.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:42:01,238] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.28 | bwd_microstep: 245.58 | bwd_inner_microstep: 245.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:01,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.14 | bwd_microstep: 246.75 | bwd_inner_microstep: 246.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:42:02,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.70 | bwd_microstep: 248.38 | bwd_inner_microstep: 248.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:02,539] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.78 | bwd_microstep: 242.55 | bwd_inner_microstep: 242.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:02,970] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.13 | bwd_microstep: 242.74 | bwd_inner_microstep: 242.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:03,370] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 158.04 | bwd_microstep: 224.46 | bwd_inner_microstep: 224.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:42:03,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.06 | bwd_microstep: 247.11 | bwd_inner_microstep: 247.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:04,285] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 1.35 | optimizer_step: 3.17 +[2024-12-31 18:42:04,285] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.65 | bwd_microstep: 283.57 | bwd_inner_microstep: 244.03 | bwd_allreduce_microstep: 39.49 | step_microstep: 11.35 +[2024-12-31 18:42:04,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2826.91 | bwd: 4176.04 | bwd_inner: 4135.64 | bwd_allreduce: 39.74 | step: 14.40 + 84%|████████▎ | 635/759 [1:31:59<15:14, 7.38s/it] {'loss': 1.2119, 'learning_rate': 1.3683425196025734e-06, 'epoch': 0.84} + 84%|████████▎ | 635/759 [1:31:59<15:14, 7.38s/it][2024-12-31 18:42:04,936] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 238.34 | bwd_microstep: 394.33 | bwd_inner_microstep: 393.98 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:42:05,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.01 | bwd_microstep: 283.67 | bwd_inner_microstep: 283.31 | bwd_allreduce_microstep: 0.25 | step_microstep: 0.30 +[2024-12-31 18:42:05,890] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.97 | bwd_microstep: 262.58 | bwd_inner_microstep: 262.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:42:06,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.04 | bwd_microstep: 250.07 | bwd_inner_microstep: 250.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:42:06,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.86 | bwd_microstep: 247.62 | bwd_inner_microstep: 247.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:07,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.60 | bwd_microstep: 247.96 | bwd_inner_microstep: 247.74 | bwd_allreduce_microstep: 0.10 | step_microstep: 0.21 +[2024-12-31 18:42:07,638] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.87 | bwd_microstep: 247.51 | bwd_inner_microstep: 247.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:42:08,073] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.25 | bwd_microstep: 243.90 | bwd_inner_microstep: 243.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.28 +[2024-12-31 18:42:08,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.26 | bwd_microstep: 244.60 | bwd_inner_microstep: 244.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:42:08,959] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.20 | bwd_microstep: 243.20 | bwd_inner_microstep: 243.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:42:09,396] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.21 | bwd_microstep: 244.08 | bwd_inner_microstep: 244.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:09,830] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.27 | bwd_microstep: 244.80 | bwd_inner_microstep: 244.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:42:10,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.54 | bwd_microstep: 243.09 | bwd_inner_microstep: 243.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:10,694] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 243.24 | bwd_inner_microstep: 243.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:42:11,123] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.35 | bwd_microstep: 242.58 | bwd_inner_microstep: 242.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:42:11,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.61 | optimizer_gradients: 0.70 | optimizer_step: 3.54 +[2024-12-31 18:42:11,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.04 | bwd_microstep: 255.10 | bwd_inner_microstep: 241.38 | bwd_allreduce_microstep: 13.59 | step_microstep: 11.34 +[2024-12-31 18:42:11,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2846.97 | bwd: 4138.67 | bwd_inner: 4123.35 | bwd_allreduce: 14.26 | step: 14.61 + 84%|████████▍ | 636/759 [1:32:06<15:04, 7.35s/it] {'loss': 1.2475, 'learning_rate': 1.3468688395683783e-06, 'epoch': 0.84} + 84%|████████▍ | 636/759 [1:32:06<15:04, 7.35s/it][2024-12-31 18:42:12,172] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.28 | bwd_microstep: 356.31 | bwd_inner_microstep: 355.97 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:42:12,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 207.22 | bwd_microstep: 322.88 | bwd_inner_microstep: 322.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:42:13,187] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.31 | bwd_microstep: 264.98 | bwd_inner_microstep: 264.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:42:13,645] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.51 | bwd_microstep: 262.67 | bwd_inner_microstep: 262.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:42:14,087] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.75 | bwd_microstep: 253.39 | bwd_inner_microstep: 253.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:14,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.11 | bwd_microstep: 247.03 | bwd_inner_microstep: 247.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:42:14,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.87 | bwd_microstep: 252.06 | bwd_inner_microstep: 252.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:15,398] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.85 | bwd_microstep: 246.99 | bwd_inner_microstep: 246.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:15,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.38 | bwd_microstep: 246.84 | bwd_inner_microstep: 246.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:42:16,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 275.31 | bwd_microstep: 244.11 | bwd_inner_microstep: 244.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:42:16,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.79 | bwd_microstep: 244.03 | bwd_inner_microstep: 244.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:42:17,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.55 | bwd_microstep: 240.82 | bwd_inner_microstep: 240.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:42:17,668] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.84 | bwd_microstep: 243.33 | bwd_inner_microstep: 243.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:18,099] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.88 | bwd_microstep: 243.90 | bwd_inner_microstep: 243.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:42:18,527] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.20 | bwd_microstep: 242.32 | bwd_inner_microstep: 242.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:18,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.63 | optimizer_gradients: 1.42 | optimizer_step: 3.19 +[2024-12-31 18:42:18,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.85 | bwd_microstep: 270.86 | bwd_inner_microstep: 243.68 | bwd_allreduce_microstep: 27.11 | step_microstep: 13.12 +[2024-12-31 18:42:18,996] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2936.07 | bwd: 4182.58 | bwd_inner: 4154.67 | bwd_allreduce: 27.37 | step: 16.03 + 84%|████████▍ | 637/759 [1:32:13<14:59, 7.37s/it] {'loss': 1.2041, 'learning_rate': 1.325552817799547e-06, 'epoch': 0.84} + 84%|████████▍ | 637/759 [1:32:13<14:59, 7.37s/it][2024-12-31 18:42:19,630] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 235.72 | bwd_microstep: 383.87 | bwd_inner_microstep: 383.52 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:42:20,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.84 | bwd_microstep: 291.05 | bwd_inner_microstep: 291.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:42:20,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.69 | bwd_microstep: 276.48 | bwd_inner_microstep: 276.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:42:21,078] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.39 | bwd_microstep: 254.92 | bwd_inner_microstep: 254.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:42:21,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.13 | bwd_microstep: 248.20 | bwd_inner_microstep: 248.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:21,955] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.78 | bwd_microstep: 248.25 | bwd_inner_microstep: 248.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:22,405] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.08 | bwd_microstep: 256.38 | bwd_inner_microstep: 256.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:22,837] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 244.39 | bwd_inner_microstep: 244.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:42:23,267] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.10 | bwd_microstep: 243.33 | bwd_inner_microstep: 243.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:42:23,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.32 | bwd_microstep: 243.04 | bwd_inner_microstep: 243.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:42:24,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 243.65 | bwd_inner_microstep: 243.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:42:24,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.21 | bwd_microstep: 243.06 | bwd_inner_microstep: 243.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:42:25,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.19 | bwd_microstep: 243.85 | bwd_inner_microstep: 243.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:25,440] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 245.11 | bwd_inner_microstep: 245.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:42:25,868] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.66 | bwd_microstep: 247.56 | bwd_inner_microstep: 247.25 | bwd_allreduce_microstep: 0.17 | step_microstep: 0.22 +[2024-12-31 18:42:26,328] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.11 | optimizer_gradients: 0.64 | optimizer_step: 3.42 +[2024-12-31 18:42:26,329] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.59 | bwd_microstep: 263.99 | bwd_inner_microstep: 248.42 | bwd_allreduce_microstep: 15.47 | step_microstep: 11.28 +[2024-12-31 18:42:26,331] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2846.03 | bwd: 4177.27 | bwd_inner: 4160.59 | bwd_allreduce: 15.95 | step: 14.20 + 84%|████████▍ | 638/759 [1:32:21<14:50, 7.36s/it] {'loss': 1.2492, 'learning_rate': 1.3043948426696019e-06, 'epoch': 0.84} + 84%|████████▍ | 638/759 [1:32:21<14:50, 7.36s/it][2024-12-31 18:42:26,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 224.79 | bwd_microstep: 358.55 | bwd_inner_microstep: 358.19 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:42:27,418] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.09 | bwd_microstep: 283.33 | bwd_inner_microstep: 283.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:42:27,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.38 | bwd_microstep: 292.07 | bwd_inner_microstep: 292.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:28,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.15 | bwd_microstep: 250.11 | bwd_inner_microstep: 250.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:42:28,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.56 | bwd_microstep: 249.71 | bwd_inner_microstep: 249.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:29,239] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.17 | bwd_microstep: 246.57 | bwd_inner_microstep: 246.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:42:29,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.56 | bwd_microstep: 246.82 | bwd_inner_microstep: 246.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:42:30,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.81 | bwd_microstep: 244.87 | bwd_inner_microstep: 244.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:30,543] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.33 | bwd_microstep: 245.26 | bwd_inner_microstep: 245.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:42:30,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.27 | bwd_microstep: 244.59 | bwd_inner_microstep: 244.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:31,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.92 | bwd_microstep: 243.63 | bwd_inner_microstep: 243.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:31,842] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.47 | bwd_microstep: 243.41 | bwd_inner_microstep: 243.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:42:32,279] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.79 | bwd_microstep: 244.90 | bwd_inner_microstep: 244.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:42:32,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.69 | bwd_microstep: 244.03 | bwd_inner_microstep: 244.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:33,138] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.45 | bwd_microstep: 242.06 | bwd_inner_microstep: 242.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:33,588] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.00 | optimizer_gradients: 0.66 | optimizer_step: 3.28 +[2024-12-31 18:42:33,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.05 | bwd_microstep: 255.45 | bwd_inner_microstep: 241.84 | bwd_allreduce_microstep: 13.53 | step_microstep: 11.55 +[2024-12-31 18:42:33,591] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2819.87 | bwd: 4135.53 | bwd_inner: 4121.05 | bwd_allreduce: 13.80 | step: 14.62 + 84%|████████▍ | 639/759 [1:32:28<14:39, 7.33s/it] {'loss': 1.2403, 'learning_rate': 1.2833952996724864e-06, 'epoch': 0.84} + 84%|████████▍ | 639/759 [1:32:28<14:39, 7.33s/it][2024-12-31 18:42:34,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 221.30 | bwd_microstep: 354.42 | bwd_inner_microstep: 354.05 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:42:34,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.13 | bwd_microstep: 290.91 | bwd_inner_microstep: 290.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:42:35,147] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.99 | bwd_microstep: 266.70 | bwd_inner_microstep: 266.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:35,606] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.14 | bwd_microstep: 263.36 | bwd_inner_microstep: 263.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:36,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.63 | bwd_microstep: 250.23 | bwd_inner_microstep: 250.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:36,482] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.64 | bwd_microstep: 248.42 | bwd_inner_microstep: 248.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:36,918] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.36 | bwd_microstep: 246.95 | bwd_inner_microstep: 246.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:42:37,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.80 | bwd_microstep: 245.84 | bwd_inner_microstep: 245.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:42:37,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 246.82 | bwd_inner_microstep: 246.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:38,218] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 244.39 | bwd_inner_microstep: 244.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:38,650] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.61 | bwd_microstep: 244.72 | bwd_inner_microstep: 244.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:39,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 243.66 | bwd_inner_microstep: 243.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:39,521] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 253.19 | bwd_inner_microstep: 253.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:42:39,951] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.39 | bwd_microstep: 242.84 | bwd_inner_microstep: 242.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:40,382] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 243.58 | bwd_inner_microstep: 243.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:40,864] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.56 | optimizer_gradients: 0.68 | optimizer_step: 3.37 +[2024-12-31 18:42:40,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.46 | bwd_microstep: 285.68 | bwd_inner_microstep: 272.00 | bwd_allreduce_microstep: 13.56 | step_microstep: 10.83 +[2024-12-31 18:42:40,867] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2817.91 | bwd: 4171.80 | bwd_inner: 4157.33 | bwd_allreduce: 13.84 | step: 13.72 + 84%|████████▍ | 640/759 [1:32:35<14:30, 7.31s/it] {'loss': 1.2125, 'learning_rate': 1.2625545714155474e-06, 'epoch': 0.84} + 84%|████████▍ | 640/759 [1:32:35<14:30, 7.31s/it][2024-12-31 18:42:41,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 206.13 | bwd_microstep: 312.16 | bwd_inner_microstep: 311.80 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 18:42:41,898] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.99 | bwd_microstep: 287.24 | bwd_inner_microstep: 287.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:42:42,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.02 | bwd_microstep: 263.07 | bwd_inner_microstep: 263.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:42:42,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.98 | bwd_microstep: 267.79 | bwd_inner_microstep: 267.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:42:43,280] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.62 | bwd_microstep: 261.66 | bwd_inner_microstep: 261.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:42:43,716] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.15 | bwd_microstep: 247.18 | bwd_inner_microstep: 247.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:42:44,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.22 | bwd_microstep: 246.61 | bwd_inner_microstep: 246.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:42:44,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 244.79 | bwd_inner_microstep: 244.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:42:45,019] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.81 | bwd_microstep: 246.25 | bwd_inner_microstep: 246.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:45,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.99 | bwd_microstep: 249.59 | bwd_inner_microstep: 249.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:42:45,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.46 | bwd_microstep: 245.43 | bwd_inner_microstep: 245.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:46,320] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.12 | bwd_microstep: 242.82 | bwd_inner_microstep: 242.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.28 +[2024-12-31 18:42:46,753] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.72 | bwd_microstep: 243.40 | bwd_inner_microstep: 243.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:47,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.47 | bwd_microstep: 243.82 | bwd_inner_microstep: 243.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:47,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.96 | bwd_microstep: 243.08 | bwd_inner_microstep: 243.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:48,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.63 | optimizer_gradients: 0.55 | optimizer_step: 3.71 +[2024-12-31 18:42:48,112] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 300.21 | bwd_inner_microstep: 244.18 | bwd_allreduce_microstep: 55.99 | step_microstep: 11.06 +[2024-12-31 18:42:48,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2810.46 | bwd: 4145.14 | bwd_inner: 4088.36 | bwd_allreduce: 56.25 | step: 14.12 + 84%|████████▍ | 641/759 [1:32:43<14:20, 7.29s/it] {'loss': 1.2063, 'learning_rate': 1.24187303761255e-06, 'epoch': 0.84} + 84%|████████▍ | 641/759 [1:32:43<14:20, 7.29s/it][2024-12-31 18:42:48,645] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.38 | bwd_microstep: 314.44 | bwd_inner_microstep: 314.08 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:42:49,176] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.84 | bwd_microstep: 311.58 | bwd_inner_microstep: 311.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:42:49,668] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 212.29 | bwd_microstep: 261.72 | bwd_inner_microstep: 261.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:42:50,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.45 | bwd_microstep: 251.37 | bwd_inner_microstep: 251.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:42:50,562] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.04 | bwd_microstep: 256.28 | bwd_inner_microstep: 256.14 | bwd_allreduce_microstep: 0.04 | step_microstep: 0.20 +[2024-12-31 18:42:51,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.89 | bwd_microstep: 248.64 | bwd_inner_microstep: 248.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:42:51,440] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.28 | bwd_microstep: 247.65 | bwd_inner_microstep: 247.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:42:51,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.31 | bwd_microstep: 246.64 | bwd_inner_microstep: 246.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:42:52,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.99 | bwd_microstep: 244.50 | bwd_inner_microstep: 244.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:52,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.85 | bwd_microstep: 245.29 | bwd_inner_microstep: 245.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:42:53,175] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.79 | bwd_microstep: 245.65 | bwd_inner_microstep: 245.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:42:53,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 245.58 | bwd_inner_microstep: 245.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:54,039] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.47 | bwd_microstep: 243.42 | bwd_inner_microstep: 243.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:54,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.51 | bwd_microstep: 279.02 | bwd_inner_microstep: 278.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:42:54,936] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.89 | bwd_microstep: 242.93 | bwd_inner_microstep: 242.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:42:55,558] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.77 | optimizer_gradients: 0.71 | optimizer_step: 3.15 +[2024-12-31 18:42:55,559] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.48 | bwd_microstep: 425.18 | bwd_inner_microstep: 243.19 | bwd_allreduce_microstep: 181.94 | step_microstep: 12.42 +[2024-12-31 18:42:55,560] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2843.16 | bwd: 4309.97 | bwd_inner: 4127.11 | bwd_allreduce: 182.27 | step: 15.52 + 85%|████████▍ | 642/759 [1:32:50<14:18, 7.34s/it] {'loss': 1.2495, 'learning_rate': 1.221351075076781e-06, 'epoch': 0.85} + 85%|████████▍ | 642/759 [1:32:50<14:18, 7.34s/it][2024-12-31 18:42:56,210] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 246.54 | bwd_microstep: 386.24 | bwd_inner_microstep: 385.89 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:42:56,940] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.99 | bwd_microstep: 519.92 | bwd_inner_microstep: 519.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:42:57,438] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 199.39 | bwd_microstep: 281.83 | bwd_inner_microstep: 281.62 | bwd_allreduce_microstep: 0.05 | step_microstep: 0.23 +[2024-12-31 18:42:57,895] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.08 | bwd_microstep: 261.64 | bwd_inner_microstep: 261.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:42:58,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.56 | bwd_microstep: 255.87 | bwd_inner_microstep: 255.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:42:58,782] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.55 | bwd_microstep: 247.32 | bwd_inner_microstep: 247.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:42:59,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.59 | bwd_microstep: 249.26 | bwd_inner_microstep: 249.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:42:59,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 219.08 | bwd_microstep: 250.06 | bwd_inner_microstep: 250.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:43:00,143] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.66 | bwd_microstep: 246.99 | bwd_inner_microstep: 246.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:43:00,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.53 | bwd_microstep: 244.99 | bwd_inner_microstep: 244.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:01,010] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.46 | bwd_microstep: 246.18 | bwd_inner_microstep: 246.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:01,443] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.51 | bwd_microstep: 245.66 | bwd_inner_microstep: 245.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.10 +[2024-12-31 18:43:01,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.29 | bwd_microstep: 242.36 | bwd_inner_microstep: 242.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:02,312] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.45 | bwd_microstep: 253.75 | bwd_inner_microstep: 253.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:43:02,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.15 | bwd_microstep: 242.99 | bwd_inner_microstep: 242.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:03,201] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.06 | optimizer_gradients: 0.61 | optimizer_step: 3.22 +[2024-12-31 18:43:03,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.69 | bwd_microstep: 265.22 | bwd_inner_microstep: 240.79 | bwd_allreduce_microstep: 24.36 | step_microstep: 11.25 +[2024-12-31 18:43:03,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2908.95 | bwd: 4440.46 | bwd_inner: 4415.00 | bwd_allreduce: 24.73 | step: 14.36 + 85%|████████▍ | 643/759 [1:32:58<14:21, 7.43s/it] {'loss': 1.2197, 'learning_rate': 1.2009890577141625e-06, 'epoch': 0.85} + 85%|████████▍ | 643/759 [1:32:58<14:21, 7.43s/it][2024-12-31 18:43:03,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.12 | bwd_microstep: 311.95 | bwd_inner_microstep: 311.58 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:43:04,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.43 | bwd_microstep: 340.48 | bwd_inner_microstep: 340.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.11 +[2024-12-31 18:43:04,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 252.07 | bwd_microstep: 278.47 | bwd_inner_microstep: 278.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:43:05,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.42 | bwd_microstep: 253.40 | bwd_inner_microstep: 253.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:43:05,747] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.92 | bwd_microstep: 253.71 | bwd_inner_microstep: 253.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:43:06,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.04 | bwd_microstep: 247.97 | bwd_inner_microstep: 247.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:06,617] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.79 | bwd_microstep: 245.37 | bwd_inner_microstep: 245.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:07,050] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.04 | bwd_microstep: 245.40 | bwd_inner_microstep: 245.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:43:07,489] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.61 | bwd_microstep: 251.33 | bwd_inner_microstep: 251.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:07,935] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 184.16 | bwd_microstep: 243.39 | bwd_inner_microstep: 243.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:43:08,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.85 | bwd_microstep: 244.84 | bwd_inner_microstep: 244.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:43:08,796] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.21 | bwd_microstep: 240.64 | bwd_inner_microstep: 240.45 | bwd_allreduce_microstep: 0.05 | step_microstep: 0.27 +[2024-12-31 18:43:09,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.06 | bwd_microstep: 240.19 | bwd_inner_microstep: 240.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:43:09,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 241.14 | bwd_inner_microstep: 241.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:43:10,081] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.44 | bwd_microstep: 243.96 | bwd_inner_microstep: 243.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:43:10,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.69 | optimizer_gradients: 0.90 | optimizer_step: 3.41 +[2024-12-31 18:43:10,533] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.81 | bwd_microstep: 255.76 | bwd_inner_microstep: 242.06 | bwd_allreduce_microstep: 13.58 | step_microstep: 11.83 +[2024-12-31 18:43:10,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2903.68 | bwd: 4138.12 | bwd_inner: 4123.42 | bwd_allreduce: 13.96 | step: 14.77 + 85%|████████▍ | 644/759 [1:33:05<14:11, 7.40s/it] {'loss': 1.2521, 'learning_rate': 1.1807873565164507e-06, 'epoch': 0.85} + 85%|████████▍ | 644/759 [1:33:05<14:11, 7.40s/it][2024-12-31 18:43:11,117] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 218.78 | bwd_microstep: 348.46 | bwd_inner_microstep: 348.13 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:43:11,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.32 | bwd_microstep: 292.99 | bwd_inner_microstep: 292.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:43:12,109] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.92 | bwd_microstep: 283.59 | bwd_inner_microstep: 283.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:43:12,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.76 | bwd_microstep: 284.04 | bwd_inner_microstep: 284.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:13,047] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 249.89 | bwd_inner_microstep: 249.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:43:13,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.07 | bwd_microstep: 255.18 | bwd_inner_microstep: 255.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:43:13,933] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.60 | bwd_microstep: 248.15 | bwd_inner_microstep: 248.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:43:14,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.14 | bwd_microstep: 247.85 | bwd_inner_microstep: 247.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 18:43:14,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.55 | bwd_microstep: 247.83 | bwd_inner_microstep: 247.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:15,238] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.04 | bwd_microstep: 245.54 | bwd_inner_microstep: 245.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:43:15,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.04 | bwd_microstep: 246.18 | bwd_inner_microstep: 246.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:43:16,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.82 | bwd_microstep: 241.04 | bwd_inner_microstep: 241.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:16,533] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.96 | bwd_microstep: 244.67 | bwd_inner_microstep: 244.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:16,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.04 | bwd_microstep: 243.98 | bwd_inner_microstep: 243.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:17,396] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.81 | bwd_microstep: 245.02 | bwd_inner_microstep: 244.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:17,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.30 | optimizer_gradients: 0.57 | optimizer_step: 3.09 +[2024-12-31 18:43:17,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.64 | bwd_microstep: 309.26 | bwd_inner_microstep: 241.59 | bwd_allreduce_microstep: 67.61 | step_microstep: 11.38 +[2024-12-31 18:43:17,901] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2851.25 | bwd: 4233.72 | bwd_inner: 4165.35 | bwd_allreduce: 67.85 | step: 14.07 + 85%|████████▍ | 645/759 [1:33:12<14:02, 7.39s/it] {'loss': 1.2254, 'learning_rate': 1.1607463395544782e-06, 'epoch': 0.85} + 85%|████████▍ | 645/759 [1:33:12<14:02, 7.39s/it][2024-12-31 18:43:18,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 224.20 | bwd_microstep: 361.22 | bwd_inner_microstep: 360.84 | bwd_allreduce_microstep: 0.17 | step_microstep: 0.20 +[2024-12-31 18:43:19,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 194.31 | bwd_microstep: 291.55 | bwd_inner_microstep: 291.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:43:19,493] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.76 | bwd_microstep: 283.08 | bwd_inner_microstep: 283.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:19,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 184.69 | bwd_microstep: 266.40 | bwd_inner_microstep: 266.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:43:20,457] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.29 | bwd_microstep: 298.39 | bwd_inner_microstep: 298.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:43:20,884] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.20 | bwd_microstep: 247.71 | bwd_inner_microstep: 247.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:43:21,320] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.55 | bwd_microstep: 247.07 | bwd_inner_microstep: 247.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:43:21,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.32 | bwd_microstep: 247.54 | bwd_inner_microstep: 247.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:43:22,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.80 | bwd_microstep: 244.35 | bwd_inner_microstep: 244.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:22,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 228.12 | bwd_microstep: 245.39 | bwd_inner_microstep: 245.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:23,114] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.77 | bwd_microstep: 244.44 | bwd_inner_microstep: 244.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:43:23,544] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.28 | bwd_microstep: 243.75 | bwd_inner_microstep: 243.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:23,973] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.18 | bwd_microstep: 242.66 | bwd_inner_microstep: 242.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:43:24,400] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.29 | bwd_microstep: 240.68 | bwd_inner_microstep: 240.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:24,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.78 | bwd_microstep: 244.35 | bwd_inner_microstep: 244.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:25,289] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.55 | optimizer_gradients: 0.70 | optimizer_step: 3.63 +[2024-12-31 18:43:25,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.63 | bwd_microstep: 261.54 | bwd_inner_microstep: 243.95 | bwd_allreduce_microstep: 17.47 | step_microstep: 11.87 +[2024-12-31 18:43:25,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2898.61 | bwd: 4210.19 | bwd_inner: 4191.80 | bwd_allreduce: 17.77 | step: 14.84 + 85%|████████▌ | 646/759 [1:33:20<13:55, 7.39s/it] {'loss': 1.2357, 'learning_rate': 1.1408663719714418e-06, 'epoch': 0.85} + 85%|████████▌ | 646/759 [1:33:20<13:55, 7.39s/it][2024-12-31 18:43:26,024] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 321.47 | bwd_microstep: 392.86 | bwd_inner_microstep: 392.51 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:43:26,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.37 | bwd_microstep: 300.95 | bwd_inner_microstep: 300.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:27,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.14 | bwd_microstep: 267.94 | bwd_inner_microstep: 267.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:43:27,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.68 | bwd_microstep: 262.81 | bwd_inner_microstep: 262.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:43:27,904] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.92 | bwd_microstep: 259.14 | bwd_inner_microstep: 259.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:43:28,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.59 | bwd_microstep: 255.51 | bwd_inner_microstep: 255.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:43:28,796] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.89 | bwd_microstep: 256.58 | bwd_inner_microstep: 256.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:43:29,229] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.81 | bwd_microstep: 244.86 | bwd_inner_microstep: 244.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 18:43:29,661] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.44 | bwd_microstep: 245.59 | bwd_inner_microstep: 245.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:43:30,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.18 | bwd_microstep: 243.77 | bwd_inner_microstep: 243.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:43:30,531] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.03 | bwd_microstep: 243.43 | bwd_inner_microstep: 243.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:43:30,954] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.48 | bwd_microstep: 243.05 | bwd_inner_microstep: 243.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:43:31,385] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.37 | bwd_microstep: 242.86 | bwd_inner_microstep: 242.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:43:31,821] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.94 | bwd_microstep: 242.63 | bwd_inner_microstep: 242.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:43:32,240] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.05 | bwd_microstep: 243.11 | bwd_inner_microstep: 243.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:43:32,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.31 | optimizer_gradients: 0.59 | optimizer_step: 3.19 +[2024-12-31 18:43:32,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.04 | bwd_microstep: 265.48 | bwd_inner_microstep: 241.10 | bwd_allreduce_microstep: 24.33 | step_microstep: 11.76 +[2024-12-31 18:43:32,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2940.84 | bwd: 4210.62 | bwd_inner: 4185.51 | bwd_allreduce: 24.58 | step: 13.76 + 85%|████████▌ | 647/759 [1:33:27<13:48, 7.40s/it] {'loss': 1.1756, 'learning_rate': 1.121147815976248e-06, 'epoch': 0.85} + 85%|████████▌ | 647/759 [1:33:27<13:48, 7.40s/it][2024-12-31 18:43:33,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 235.59 | bwd_microstep: 384.37 | bwd_inner_microstep: 384.02 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:43:33,970] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 230.92 | bwd_microstep: 374.82 | bwd_inner_microstep: 374.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:43:34,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.25 | bwd_microstep: 266.47 | bwd_inner_microstep: 266.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:43:34,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.11 | bwd_microstep: 258.23 | bwd_inner_microstep: 258.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:35,342] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.56 | bwd_microstep: 248.48 | bwd_inner_microstep: 248.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:43:35,780] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.72 | bwd_microstep: 248.36 | bwd_inner_microstep: 248.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:43:36,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.67 | bwd_microstep: 245.93 | bwd_inner_microstep: 245.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:43:36,647] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.76 | bwd_microstep: 246.20 | bwd_inner_microstep: 246.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:37,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.72 | bwd_microstep: 244.90 | bwd_inner_microstep: 244.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:43:37,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.05 | bwd_microstep: 243.44 | bwd_inner_microstep: 243.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:43:37,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 243.80 | bwd_inner_microstep: 243.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:43:38,470] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.91 | bwd_microstep: 340.28 | bwd_inner_microstep: 340.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:38,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.71 | bwd_microstep: 243.70 | bwd_inner_microstep: 243.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:39,334] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 242.90 | bwd_inner_microstep: 242.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:43:39,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.86 | bwd_microstep: 243.05 | bwd_inner_microstep: 243.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:43:40,219] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.91 | optimizer_gradients: 0.70 | optimizer_step: 3.13 +[2024-12-31 18:43:40,219] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 157.92 | bwd_microstep: 266.38 | bwd_inner_microstep: 226.52 | bwd_allreduce_microstep: 39.81 | step_microstep: 11.29 +[2024-12-31 18:43:40,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2874.79 | bwd: 4341.39 | bwd_inner: 4300.77 | bwd_allreduce: 40.06 | step: 14.41 + 85%|████████▌ | 648/759 [1:33:35<13:44, 7.43s/it] {'loss': 1.1827, 'learning_rate': 1.1015910308369239e-06, 'epoch': 0.85} + 85%|████████▌ | 648/759 [1:33:35<13:44, 7.43s/it][2024-12-31 18:43:40,794] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 221.84 | bwd_microstep: 333.44 | bwd_inner_microstep: 333.08 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:43:41,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 204.45 | bwd_microstep: 317.86 | bwd_inner_microstep: 317.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:41,798] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.50 | bwd_microstep: 265.25 | bwd_inner_microstep: 265.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:43:42,253] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.13 | bwd_microstep: 258.68 | bwd_inner_microstep: 258.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:43:42,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.76 | bwd_microstep: 248.83 | bwd_inner_microstep: 248.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:43,135] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.48 | bwd_microstep: 255.35 | bwd_inner_microstep: 255.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:43,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.53 | bwd_microstep: 245.02 | bwd_inner_microstep: 244.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:43:44,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.09 | bwd_microstep: 245.04 | bwd_inner_microstep: 245.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:43:44,433] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.34 | bwd_microstep: 244.11 | bwd_inner_microstep: 244.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:43:44,861] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.05 | bwd_microstep: 243.35 | bwd_inner_microstep: 243.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:43:45,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.01 | bwd_microstep: 244.81 | bwd_inner_microstep: 244.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:43:45,725] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.08 | bwd_microstep: 243.22 | bwd_inner_microstep: 243.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:43:46,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.68 | bwd_microstep: 243.72 | bwd_inner_microstep: 243.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:43:46,621] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 204.60 | bwd_microstep: 242.70 | bwd_inner_microstep: 242.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:43:47,051] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.73 | bwd_microstep: 242.80 | bwd_inner_microstep: 242.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:43:47,772] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.71 | optimizer_gradients: 0.72 | optimizer_step: 3.42 +[2024-12-31 18:43:47,773] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.15 | bwd_microstep: 527.69 | bwd_inner_microstep: 243.43 | bwd_allreduce_microstep: 284.21 | step_microstep: 11.23 +[2024-12-31 18:43:47,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2875.86 | bwd: 4401.95 | bwd_inner: 4116.92 | bwd_allreduce: 284.46 | step: 14.02 + 86%|████████▌ | 649/759 [1:33:42<13:41, 7.47s/it] {'loss': 1.2071, 'learning_rate': 1.0821963728740626e-06, 'epoch': 0.86} + 86%|████████▌ | 649/759 [1:33:42<13:41, 7.47s/it][2024-12-31 18:43:48,283] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.67 | bwd_microstep: 297.66 | bwd_inner_microstep: 297.46 | bwd_allreduce_microstep: 0.08 | step_microstep: 0.11 +[2024-12-31 18:43:48,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.78 | bwd_microstep: 286.95 | bwd_inner_microstep: 286.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:43:49,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 184.90 | bwd_microstep: 263.86 | bwd_inner_microstep: 263.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:49,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.05 | bwd_microstep: 257.93 | bwd_inner_microstep: 257.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:43:50,128] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.41 | bwd_microstep: 250.79 | bwd_inner_microstep: 250.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:50,566] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.86 | bwd_microstep: 248.43 | bwd_inner_microstep: 248.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:51,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.03 | bwd_microstep: 249.24 | bwd_inner_microstep: 249.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:51,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.85 | bwd_microstep: 248.13 | bwd_inner_microstep: 248.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:51,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 246.12 | bwd_inner_microstep: 246.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:52,320] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.83 | bwd_microstep: 256.38 | bwd_inner_microstep: 256.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:52,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.32 | bwd_microstep: 243.84 | bwd_inner_microstep: 243.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:43:53,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.51 | bwd_microstep: 242.87 | bwd_inner_microstep: 242.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:43:53,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 245.64 | bwd_inner_microstep: 245.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:43:54,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.44 | bwd_microstep: 243.58 | bwd_inner_microstep: 243.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:43:54,478] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.17 | bwd_microstep: 243.49 | bwd_inner_microstep: 243.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:55,026] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.34 | optimizer_gradients: 0.55 | optimizer_step: 3.10 +[2024-12-31 18:43:55,026] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 159.45 | bwd_microstep: 360.78 | bwd_inner_microstep: 228.86 | bwd_allreduce_microstep: 131.87 | step_microstep: 11.05 +[2024-12-31 18:43:55,027] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2788.82 | bwd: 4185.71 | bwd_inner: 4053.21 | bwd_allreduce: 132.06 | step: 13.90 + 86%|████████▌ | 650/759 [1:33:49<13:27, 7.40s/it] {'loss': 1.2552, 'learning_rate': 1.062964195454329e-06, 'epoch': 0.86} + 86%|████████▌ | 650/759 [1:33:49<13:27, 7.40s/it][2024-12-31 18:43:55,638] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 228.47 | bwd_microstep: 368.39 | bwd_inner_microstep: 368.04 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:43:56,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 205.75 | bwd_microstep: 325.27 | bwd_inner_microstep: 325.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:43:56,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.67 | bwd_microstep: 281.53 | bwd_inner_microstep: 281.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:43:57,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.65 | bwd_microstep: 282.48 | bwd_inner_microstep: 282.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:57,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.38 | bwd_microstep: 256.51 | bwd_inner_microstep: 256.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:58,050] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.18 | bwd_microstep: 249.27 | bwd_inner_microstep: 249.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:43:58,487] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.12 | bwd_microstep: 248.71 | bwd_inner_microstep: 248.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:58,921] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 245.45 | bwd_inner_microstep: 245.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:43:59,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.45 | bwd_microstep: 244.90 | bwd_inner_microstep: 244.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:43:59,790] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.73 | bwd_microstep: 245.05 | bwd_inner_microstep: 244.87 | bwd_allreduce_microstep: 0.02 | step_microstep: 0.26 +[2024-12-31 18:44:00,240] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.70 | bwd_microstep: 245.56 | bwd_inner_microstep: 245.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:00,668] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.60 | bwd_microstep: 241.09 | bwd_inner_microstep: 241.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:01,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.23 | bwd_microstep: 243.39 | bwd_inner_microstep: 243.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:01,534] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.17 | bwd_microstep: 245.44 | bwd_inner_microstep: 245.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:44:01,962] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.41 | bwd_microstep: 242.59 | bwd_inner_microstep: 242.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:44:02,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.46 | optimizer_gradients: 0.72 | optimizer_step: 3.25 +[2024-12-31 18:44:02,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.97 | bwd_microstep: 255.00 | bwd_inner_microstep: 241.35 | bwd_allreduce_microstep: 13.55 | step_microstep: 12.55 +[2024-12-31 18:44:02,416] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2866.14 | bwd: 4220.80 | bwd_inner: 4206.10 | bwd_allreduce: 13.91 | step: 15.64 + 86%|████████▌ | 651/759 [1:33:57<13:19, 7.40s/it] {'loss': 1.1921, 'learning_rate': 1.0438948489840327e-06, 'epoch': 0.86} + 86%|████████▌ | 651/759 [1:33:57<13:19, 7.40s/it][2024-12-31 18:44:03,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 220.70 | bwd_microstep: 353.49 | bwd_inner_microstep: 353.10 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:44:03,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.24 | bwd_microstep: 296.96 | bwd_inner_microstep: 296.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:04,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.09 | bwd_microstep: 283.58 | bwd_inner_microstep: 283.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:44:04,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.13 | bwd_microstep: 257.47 | bwd_inner_microstep: 257.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:44:04,908] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.58 | bwd_microstep: 254.33 | bwd_inner_microstep: 254.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:44:05,361] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.07 | bwd_microstep: 256.19 | bwd_inner_microstep: 256.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:44:05,796] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.29 | bwd_microstep: 246.64 | bwd_inner_microstep: 246.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:06,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.82 | bwd_microstep: 257.40 | bwd_inner_microstep: 257.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:44:06,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.34 | bwd_microstep: 246.84 | bwd_inner_microstep: 246.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:44:07,118] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 245.89 | bwd_inner_microstep: 245.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:07,561] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 253.87 | bwd_inner_microstep: 253.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:07,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 244.96 | bwd_inner_microstep: 244.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:44:08,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.01 | bwd_microstep: 243.44 | bwd_inner_microstep: 243.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:44:08,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.43 | bwd_microstep: 244.45 | bwd_inner_microstep: 244.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:44:09,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.69 | bwd_microstep: 244.83 | bwd_inner_microstep: 244.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:09,974] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.24 | optimizer_gradients: 0.62 | optimizer_step: 3.67 +[2024-12-31 18:44:09,974] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 469.21 | bwd_inner_microstep: 248.92 | bwd_allreduce_microstep: 220.25 | step_microstep: 12.28 +[2024-12-31 18:44:09,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2863.68 | bwd: 4399.61 | bwd_inner: 4178.53 | bwd_allreduce: 220.50 | step: 15.54 + 86%|████████▌ | 652/759 [1:34:04<13:16, 7.45s/it] {'loss': 1.2148, 'learning_rate': 1.0249886809027355e-06, 'epoch': 0.86} + 86%|████████▌ | 652/759 [1:34:04<13:16, 7.45s/it][2024-12-31 18:44:10,586] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 254.29 | bwd_microstep: 342.40 | bwd_inner_microstep: 342.01 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 18:44:11,054] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.98 | bwd_microstep: 269.03 | bwd_inner_microstep: 269.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:44:11,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.59 | bwd_microstep: 267.26 | bwd_inner_microstep: 267.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:44:11,977] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.96 | bwd_microstep: 262.13 | bwd_inner_microstep: 262.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:12,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.87 | bwd_microstep: 255.58 | bwd_inner_microstep: 255.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:44:12,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.41 | bwd_microstep: 248.45 | bwd_inner_microstep: 248.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:44:13,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.25 | bwd_microstep: 247.53 | bwd_inner_microstep: 247.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:44:13,770] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.81 | bwd_microstep: 244.63 | bwd_inner_microstep: 244.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:44:14,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.31 | bwd_microstep: 244.67 | bwd_inner_microstep: 244.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:44:14,643] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 243.81 | bwd_inner_microstep: 243.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:44:15,073] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.36 | bwd_microstep: 243.05 | bwd_inner_microstep: 243.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:44:15,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.21 | bwd_microstep: 293.68 | bwd_inner_microstep: 293.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:15,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.91 | bwd_microstep: 244.59 | bwd_inner_microstep: 244.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:44:16,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.82 | bwd_microstep: 243.10 | bwd_inner_microstep: 243.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:44:16,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.84 | bwd_microstep: 241.30 | bwd_inner_microstep: 241.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:17,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.59 | optimizer_gradients: 0.77 | optimizer_step: 3.61 +[2024-12-31 18:44:17,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.24 | bwd_microstep: 378.64 | bwd_inner_microstep: 242.13 | bwd_allreduce_microstep: 136.46 | step_microstep: 11.88 +[2024-12-31 18:44:17,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2859.80 | bwd: 4269.93 | bwd_inner: 4132.61 | bwd_allreduce: 136.73 | step: 14.74 + 86%|████████▌ | 653/759 [1:34:12<13:08, 7.44s/it] {'loss': 1.2207, 'learning_rate': 1.0062460356769189e-06, 'epoch': 0.86} + 86%|████████▌ | 653/759 [1:34:12<13:08, 7.44s/it][2024-12-31 18:44:17,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 211.63 | bwd_microstep: 348.34 | bwd_inner_microstep: 346.79 | bwd_allreduce_microstep: 1.33 | step_microstep: 0.24 +[2024-12-31 18:44:18,648] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 241.57 | bwd_microstep: 401.42 | bwd_inner_microstep: 401.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:44:19,136] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.92 | bwd_microstep: 282.14 | bwd_inner_microstep: 282.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:44:19,597] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.30 | bwd_microstep: 264.39 | bwd_inner_microstep: 264.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:44:20,035] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.13 | bwd_microstep: 248.54 | bwd_inner_microstep: 248.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:20,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.36 | bwd_microstep: 249.40 | bwd_inner_microstep: 249.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:44:20,908] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.62 | bwd_microstep: 247.52 | bwd_inner_microstep: 247.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:44:21,339] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.88 | bwd_microstep: 244.20 | bwd_inner_microstep: 244.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:44:21,782] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.56 | bwd_microstep: 246.01 | bwd_inner_microstep: 245.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:23,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.18 | bwd_microstep: 1204.14 | bwd_inner_microstep: 1204.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:44:23,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.82 | bwd_microstep: 243.71 | bwd_inner_microstep: 243.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:44:24,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 243.58 | bwd_inner_microstep: 243.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:24,478] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.46 | bwd_microstep: 243.56 | bwd_inner_microstep: 243.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:44:24,903] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.92 | bwd_microstep: 242.07 | bwd_inner_microstep: 242.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:44:25,326] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.67 | bwd_microstep: 239.96 | bwd_inner_microstep: 239.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:44:25,777] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.77 | optimizer_gradients: 0.64 | optimizer_step: 3.32 +[2024-12-31 18:44:25,777] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.64 | bwd_microstep: 254.89 | bwd_inner_microstep: 241.24 | bwd_allreduce_microstep: 13.54 | step_microstep: 11.92 +[2024-12-31 18:44:25,780] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2887.02 | bwd: 5204.00 | bwd_inner: 5188.36 | bwd_allreduce: 15.01 | step: 14.70 + 86%|████████▌ | 654/759 [1:34:20<13:30, 7.72s/it] {'loss': 1.1999, 'learning_rate': 9.876672547937117e-07, 'epoch': 0.86} + 86%|████████▌ | 654/759 [1:34:20<13:30, 7.72s/it][2024-12-31 18:44:26,397] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 230.41 | bwd_microstep: 373.20 | bwd_inner_microstep: 372.86 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:44:26,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.33 | bwd_microstep: 286.02 | bwd_inner_microstep: 285.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 18:44:27,394] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.00 | bwd_microstep: 291.53 | bwd_inner_microstep: 291.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:44:27,868] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.88 | bwd_microstep: 264.09 | bwd_inner_microstep: 264.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:28,328] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.44 | bwd_microstep: 264.21 | bwd_inner_microstep: 264.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:44:28,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.66 | bwd_microstep: 249.75 | bwd_inner_microstep: 249.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:44:29,198] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 246.29 | bwd_inner_microstep: 245.95 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:44:29,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.89 | bwd_microstep: 245.75 | bwd_inner_microstep: 245.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:44:30,066] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.61 | bwd_microstep: 249.88 | bwd_inner_microstep: 249.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:30,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.04 | bwd_microstep: 244.16 | bwd_inner_microstep: 244.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:30,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 243.43 | bwd_inner_microstep: 243.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:44:31,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.79 | bwd_microstep: 242.64 | bwd_inner_microstep: 242.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:31,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.14 | bwd_microstep: 243.46 | bwd_inner_microstep: 243.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:44:32,218] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.43 | bwd_microstep: 248.25 | bwd_inner_microstep: 248.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:44:32,656] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.53 | bwd_microstep: 243.93 | bwd_inner_microstep: 243.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:44:33,095] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.79 | optimizer_gradients: 0.80 | optimizer_step: 3.27 +[2024-12-31 18:44:33,096] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 157.71 | bwd_microstep: 255.70 | bwd_inner_microstep: 226.50 | bwd_allreduce_microstep: 29.12 | step_microstep: 11.20 +[2024-12-31 18:44:33,097] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2852.36 | bwd: 4192.42 | bwd_inner: 4162.16 | bwd_allreduce: 29.51 | step: 13.79 + 86%|████████▋ | 655/759 [1:34:28<13:10, 7.60s/it] {'loss': 1.2209, 'learning_rate': 9.692526767546727e-07, 'epoch': 0.86} + 86%|████████▋ | 655/759 [1:34:28<13:10, 7.60s/it][2024-12-31 18:44:33,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 220.76 | bwd_microstep: 350.98 | bwd_inner_microstep: 350.64 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.21 +[2024-12-31 18:44:34,176] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.38 | bwd_microstep: 286.45 | bwd_inner_microstep: 286.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 18:44:34,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.19 | bwd_microstep: 287.03 | bwd_inner_microstep: 287.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:44:35,144] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.69 | bwd_microstep: 278.13 | bwd_inner_microstep: 278.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:44:35,601] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.85 | bwd_microstep: 256.10 | bwd_inner_microstep: 256.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:44:36,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.68 | bwd_microstep: 249.45 | bwd_inner_microstep: 249.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:44:36,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.49 | bwd_microstep: 248.55 | bwd_inner_microstep: 248.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:36,910] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.96 | bwd_microstep: 245.60 | bwd_inner_microstep: 245.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:44:37,343] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.02 | bwd_microstep: 245.40 | bwd_inner_microstep: 245.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:37,827] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.24 | bwd_microstep: 295.37 | bwd_inner_microstep: 295.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:38,260] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 244.41 | bwd_inner_microstep: 244.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:44:38,691] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 243.09 | bwd_inner_microstep: 243.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:39,120] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.13 | bwd_microstep: 242.98 | bwd_inner_microstep: 242.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:39,551] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.88 | bwd_microstep: 244.28 | bwd_inner_microstep: 244.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:39,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.37 | bwd_microstep: 243.81 | bwd_inner_microstep: 243.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:40,444] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.99 | optimizer_gradients: 0.64 | optimizer_step: 3.30 +[2024-12-31 18:44:40,444] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.81 | bwd_microstep: 258.23 | bwd_inner_microstep: 244.22 | bwd_allreduce_microstep: 13.90 | step_microstep: 13.46 +[2024-12-31 18:44:40,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2840.13 | bwd: 4219.98 | bwd_inner: 4205.17 | bwd_allreduce: 14.17 | step: 16.37 + 86%|████████▋ | 656/759 [1:34:35<12:55, 7.53s/it] {'loss': 1.2208, 'learning_rate': 9.51002637069619e-07, 'epoch': 0.86} + 86%|████████▋ | 656/759 [1:34:35<12:55, 7.53s/it][2024-12-31 18:44:41,054] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 226.58 | bwd_microstep: 364.72 | bwd_inner_microstep: 364.38 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:44:41,565] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.14 | bwd_microstep: 303.78 | bwd_inner_microstep: 303.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:42,060] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 195.34 | bwd_microstep: 282.09 | bwd_inner_microstep: 282.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:42,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.68 | bwd_microstep: 261.89 | bwd_inner_microstep: 261.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:44:42,955] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.91 | bwd_microstep: 250.00 | bwd_inner_microstep: 249.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:44:43,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.38 | bwd_microstep: 247.70 | bwd_inner_microstep: 247.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.10 +[2024-12-31 18:44:43,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 245.49 | bwd_inner_microstep: 245.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:44:44,263] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.44 | bwd_microstep: 246.13 | bwd_inner_microstep: 246.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:44:44,712] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 185.01 | bwd_microstep: 246.20 | bwd_inner_microstep: 246.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:44:45,145] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 245.81 | bwd_inner_microstep: 245.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:45,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.99 | bwd_microstep: 243.71 | bwd_inner_microstep: 243.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:44:46,007] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.54 | bwd_microstep: 243.07 | bwd_inner_microstep: 243.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:44:46,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.44 | bwd_microstep: 243.09 | bwd_inner_microstep: 243.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:44:46,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.40 | bwd_microstep: 243.78 | bwd_inner_microstep: 243.54 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.29 +[2024-12-31 18:44:47,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.90 | bwd_microstep: 249.48 | bwd_inner_microstep: 249.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:44:47,982] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 15.33 | optimizer_step: 13.04 +[2024-12-31 18:44:47,982] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.90 | bwd_microstep: 456.81 | bwd_inner_microstep: 244.60 | bwd_allreduce_microstep: 212.16 | step_microstep: 35.37 +[2024-12-31 18:44:47,984] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2851.60 | bwd: 4373.89 | bwd_inner: 4160.66 | bwd_allreduce: 212.59 | step: 38.31 + 87%|████████▋ | 657/759 [1:34:42<12:47, 7.53s/it] {'loss': 1.2426, 'learning_rate': 9.32917468250506e-07, 'epoch': 0.87} + 87%|████████▋ | 657/759 [1:34:42<12:47, 7.53s/it][2024-12-31 18:44:48,603] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 237.39 | bwd_microstep: 366.31 | bwd_inner_microstep: 365.93 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:44:49,114] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 196.46 | bwd_microstep: 297.23 | bwd_inner_microstep: 297.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:44:49,606] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.38 | bwd_microstep: 283.72 | bwd_inner_microstep: 283.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:50,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.02 | bwd_microstep: 274.99 | bwd_inner_microstep: 274.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:50,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.59 | bwd_microstep: 253.98 | bwd_inner_microstep: 253.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:44:50,961] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.68 | bwd_microstep: 247.29 | bwd_inner_microstep: 247.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:44:51,397] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.90 | bwd_microstep: 247.97 | bwd_inner_microstep: 247.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:44:51,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.48 | bwd_microstep: 244.82 | bwd_inner_microstep: 244.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:44:52,275] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.49 | bwd_microstep: 247.17 | bwd_inner_microstep: 247.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:52,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.90 | bwd_microstep: 244.77 | bwd_inner_microstep: 244.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:53,147] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.49 | bwd_microstep: 249.60 | bwd_inner_microstep: 249.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:53,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.61 | bwd_microstep: 251.76 | bwd_inner_microstep: 251.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:54,011] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.71 | bwd_microstep: 241.36 | bwd_inner_microstep: 241.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:44:54,444] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.19 | bwd_microstep: 244.90 | bwd_inner_microstep: 244.73 | bwd_allreduce_microstep: 0.07 | step_microstep: 0.23 +[2024-12-31 18:44:54,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.87 | bwd_microstep: 242.77 | bwd_inner_microstep: 242.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:55,546] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.70 | optimizer_step: 3.17 +[2024-12-31 18:44:55,546] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.57 | bwd_microstep: 473.03 | bwd_inner_microstep: 243.41 | bwd_allreduce_microstep: 229.56 | step_microstep: 10.63 +[2024-12-31 18:44:55,548] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2860.13 | bwd: 4411.77 | bwd_inner: 4181.16 | bwd_allreduce: 229.94 | step: 13.74 + 87%|████████▋ | 658/759 [1:34:50<12:41, 7.54s/it] {'loss': 1.2129, 'learning_rate': 9.149974998053823e-07, 'epoch': 0.87} + 87%|████████▋ | 658/759 [1:34:50<12:41, 7.54s/it][2024-12-31 18:44:56,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.72 | bwd_microstep: 363.97 | bwd_inner_microstep: 363.63 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.22 +[2024-12-31 18:44:56,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.25 | bwd_microstep: 267.13 | bwd_inner_microstep: 267.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:57,075] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.72 | bwd_microstep: 267.48 | bwd_inner_microstep: 267.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:44:57,534] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.37 | bwd_microstep: 263.31 | bwd_inner_microstep: 263.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:44:57,974] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.42 | bwd_microstep: 250.20 | bwd_inner_microstep: 250.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:58,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.67 | bwd_microstep: 247.72 | bwd_inner_microstep: 247.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:44:58,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.84 | bwd_microstep: 249.12 | bwd_inner_microstep: 249.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:59,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.32 | bwd_microstep: 246.54 | bwd_inner_microstep: 246.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:44:59,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.34 | bwd_microstep: 264.89 | bwd_inner_microstep: 264.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:45:00,175] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.37 | bwd_microstep: 244.60 | bwd_inner_microstep: 244.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:45:00,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 243.89 | bwd_inner_microstep: 243.65 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:45:01,043] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.21 | bwd_microstep: 243.68 | bwd_inner_microstep: 243.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:45:01,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.55 | bwd_microstep: 242.61 | bwd_inner_microstep: 242.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:45:01,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.65 | bwd_microstep: 243.61 | bwd_inner_microstep: 243.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:45:02,331] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.39 | bwd_microstep: 243.46 | bwd_inner_microstep: 243.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:45:02,844] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.65 | optimizer_step: 3.21 +[2024-12-31 18:45:02,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.37 | bwd_microstep: 317.47 | bwd_inner_microstep: 243.94 | bwd_allreduce_microstep: 73.49 | step_microstep: 10.52 +[2024-12-31 18:45:02,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2805.60 | bwd: 4199.81 | bwd_inner: 4125.21 | bwd_allreduce: 73.88 | step: 13.53 + 87%|████████▋ | 659/759 [1:34:57<12:26, 7.47s/it] {'loss': 1.2453, 'learning_rate': 8.972430582323788e-07, 'epoch': 0.87} + 87%|████████▋ | 659/759 [1:34:57<12:26, 7.47s/it][2024-12-31 18:45:03,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 250.56 | bwd_microstep: 412.45 | bwd_inner_microstep: 412.08 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:45:04,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 243.23 | bwd_microstep: 404.23 | bwd_inner_microstep: 404.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:45:04,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.78 | bwd_microstep: 283.49 | bwd_inner_microstep: 283.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:45:05,137] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.92 | bwd_microstep: 262.59 | bwd_inner_microstep: 262.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:45:05,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.91 | bwd_microstep: 256.50 | bwd_inner_microstep: 256.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:45:06,044] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.49 | bwd_microstep: 261.53 | bwd_inner_microstep: 261.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:45:06,480] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.96 | bwd_microstep: 250.20 | bwd_inner_microstep: 249.26 | bwd_allreduce_microstep: 0.41 | step_microstep: 0.25 +[2024-12-31 18:45:06,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.25 | bwd_microstep: 248.97 | bwd_inner_microstep: 248.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:45:07,348] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.32 | bwd_microstep: 244.46 | bwd_inner_microstep: 244.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:45:07,780] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.87 | bwd_microstep: 244.14 | bwd_inner_microstep: 244.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:45:08,210] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.57 | bwd_microstep: 243.31 | bwd_inner_microstep: 243.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:45:08,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.89 | bwd_microstep: 243.79 | bwd_inner_microstep: 243.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:45:09,088] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 245.65 | bwd_inner_microstep: 245.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:45:09,523] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.93 | bwd_microstep: 250.94 | bwd_inner_microstep: 250.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:45:09,950] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.11 | bwd_microstep: 240.98 | bwd_inner_microstep: 240.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:45:10,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.06 | optimizer_gradients: 0.63 | optimizer_step: 3.27 +[2024-12-31 18:45:10,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.78 | bwd_microstep: 255.20 | bwd_inner_microstep: 241.48 | bwd_allreduce_microstep: 13.63 | step_microstep: 11.86 +[2024-12-31 18:45:10,404] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2932.17 | bwd: 4348.69 | bwd_inner: 4333.31 | bwd_allreduce: 14.32 | step: 14.72 + 87%|████████▋ | 660/759 [1:35:05<12:21, 7.49s/it] {'loss': 1.1991, 'learning_rate': 8.796544670137574e-07, 'epoch': 0.87} + 87%|████████▋ | 660/759 [1:35:05<12:21, 7.49s/it][2024-12-31 18:45:10,937] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.19 | bwd_microstep: 316.06 | bwd_inner_microstep: 315.71 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:45:11,433] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.31 | bwd_microstep: 288.09 | bwd_inner_microstep: 288.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:45:11,898] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.65 | bwd_microstep: 266.25 | bwd_inner_microstep: 266.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:45:12,351] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.65 | bwd_microstep: 259.14 | bwd_inner_microstep: 259.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:45:12,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.79 | bwd_microstep: 249.82 | bwd_inner_microstep: 249.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:45:13,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.20 | bwd_microstep: 256.58 | bwd_inner_microstep: 256.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:45:13,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.02 | bwd_microstep: 247.67 | bwd_inner_microstep: 247.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:45:14,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.29 | bwd_microstep: 249.62 | bwd_inner_microstep: 249.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:45:14,558] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.72 | bwd_microstep: 245.32 | bwd_inner_microstep: 245.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:45:14,990] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 243.95 | bwd_inner_microstep: 243.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:45:15,427] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 248.81 | bwd_inner_microstep: 248.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:45:15,863] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 249.03 | bwd_inner_microstep: 249.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:45:16,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.28 | bwd_microstep: 243.31 | bwd_inner_microstep: 243.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:45:16,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.89 | bwd_microstep: 243.16 | bwd_inner_microstep: 243.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:45:17,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.48 | bwd_microstep: 243.33 | bwd_inner_microstep: 243.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:45:17,749] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.87 | optimizer_gradients: 0.59 | optimizer_step: 3.09 +[2024-12-31 18:45:17,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.98 | bwd_microstep: 390.81 | bwd_inner_microstep: 307.03 | bwd_allreduce_microstep: 83.73 | step_microstep: 12.60 +[2024-12-31 18:45:17,752] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2803.61 | bwd: 4241.08 | bwd_inner: 4156.39 | bwd_allreduce: 84.03 | step: 15.65 + 87%|████████▋ | 661/759 [1:35:12<12:10, 7.45s/it] {'loss': 1.2346, 'learning_rate': 8.622320466100242e-07, 'epoch': 0.87} + 87%|████████▋ | 661/759 [1:35:12<12:10, 7.45s/it][2024-12-31 18:45:18,341] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 219.16 | bwd_microstep: 344.26 | bwd_inner_microstep: 343.93 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:45:18,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 196.25 | bwd_microstep: 297.01 | bwd_inner_microstep: 296.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:45:19,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.32 | bwd_microstep: 290.30 | bwd_inner_microstep: 290.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:45:19,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.98 | bwd_microstep: 263.54 | bwd_inner_microstep: 263.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:45:20,277] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.25 | bwd_microstep: 265.63 | bwd_inner_microstep: 265.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:45:20,718] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.70 | bwd_microstep: 250.64 | bwd_inner_microstep: 250.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:45:21,159] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.82 | bwd_microstep: 249.90 | bwd_inner_microstep: 249.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:45:21,603] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.24 | bwd_microstep: 256.61 | bwd_inner_microstep: 256.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:45:22,046] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.01 | bwd_microstep: 245.56 | bwd_inner_microstep: 245.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:45:22,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.23 | bwd_microstep: 245.65 | bwd_inner_microstep: 245.29 | bwd_allreduce_microstep: 0.25 | step_microstep: 0.27 +[2024-12-31 18:45:22,916] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.10 | bwd_microstep: 244.18 | bwd_inner_microstep: 244.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:45:23,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.00 | bwd_microstep: 251.09 | bwd_inner_microstep: 251.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:45:23,779] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.57 | bwd_microstep: 243.47 | bwd_inner_microstep: 243.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:45:24,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.05 | bwd_microstep: 307.23 | bwd_inner_microstep: 307.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:45:24,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 243.45 | bwd_inner_microstep: 243.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:45:25,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.57 | optimizer_gradients: 0.67 | optimizer_step: 3.18 +[2024-12-31 18:45:25,327] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.86 | bwd_microstep: 406.35 | bwd_inner_microstep: 244.12 | bwd_allreduce_microstep: 162.18 | step_microstep: 31.77 +[2024-12-31 18:45:25,329] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2848.07 | bwd: 4404.93 | bwd_inner: 4241.63 | bwd_allreduce: 162.67 | step: 34.70 + 87%|████████▋ | 662/759 [1:35:20<12:06, 7.49s/it] {'loss': 1.2006, 'learning_rate': 8.449761144540869e-07, 'epoch': 0.87} + 87%|████████▋ | 662/759 [1:35:20<12:06, 7.49s/it][2024-12-31 18:45:25,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 219.72 | bwd_microstep: 345.84 | bwd_inner_microstep: 345.58 | bwd_allreduce_microstep: 0.10 | step_microstep: 0.16 +[2024-12-31 18:45:26,451] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.16 | bwd_microstep: 297.20 | bwd_inner_microstep: 297.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:45:26,941] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.23 | bwd_microstep: 282.97 | bwd_inner_microstep: 282.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:45:27,396] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.09 | bwd_microstep: 254.42 | bwd_inner_microstep: 254.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:45:27,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.57 | bwd_microstep: 247.87 | bwd_inner_microstep: 247.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:45:28,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.30 | bwd_microstep: 247.05 | bwd_inner_microstep: 247.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:45:28,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.53 | bwd_microstep: 248.11 | bwd_inner_microstep: 248.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:45:29,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.77 | bwd_microstep: 246.34 | bwd_inner_microstep: 246.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:45:29,688] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 273.77 | bwd_microstep: 246.11 | bwd_inner_microstep: 246.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:45:30,120] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 245.03 | bwd_inner_microstep: 245.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:45:30,548] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.24 | bwd_microstep: 244.81 | bwd_inner_microstep: 244.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:45:30,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.50 | bwd_microstep: 243.79 | bwd_inner_microstep: 243.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:45:31,420] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.83 | bwd_microstep: 244.44 | bwd_inner_microstep: 244.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:45:31,849] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.98 | bwd_microstep: 243.00 | bwd_inner_microstep: 242.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:45:32,284] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.12 | bwd_microstep: 250.12 | bwd_inner_microstep: 250.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:45:32,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.58 | optimizer_gradients: 0.72 | optimizer_step: 3.54 +[2024-12-31 18:45:32,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.34 | bwd_microstep: 257.41 | bwd_inner_microstep: 243.56 | bwd_allreduce_microstep: 13.72 | step_microstep: 11.88 +[2024-12-31 18:45:32,736] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2950.33 | bwd: 4144.62 | bwd_inner: 4129.99 | bwd_allreduce: 14.01 | step: 14.65 + 87%|████████▋ | 663/759 [1:35:27<11:56, 7.47s/it] {'loss': 1.2171, 'learning_rate': 8.278869849454718e-07, 'epoch': 0.87} + 87%|████████▋ | 663/759 [1:35:27<11:56, 7.47s/it][2024-12-31 18:45:33,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 226.31 | bwd_microstep: 363.01 | bwd_inner_microstep: 362.66 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:45:33,844] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.76 | bwd_microstep: 288.57 | bwd_inner_microstep: 288.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.11 +[2024-12-31 18:45:34,318] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.35 | bwd_microstep: 273.28 | bwd_inner_microstep: 273.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:45:34,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.09 | bwd_microstep: 258.05 | bwd_inner_microstep: 258.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.11 +[2024-12-31 18:45:35,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.60 | bwd_microstep: 250.57 | bwd_inner_microstep: 250.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:45:35,664] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.02 | bwd_microstep: 248.24 | bwd_inner_microstep: 248.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:45:36,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.34 | bwd_microstep: 250.45 | bwd_inner_microstep: 250.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:45:36,538] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 246.23 | bwd_inner_microstep: 246.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:45:36,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.76 | bwd_microstep: 253.53 | bwd_inner_microstep: 253.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:45:37,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.41 | bwd_microstep: 243.75 | bwd_inner_microstep: 243.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:45:37,840] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.45 | bwd_microstep: 243.89 | bwd_inner_microstep: 243.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:45:38,268] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.06 | bwd_microstep: 241.35 | bwd_inner_microstep: 241.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:45:38,698] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.79 | bwd_microstep: 243.28 | bwd_inner_microstep: 243.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 18:45:39,225] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.30 | bwd_microstep: 341.68 | bwd_inner_microstep: 341.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:45:39,663] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.85 | bwd_microstep: 245.61 | bwd_inner_microstep: 245.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:45:40,535] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.60 | optimizer_gradients: 0.74 | optimizer_step: 17.39 +[2024-12-31 18:45:40,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.30 | bwd_microstep: 659.18 | bwd_inner_microstep: 241.58 | bwd_allreduce_microstep: 417.56 | step_microstep: 25.13 +[2024-12-31 18:45:40,540] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2838.29 | bwd: 4650.76 | bwd_inner: 4232.37 | bwd_allreduce: 417.81 | step: 27.90 + 87%|████████▋ | 664/759 [1:35:35<11:58, 7.57s/it] {'loss': 1.2201, 'learning_rate': 8.109649694445898e-07, 'epoch': 0.87} + 87%|████████▋ | 664/759 [1:35:35<11:58, 7.57s/it][2024-12-31 18:45:41,069] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.75 | bwd_microstep: 308.25 | bwd_inner_microstep: 307.90 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:45:41,592] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 199.16 | bwd_microstep: 305.67 | bwd_inner_microstep: 305.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:45:42,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.64 | bwd_microstep: 280.57 | bwd_inner_microstep: 280.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:45:42,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 249.96 | bwd_inner_microstep: 249.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:45:42,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.60 | bwd_microstep: 250.91 | bwd_inner_microstep: 250.55 | bwd_allreduce_microstep: 0.26 | step_microstep: 0.32 +[2024-12-31 18:45:43,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.64 | bwd_microstep: 249.34 | bwd_inner_microstep: 249.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:45:43,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 204.74 | bwd_microstep: 248.22 | bwd_inner_microstep: 248.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:45:44,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.68 | bwd_microstep: 245.26 | bwd_inner_microstep: 245.14 | bwd_allreduce_microstep: 0.02 | step_microstep: 0.22 +[2024-12-31 18:45:44,749] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.33 | bwd_microstep: 254.51 | bwd_inner_microstep: 254.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:45:45,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.32 | bwd_microstep: 244.45 | bwd_inner_microstep: 244.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:45:45,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 243.35 | bwd_inner_microstep: 243.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:45:46,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.22 | bwd_microstep: 243.14 | bwd_inner_microstep: 243.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:45:46,483] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.17 | bwd_microstep: 243.11 | bwd_inner_microstep: 243.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:45:46,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.09 | bwd_microstep: 243.52 | bwd_inner_microstep: 243.30 | bwd_allreduce_microstep: 0.07 | step_microstep: 0.28 +[2024-12-31 18:45:47,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.95 | bwd_microstep: 242.56 | bwd_inner_microstep: 242.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:45:47,897] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.12 | optimizer_gradients: 0.62 | optimizer_step: 3.13 +[2024-12-31 18:45:47,897] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.73 | bwd_microstep: 355.98 | bwd_inner_microstep: 244.17 | bwd_allreduce_microstep: 111.76 | step_microstep: 11.18 +[2024-12-31 18:45:47,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2849.30 | bwd: 4208.99 | bwd_inner: 4095.69 | bwd_allreduce: 112.42 | step: 14.52 + 88%|████████▊ | 665/759 [1:35:42<11:45, 7.50s/it] {'loss': 1.2213, 'learning_rate': 7.942103762670783e-07, 'epoch': 0.88} + 88%|████████▊ | 665/759 [1:35:42<11:45, 7.50s/it][2024-12-31 18:45:48,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 217.12 | bwd_microstep: 347.89 | bwd_inner_microstep: 347.51 | bwd_allreduce_microstep: 0.16 | step_microstep: 0.19 +[2024-12-31 18:45:48,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 196.93 | bwd_microstep: 296.63 | bwd_inner_microstep: 296.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:45:49,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.59 | bwd_microstep: 282.52 | bwd_inner_microstep: 282.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:45:49,937] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.08 | bwd_microstep: 262.67 | bwd_inner_microstep: 262.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:45:50,373] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.01 | bwd_microstep: 248.57 | bwd_inner_microstep: 248.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:45:50,826] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.82 | bwd_microstep: 256.41 | bwd_inner_microstep: 256.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:45:51,271] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.65 | bwd_microstep: 253.71 | bwd_inner_microstep: 253.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:45:51,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 245.02 | bwd_inner_microstep: 244.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:45:52,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.53 | bwd_microstep: 246.35 | bwd_inner_microstep: 246.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:45:52,569] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.04 | bwd_microstep: 244.57 | bwd_inner_microstep: 244.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.10 +[2024-12-31 18:45:52,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.44 | bwd_microstep: 244.83 | bwd_inner_microstep: 244.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:45:53,430] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.32 | bwd_microstep: 243.33 | bwd_inner_microstep: 243.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:45:53,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.17 | bwd_microstep: 252.40 | bwd_inner_microstep: 252.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:45:54,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.40 | bwd_microstep: 244.42 | bwd_inner_microstep: 244.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:45:54,736] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.63 | bwd_microstep: 244.82 | bwd_inner_microstep: 244.65 | bwd_allreduce_microstep: 0.06 | step_microstep: 0.20 +[2024-12-31 18:45:55,191] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.64 | optimizer_gradients: 0.66 | optimizer_step: 3.39 +[2024-12-31 18:45:55,191] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.98 | bwd_microstep: 257.53 | bwd_inner_microstep: 243.84 | bwd_allreduce_microstep: 13.59 | step_microstep: 11.21 +[2024-12-31 18:45:55,194] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2839.79 | bwd: 4171.85 | bwd_inner: 4157.11 | bwd_allreduce: 13.98 | step: 14.03 + 88%|████████▊ | 666/759 [1:35:50<11:32, 7.44s/it] {'loss': 1.2146, 'learning_rate': 7.776235106781704e-07, 'epoch': 0.88} + 88%|████████▊ | 666/759 [1:35:50<11:32, 7.44s/it][2024-12-31 18:45:55,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 226.74 | bwd_microstep: 394.75 | bwd_inner_microstep: 394.36 | bwd_allreduce_microstep: 0.16 | step_microstep: 0.20 +[2024-12-31 18:45:56,327] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.59 | bwd_microstep: 287.86 | bwd_inner_microstep: 287.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:45:56,778] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.30 | bwd_microstep: 256.60 | bwd_inner_microstep: 256.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:45:57,225] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.18 | bwd_microstep: 247.74 | bwd_inner_microstep: 247.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:45:57,663] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.28 | bwd_microstep: 248.10 | bwd_inner_microstep: 248.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:45:58,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.91 | bwd_microstep: 264.40 | bwd_inner_microstep: 264.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:45:58,546] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.30 | bwd_microstep: 243.30 | bwd_inner_microstep: 243.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:45:58,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.61 | bwd_microstep: 245.62 | bwd_inner_microstep: 245.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:45:59,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.24 | bwd_microstep: 245.78 | bwd_inner_microstep: 245.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:45:59,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 243.97 | bwd_inner_microstep: 243.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:00,278] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.11 | bwd_microstep: 244.63 | bwd_inner_microstep: 244.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:00,711] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.96 | bwd_microstep: 244.27 | bwd_inner_microstep: 244.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:01,144] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.66 | bwd_microstep: 245.30 | bwd_inner_microstep: 245.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:46:01,577] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.05 | bwd_microstep: 244.32 | bwd_inner_microstep: 244.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:46:02,026] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.93 | bwd_microstep: 242.45 | bwd_inner_microstep: 242.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:02,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.60 | optimizer_gradients: 0.80 | optimizer_step: 3.39 +[2024-12-31 18:46:02,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.74 | bwd_microstep: 257.34 | bwd_inner_microstep: 243.64 | bwd_allreduce_microstep: 13.61 | step_microstep: 11.40 +[2024-12-31 18:46:02,481] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2837.15 | bwd: 4156.52 | bwd_inner: 4142.02 | bwd_allreduce: 13.89 | step: 14.51 + 88%|████████▊ | 667/759 [1:35:57<11:20, 7.40s/it] {'loss': 1.2159, 'learning_rate': 7.612046748871327e-07, 'epoch': 0.88} + 88%|████████▊ | 667/759 [1:35:57<11:20, 7.40s/it][2024-12-31 18:46:03,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 221.98 | bwd_microstep: 351.92 | bwd_inner_microstep: 351.58 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.23 +[2024-12-31 18:46:03,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.77 | bwd_microstep: 268.02 | bwd_inner_microstep: 267.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:04,017] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.96 | bwd_microstep: 285.05 | bwd_inner_microstep: 285.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:46:04,465] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.93 | bwd_microstep: 254.72 | bwd_inner_microstep: 254.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:46:04,904] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.34 | bwd_microstep: 246.90 | bwd_inner_microstep: 246.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:46:05,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.99 | bwd_microstep: 244.12 | bwd_inner_microstep: 244.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:46:05,771] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.06 | bwd_microstep: 245.58 | bwd_inner_microstep: 245.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:06,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.61 | bwd_microstep: 250.31 | bwd_inner_microstep: 249.99 | bwd_allreduce_microstep: 0.18 | step_microstep: 0.33 +[2024-12-31 18:46:06,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.60 | bwd_microstep: 245.85 | bwd_inner_microstep: 245.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:46:07,099] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.40 | bwd_microstep: 257.58 | bwd_inner_microstep: 257.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:07,567] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.97 | bwd_microstep: 255.07 | bwd_inner_microstep: 254.94 | bwd_allreduce_microstep: 0.03 | step_microstep: 0.28 +[2024-12-31 18:46:08,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.73 | bwd_microstep: 245.05 | bwd_inner_microstep: 245.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:08,440] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.79 | bwd_microstep: 243.86 | bwd_inner_microstep: 243.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:46:08,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 247.05 | bwd_inner_microstep: 247.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:46:09,312] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.16 | bwd_microstep: 242.91 | bwd_inner_microstep: 242.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:09,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 33.15 | optimizer_gradients: 8.45 | optimizer_step: 3.74 +[2024-12-31 18:46:09,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.71 | bwd_microstep: 258.81 | bwd_inner_microstep: 244.98 | bwd_allreduce_microstep: 13.70 | step_microstep: 48.08 +[2024-12-31 18:46:09,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2838.05 | bwd: 4143.10 | bwd_inner: 4127.80 | bwd_allreduce: 14.28 | step: 51.21 + 88%|████████▊ | 668/759 [1:36:04<11:11, 7.38s/it] {'loss': 1.2434, 'learning_rate': 7.449541680417704e-07, 'epoch': 0.88} + 88%|████████▊ | 668/759 [1:36:04<11:11, 7.38s/it][2024-12-31 18:46:10,482] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 236.60 | bwd_microstep: 389.43 | bwd_inner_microstep: 389.08 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:46:10,982] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.09 | bwd_microstep: 290.60 | bwd_inner_microstep: 290.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:11,473] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.41 | bwd_microstep: 282.45 | bwd_inner_microstep: 282.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:46:11,940] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.39 | bwd_microstep: 268.52 | bwd_inner_microstep: 268.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:46:12,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.83 | bwd_microstep: 266.20 | bwd_inner_microstep: 266.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:46:12,880] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.04 | bwd_microstep: 277.87 | bwd_inner_microstep: 277.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:46:13,319] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.98 | bwd_microstep: 249.16 | bwd_inner_microstep: 248.94 | bwd_allreduce_microstep: 0.06 | step_microstep: 0.28 +[2024-12-31 18:46:13,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.68 | bwd_microstep: 245.09 | bwd_inner_microstep: 245.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:46:14,209] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 258.69 | bwd_inner_microstep: 258.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:46:14,643] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 245.70 | bwd_inner_microstep: 245.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:46:15,074] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.97 | bwd_microstep: 244.30 | bwd_inner_microstep: 244.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:46:15,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.17 | bwd_microstep: 244.38 | bwd_inner_microstep: 244.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:46:15,936] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.67 | bwd_microstep: 240.99 | bwd_inner_microstep: 240.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:16,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.60 | bwd_microstep: 244.39 | bwd_inner_microstep: 244.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:46:16,799] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.89 | bwd_microstep: 243.99 | bwd_inner_microstep: 243.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:17,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.60 | optimizer_gradients: 0.76 | optimizer_step: 3.59 +[2024-12-31 18:46:17,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.19 | bwd_microstep: 256.33 | bwd_inner_microstep: 242.02 | bwd_allreduce_microstep: 14.19 | step_microstep: 12.19 +[2024-12-31 18:46:17,253] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2871.07 | bwd: 4248.30 | bwd_inner: 4232.75 | bwd_allreduce: 14.66 | step: 15.20 + 88%|████████▊ | 669/759 [1:36:12<11:05, 7.40s/it] {'loss': 1.1792, 'learning_rate': 7.288722862229691e-07, 'epoch': 0.88} + 88%|████████▊ | 669/759 [1:36:12<11:05, 7.40s/it][2024-12-31 18:46:17,778] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 199.62 | bwd_microstep: 304.97 | bwd_inner_microstep: 304.62 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:46:18,289] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.41 | bwd_microstep: 291.56 | bwd_inner_microstep: 291.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:46:18,749] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.30 | bwd_microstep: 264.05 | bwd_inner_microstep: 264.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:46:19,200] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.12 | bwd_microstep: 257.19 | bwd_inner_microstep: 257.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:46:19,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.89 | bwd_microstep: 254.74 | bwd_inner_microstep: 254.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:46:20,090] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.37 | bwd_microstep: 249.96 | bwd_inner_microstep: 249.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:20,528] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.10 | bwd_microstep: 248.13 | bwd_inner_microstep: 248.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:46:20,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.89 | bwd_microstep: 246.11 | bwd_inner_microstep: 246.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:46:21,396] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.58 | bwd_microstep: 246.39 | bwd_inner_microstep: 246.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:46:21,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.06 | bwd_microstep: 246.94 | bwd_inner_microstep: 246.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:46:22,261] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.55 | bwd_microstep: 245.08 | bwd_inner_microstep: 245.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:46:22,695] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.52 | bwd_microstep: 246.06 | bwd_inner_microstep: 246.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:23,132] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.66 | bwd_microstep: 249.17 | bwd_inner_microstep: 249.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:46:23,564] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.62 | bwd_microstep: 245.54 | bwd_inner_microstep: 245.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:46:23,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.60 | bwd_microstep: 243.19 | bwd_inner_microstep: 243.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:46:24,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.73 | optimizer_gradients: 0.61 | optimizer_step: 3.09 +[2024-12-31 18:46:24,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.35 | bwd_microstep: 343.66 | bwd_inner_microstep: 244.21 | bwd_allreduce_microstep: 99.41 | step_microstep: 15.20 +[2024-12-31 18:46:24,538] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2811.07 | bwd: 4182.84 | bwd_inner: 4082.55 | bwd_allreduce: 99.66 | step: 18.00 + 88%|████████▊ | 670/759 [1:36:19<10:55, 7.36s/it] {'loss': 1.2433, 'learning_rate': 7.12959322439295e-07, 'epoch': 0.88} + 88%|████████▊ | 670/759 [1:36:19<10:55, 7.36s/it][2024-12-31 18:46:25,118] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 217.62 | bwd_microstep: 344.79 | bwd_inner_microstep: 344.46 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:46:25,617] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.94 | bwd_microstep: 291.66 | bwd_inner_microstep: 291.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:46:26,078] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.05 | bwd_microstep: 262.96 | bwd_inner_microstep: 262.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:26,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.03 | bwd_microstep: 262.23 | bwd_inner_microstep: 262.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:46:26,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.50 | bwd_microstep: 258.33 | bwd_inner_microstep: 258.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:46:27,438] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.66 | bwd_microstep: 254.86 | bwd_inner_microstep: 254.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:46:27,888] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.05 | bwd_microstep: 261.18 | bwd_inner_microstep: 261.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:46:28,324] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.40 | bwd_microstep: 248.17 | bwd_inner_microstep: 248.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:28,796] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.15 | bwd_microstep: 283.76 | bwd_inner_microstep: 283.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:46:29,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.38 | bwd_microstep: 245.52 | bwd_inner_microstep: 245.19 | bwd_allreduce_microstep: 0.22 | step_microstep: 0.26 +[2024-12-31 18:46:29,668] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.67 | bwd_microstep: 243.83 | bwd_inner_microstep: 243.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:30,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.68 | bwd_microstep: 244.17 | bwd_inner_microstep: 244.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:30,531] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.59 | bwd_microstep: 244.54 | bwd_inner_microstep: 244.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:30,960] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.05 | bwd_microstep: 241.25 | bwd_inner_microstep: 241.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:46:31,418] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 196.87 | bwd_microstep: 243.00 | bwd_inner_microstep: 242.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:46:31,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.61 | optimizer_gradients: 0.67 | optimizer_step: 3.45 +[2024-12-31 18:46:31,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.90 | bwd_microstep: 257.82 | bwd_inner_microstep: 243.95 | bwd_allreduce_microstep: 13.75 | step_microstep: 11.23 +[2024-12-31 18:46:31,878] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2853.96 | bwd: 4188.23 | bwd_inner: 4173.24 | bwd_allreduce: 14.27 | step: 14.38 + 88%|████████▊ | 671/759 [1:36:26<10:47, 7.36s/it] {'loss': 1.2162, 'learning_rate': 6.972155666216684e-07, 'epoch': 0.88} + 88%|████████▊ | 671/759 [1:36:26<10:47, 7.36s/it][2024-12-31 18:46:32,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 204.67 | bwd_microstep: 316.70 | bwd_inner_microstep: 316.32 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:46:32,905] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.27 | bwd_microstep: 283.46 | bwd_inner_microstep: 283.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:46:33,419] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.12 | bwd_microstep: 260.92 | bwd_inner_microstep: 260.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:46:33,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.74 | bwd_microstep: 255.20 | bwd_inner_microstep: 255.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:46:34,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.81 | bwd_microstep: 249.27 | bwd_inner_microstep: 249.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:34,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.55 | bwd_microstep: 268.22 | bwd_inner_microstep: 268.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:35,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.47 | bwd_microstep: 247.18 | bwd_inner_microstep: 247.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:46:35,637] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.77 | bwd_microstep: 246.22 | bwd_inner_microstep: 246.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:46:36,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.89 | bwd_microstep: 244.82 | bwd_inner_microstep: 244.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:36,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.24 | bwd_microstep: 263.08 | bwd_inner_microstep: 263.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:46:36,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.85 | bwd_microstep: 245.28 | bwd_inner_microstep: 245.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:37,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.72 | bwd_microstep: 243.13 | bwd_inner_microstep: 243.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:37,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.94 | bwd_microstep: 243.21 | bwd_inner_microstep: 243.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:38,255] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.03 | bwd_microstep: 243.24 | bwd_inner_microstep: 243.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:38,686] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 242.97 | bwd_inner_microstep: 242.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:39,191] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.92 | optimizer_gradients: 0.65 | optimizer_step: 3.22 +[2024-12-31 18:46:39,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.36 | bwd_microstep: 309.73 | bwd_inner_microstep: 241.83 | bwd_allreduce_microstep: 67.86 | step_microstep: 11.27 +[2024-12-31 18:46:39,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2807.58 | bwd: 4162.68 | bwd_inner: 4094.05 | bwd_allreduce: 68.10 | step: 14.10 + 89%|████████▊ | 672/759 [1:36:34<10:38, 7.34s/it] {'loss': 1.2319, 'learning_rate': 6.816413056180748e-07, 'epoch': 0.89} + 89%|████████▊ | 672/759 [1:36:34<10:38, 7.34s/it][2024-12-31 18:46:39,807] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 229.53 | bwd_microstep: 370.52 | bwd_inner_microstep: 370.09 | bwd_allreduce_microstep: 0.18 | step_microstep: 0.27 +[2024-12-31 18:46:40,310] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.13 | bwd_microstep: 292.46 | bwd_inner_microstep: 292.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:40,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.44 | bwd_microstep: 266.15 | bwd_inner_microstep: 266.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:46:41,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.53 | bwd_microstep: 260.62 | bwd_inner_microstep: 260.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:41,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.75 | bwd_microstep: 257.35 | bwd_inner_microstep: 257.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:46:42,132] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.33 | bwd_microstep: 254.86 | bwd_inner_microstep: 254.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:46:42,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.97 | bwd_microstep: 249.35 | bwd_inner_microstep: 249.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:46:43,019] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.34 | bwd_microstep: 247.06 | bwd_inner_microstep: 247.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:46:43,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.40 | bwd_microstep: 245.77 | bwd_inner_microstep: 245.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:46:43,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.68 | bwd_microstep: 244.96 | bwd_inner_microstep: 244.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:46:44,319] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.67 | bwd_microstep: 244.92 | bwd_inner_microstep: 244.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:46:44,772] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.67 | bwd_microstep: 245.48 | bwd_inner_microstep: 245.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:46:45,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.16 | bwd_microstep: 255.05 | bwd_inner_microstep: 255.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:45,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.32 | bwd_microstep: 244.81 | bwd_inner_microstep: 244.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:46:46,110] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 278.80 | bwd_inner_microstep: 278.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:46:46,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.67 | optimizer_gradients: 0.57 | optimizer_step: 3.13 +[2024-12-31 18:46:46,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 158.23 | bwd_microstep: 455.32 | bwd_inner_microstep: 226.53 | bwd_allreduce_microstep: 228.74 | step_microstep: 11.12 +[2024-12-31 18:46:46,752] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2843.59 | bwd: 4413.53 | bwd_inner: 4183.93 | bwd_allreduce: 229.03 | step: 14.30 + 89%|████████▊ | 673/759 [1:36:41<10:37, 7.41s/it] {'loss': 1.2424, 'learning_rate': 6.662368231883388e-07, 'epoch': 0.89} + 89%|████████▊ | 673/759 [1:36:41<10:37, 7.41s/it][2024-12-31 18:46:47,339] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 219.11 | bwd_microstep: 352.60 | bwd_inner_microstep: 352.24 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:46:47,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 227.15 | bwd_microstep: 364.58 | bwd_inner_microstep: 364.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:46:48,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.81 | bwd_microstep: 268.99 | bwd_inner_microstep: 268.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:46:48,873] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.09 | bwd_microstep: 264.30 | bwd_inner_microstep: 264.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:46:49,332] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.94 | bwd_microstep: 263.27 | bwd_inner_microstep: 263.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:46:49,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.54 | bwd_microstep: 250.93 | bwd_inner_microstep: 250.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:46:50,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.80 | bwd_microstep: 247.95 | bwd_inner_microstep: 247.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:46:50,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.97 | bwd_microstep: 250.67 | bwd_inner_microstep: 250.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:51,110] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.95 | bwd_microstep: 247.53 | bwd_inner_microstep: 247.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:46:51,545] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.11 | bwd_microstep: 246.31 | bwd_inner_microstep: 246.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:46:51,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.49 | bwd_microstep: 243.52 | bwd_inner_microstep: 243.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:46:52,428] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.56 | bwd_microstep: 244.94 | bwd_inner_microstep: 244.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:46:52,855] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.02 | bwd_microstep: 242.23 | bwd_inner_microstep: 242.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:46:53,289] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 245.90 | bwd_inner_microstep: 245.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:53,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 244.81 | bwd_inner_microstep: 244.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:54,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.36 | optimizer_gradients: 0.56 | optimizer_step: 3.10 +[2024-12-31 18:46:54,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.55 | bwd_microstep: 497.74 | bwd_inner_microstep: 244.58 | bwd_allreduce_microstep: 253.12 | step_microstep: 11.23 +[2024-12-31 18:46:54,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2900.56 | bwd: 4476.31 | bwd_inner: 4222.43 | bwd_allreduce: 253.36 | step: 14.35 + 89%|████████▉ | 674/759 [1:36:49<10:36, 7.48s/it] {'loss': 1.2052, 'learning_rate': 6.510023999989501e-07, 'epoch': 0.89} + 89%|████████▉ | 674/759 [1:36:49<10:36, 7.48s/it][2024-12-31 18:46:55,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 228.07 | bwd_microstep: 372.69 | bwd_inner_microstep: 372.34 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.19 +[2024-12-31 18:46:55,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.42 | bwd_microstep: 309.39 | bwd_inner_microstep: 309.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:56,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.44 | bwd_microstep: 258.20 | bwd_inner_microstep: 258.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:56,453] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.93 | bwd_microstep: 252.39 | bwd_inner_microstep: 252.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:56,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.38 | bwd_microstep: 249.44 | bwd_inner_microstep: 249.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:46:57,334] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 250.11 | bwd_inner_microstep: 250.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:46:57,770] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.75 | bwd_microstep: 246.27 | bwd_inner_microstep: 246.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:58,206] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.77 | bwd_microstep: 246.63 | bwd_inner_microstep: 246.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:58,645] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.54 | bwd_microstep: 251.06 | bwd_inner_microstep: 250.72 | bwd_allreduce_microstep: 0.23 | step_microstep: 0.29 +[2024-12-31 18:46:59,078] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 242.86 | bwd_inner_microstep: 242.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:46:59,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.19 | bwd_microstep: 244.66 | bwd_inner_microstep: 244.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:46:59,955] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.86 | bwd_microstep: 255.63 | bwd_inner_microstep: 255.48 | bwd_allreduce_microstep: 0.04 | step_microstep: 0.21 +[2024-12-31 18:47:00,386] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.38 | bwd_microstep: 243.58 | bwd_inner_microstep: 243.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:47:00,820] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.81 | bwd_microstep: 243.09 | bwd_inner_microstep: 243.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:01,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.25 | bwd_microstep: 242.89 | bwd_inner_microstep: 242.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:01,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.89 | optimizer_gradients: 0.77 | optimizer_step: 3.44 +[2024-12-31 18:47:01,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 157.45 | bwd_microstep: 239.25 | bwd_inner_microstep: 225.46 | bwd_allreduce_microstep: 13.67 | step_microstep: 11.66 +[2024-12-31 18:47:01,675] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2811.03 | bwd: 4148.37 | bwd_inner: 4133.22 | bwd_allreduce: 14.28 | step: 14.82 + 89%|████████▉ | 675/759 [1:36:56<10:23, 7.42s/it] {'loss': 1.2292, 'learning_rate': 6.359383136179598e-07, 'epoch': 0.89} + 89%|████████▉ | 675/759 [1:36:56<10:23, 7.42s/it][2024-12-31 18:47:02,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 227.29 | bwd_microstep: 364.48 | bwd_inner_microstep: 364.13 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:47:02,782] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.09 | bwd_microstep: 290.59 | bwd_inner_microstep: 290.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:47:03,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.22 | bwd_microstep: 268.69 | bwd_inner_microstep: 268.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:47:03,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.71 | bwd_microstep: 261.69 | bwd_inner_microstep: 261.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:04,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.08 | bwd_microstep: 250.44 | bwd_inner_microstep: 250.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:47:04,603] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.55 | bwd_microstep: 264.40 | bwd_inner_microstep: 264.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:05,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.93 | bwd_microstep: 247.51 | bwd_inner_microstep: 247.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:47:05,482] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.02 | bwd_microstep: 255.36 | bwd_inner_microstep: 255.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:47:05,917] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.02 | bwd_microstep: 246.67 | bwd_inner_microstep: 246.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:47:06,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.68 | bwd_microstep: 245.21 | bwd_inner_microstep: 245.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:47:06,783] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.52 | bwd_microstep: 244.47 | bwd_inner_microstep: 244.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:47:07,217] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 244.59 | bwd_inner_microstep: 244.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:47:07,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.17 | bwd_microstep: 247.91 | bwd_inner_microstep: 247.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:08,102] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.91 | bwd_microstep: 251.79 | bwd_inner_microstep: 251.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:47:08,566] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.24 | bwd_microstep: 268.26 | bwd_inner_microstep: 268.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:47:09,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.72 | optimizer_gradients: 0.69 | optimizer_step: 3.72 +[2024-12-31 18:47:09,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.33 | bwd_microstep: 511.72 | bwd_inner_microstep: 243.18 | bwd_allreduce_microstep: 268.49 | step_microstep: 11.12 +[2024-12-31 18:47:09,275] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2848.64 | bwd: 4463.95 | bwd_inner: 4194.49 | bwd_allreduce: 268.78 | step: 14.33 + 89%|████████▉ | 676/759 [1:37:04<10:20, 7.47s/it] {'loss': 1.2327, 'learning_rate': 6.210448385099177e-07, 'epoch': 0.89} + 89%|████████▉ | 676/759 [1:37:04<10:20, 7.47s/it][2024-12-31 18:47:09,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.15 | bwd_microstep: 347.22 | bwd_inner_microstep: 346.85 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:47:10,348] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.56 | bwd_microstep: 286.41 | bwd_inner_microstep: 286.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:47:10,814] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.27 | bwd_microstep: 266.28 | bwd_inner_microstep: 266.02 | bwd_allreduce_microstep: 0.07 | step_microstep: 0.29 +[2024-12-31 18:47:11,276] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.98 | bwd_microstep: 263.76 | bwd_inner_microstep: 263.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:47:11,728] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.10 | bwd_microstep: 256.78 | bwd_inner_microstep: 256.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:47:12,168] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.71 | bwd_microstep: 248.83 | bwd_inner_microstep: 248.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:12,618] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.82 | bwd_microstep: 254.98 | bwd_inner_microstep: 254.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:47:13,067] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 184.98 | bwd_microstep: 247.75 | bwd_inner_microstep: 247.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:47:13,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.45 | bwd_microstep: 243.10 | bwd_inner_microstep: 243.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:47:13,930] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.05 | bwd_microstep: 243.69 | bwd_inner_microstep: 243.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:47:14,362] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.24 | bwd_microstep: 245.51 | bwd_inner_microstep: 245.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:47:14,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 244.14 | bwd_inner_microstep: 243.91 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:47:15,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.70 | bwd_microstep: 243.43 | bwd_inner_microstep: 243.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:15,665] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.63 | bwd_microstep: 243.04 | bwd_inner_microstep: 243.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:47:16,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 157.64 | bwd_microstep: 225.14 | bwd_inner_microstep: 225.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:47:16,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 7.04 | optimizer_gradients: 0.62 | optimizer_step: 3.23 +[2024-12-31 18:47:16,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.92 | bwd_microstep: 565.78 | bwd_inner_microstep: 243.65 | bwd_allreduce_microstep: 322.09 | step_microstep: 13.91 +[2024-12-31 18:47:16,834] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2834.96 | bwd: 4426.04 | bwd_inner: 4102.57 | bwd_allreduce: 322.62 | step: 17.01 + 89%|████████▉ | 677/759 [1:37:11<10:14, 7.50s/it] {'loss': 1.2206, 'learning_rate': 6.063222460308649e-07, 'epoch': 0.89} + 89%|████████▉ | 677/759 [1:37:11<10:14, 7.50s/it][2024-12-31 18:47:17,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 244.87 | bwd_microstep: 402.72 | bwd_inner_microstep: 402.36 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 18:47:17,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.97 | bwd_microstep: 281.94 | bwd_inner_microstep: 281.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:18,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.30 | bwd_microstep: 267.21 | bwd_inner_microstep: 267.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:18,910] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.85 | bwd_microstep: 263.36 | bwd_inner_microstep: 263.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:47:19,361] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.55 | bwd_microstep: 256.48 | bwd_inner_microstep: 256.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:47:19,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 185.66 | bwd_microstep: 253.27 | bwd_inner_microstep: 253.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:47:20,255] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.62 | bwd_microstep: 248.49 | bwd_inner_microstep: 248.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:20,694] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.66 | bwd_microstep: 249.64 | bwd_inner_microstep: 249.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:47:21,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.54 | bwd_microstep: 248.19 | bwd_inner_microstep: 248.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:47:21,566] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.03 | bwd_microstep: 245.42 | bwd_inner_microstep: 245.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:47:21,998] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.76 | bwd_microstep: 244.29 | bwd_inner_microstep: 244.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:47:22,433] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.42 | bwd_microstep: 243.50 | bwd_inner_microstep: 243.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:47:22,867] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.81 | bwd_microstep: 244.41 | bwd_inner_microstep: 244.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:47:23,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.93 | bwd_microstep: 243.44 | bwd_inner_microstep: 243.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:47:23,726] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.83 | bwd_microstep: 243.32 | bwd_inner_microstep: 243.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:24,209] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.87 | optimizer_gradients: 0.84 | optimizer_step: 15.75 +[2024-12-31 18:47:24,210] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.92 | bwd_microstep: 255.34 | bwd_inner_microstep: 241.63 | bwd_allreduce_microstep: 13.61 | step_microstep: 35.00 +[2024-12-31 18:47:24,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2870.15 | bwd: 4191.14 | bwd_inner: 4176.62 | bwd_allreduce: 13.90 | step: 38.08 + 89%|████████▉ | 678/759 [1:37:19<10:04, 7.46s/it] {'loss': 1.2348, 'learning_rate': 5.917708044234017e-07, 'epoch': 0.89} + 89%|████████▉ | 678/759 [1:37:19<10:04, 7.46s/it][2024-12-31 18:47:24,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.74 | bwd_microstep: 340.20 | bwd_inner_microstep: 339.87 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:47:25,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.16 | bwd_microstep: 297.92 | bwd_inner_microstep: 297.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:47:25,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.93 | bwd_microstep: 258.40 | bwd_inner_microstep: 258.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:47:26,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.27 | bwd_microstep: 261.66 | bwd_inner_microstep: 261.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:47:26,674] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.30 | bwd_microstep: 263.43 | bwd_inner_microstep: 263.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:47:27,113] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.93 | bwd_microstep: 250.13 | bwd_inner_microstep: 250.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:47:27,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.52 | bwd_microstep: 260.30 | bwd_inner_microstep: 260.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:47:28,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.95 | bwd_microstep: 248.17 | bwd_inner_microstep: 248.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:47:28,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 244.33 | bwd_inner_microstep: 244.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:28,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.84 | bwd_microstep: 245.10 | bwd_inner_microstep: 245.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:47:29,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.56 | bwd_microstep: 245.40 | bwd_inner_microstep: 245.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:29,738] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 243.32 | bwd_inner_microstep: 243.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:30,169] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.83 | bwd_microstep: 243.97 | bwd_inner_microstep: 243.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:47:30,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 243.70 | bwd_inner_microstep: 243.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:31,047] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 186.86 | bwd_microstep: 241.58 | bwd_inner_microstep: 241.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:31,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.61 | optimizer_gradients: 0.68 | optimizer_step: 3.46 +[2024-12-31 18:47:31,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.10 | bwd_microstep: 260.59 | bwd_inner_microstep: 244.54 | bwd_allreduce_microstep: 15.92 | step_microstep: 11.58 +[2024-12-31 18:47:31,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2845.53 | bwd: 4148.39 | bwd_inner: 4131.39 | bwd_allreduce: 16.23 | step: 14.72 + 89%|████████▉ | 679/759 [1:37:26<09:53, 7.41s/it] {'loss': 1.2034, 'learning_rate': 5.77390778811796e-07, 'epoch': 0.89} + 89%|████████▉ | 679/759 [1:37:26<09:53, 7.41s/it][2024-12-31 18:47:32,139] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.40 | bwd_microstep: 337.44 | bwd_inner_microstep: 337.06 | bwd_allreduce_microstep: 0.17 | step_microstep: 0.21 +[2024-12-31 18:47:32,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.41 | bwd_microstep: 283.94 | bwd_inner_microstep: 283.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:47:33,099] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.69 | bwd_microstep: 266.70 | bwd_inner_microstep: 266.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:47:33,547] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.39 | bwd_microstep: 254.57 | bwd_inner_microstep: 254.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:47:33,992] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.85 | bwd_microstep: 254.97 | bwd_inner_microstep: 254.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:34,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.62 | bwd_microstep: 245.66 | bwd_inner_microstep: 245.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:47:34,864] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.71 | bwd_microstep: 248.63 | bwd_inner_microstep: 248.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.11 +[2024-12-31 18:47:35,301] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.72 | bwd_microstep: 245.14 | bwd_inner_microstep: 245.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:35,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.71 | bwd_microstep: 245.50 | bwd_inner_microstep: 245.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:36,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 244.14 | bwd_inner_microstep: 244.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:36,604] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.33 | bwd_microstep: 243.22 | bwd_inner_microstep: 243.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:47:37,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.82 | bwd_microstep: 241.39 | bwd_inner_microstep: 241.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:47:37,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.51 | bwd_microstep: 243.38 | bwd_inner_microstep: 243.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:37,898] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.43 | bwd_microstep: 242.29 | bwd_inner_microstep: 242.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:38,325] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.14 | bwd_microstep: 241.50 | bwd_inner_microstep: 241.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:47:38,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.64 | optimizer_gradients: 0.59 | optimizer_step: 3.24 +[2024-12-31 18:47:38,806] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.85 | bwd_microstep: 283.00 | bwd_inner_microstep: 243.79 | bwd_allreduce_microstep: 39.16 | step_microstep: 12.81 +[2024-12-31 18:47:38,807] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2833.05 | bwd: 4121.49 | bwd_inner: 4081.53 | bwd_allreduce: 39.44 | step: 15.71 + 90%|████████▉ | 680/759 [1:37:33<09:42, 7.38s/it] {'loss': 1.2399, 'learning_rate': 5.631824311971456e-07, 'epoch': 0.9} + 90%|████████▉ | 680/759 [1:37:33<09:42, 7.38s/it][2024-12-31 18:47:39,474] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 244.66 | bwd_microstep: 401.39 | bwd_inner_microstep: 401.03 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:47:40,010] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 199.03 | bwd_microstep: 320.79 | bwd_inner_microstep: 320.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:47:40,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.42 | bwd_microstep: 288.76 | bwd_inner_microstep: 288.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:41,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.20 | bwd_microstep: 286.42 | bwd_inner_microstep: 286.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:47:41,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.71 | bwd_microstep: 263.42 | bwd_inner_microstep: 263.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:41,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.74 | bwd_microstep: 255.27 | bwd_inner_microstep: 255.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:42,351] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.51 | bwd_microstep: 247.98 | bwd_inner_microstep: 247.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:42,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.95 | bwd_microstep: 248.43 | bwd_inner_microstep: 248.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:47:43,227] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.86 | bwd_microstep: 250.03 | bwd_inner_microstep: 250.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:43,662] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.24 | bwd_microstep: 245.74 | bwd_inner_microstep: 245.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:44,093] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.52 | bwd_microstep: 243.72 | bwd_inner_microstep: 243.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:44,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 243.27 | bwd_inner_microstep: 243.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:47:44,956] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.42 | bwd_microstep: 244.01 | bwd_inner_microstep: 243.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:47:45,385] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.17 | bwd_microstep: 242.97 | bwd_inner_microstep: 242.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:47:45,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 244.23 | bwd_inner_microstep: 244.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:47:46,285] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.56 | optimizer_gradients: 0.64 | optimizer_step: 3.38 +[2024-12-31 18:47:46,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.68 | bwd_microstep: 263.66 | bwd_inner_microstep: 249.99 | bwd_allreduce_microstep: 13.57 | step_microstep: 10.81 +[2024-12-31 18:47:46,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2890.65 | bwd: 4290.24 | bwd_inner: 4275.73 | bwd_allreduce: 13.86 | step: 13.87 + 90%|████████▉ | 681/759 [1:37:41<09:37, 7.41s/it] {'loss': 1.1637, 'learning_rate': 5.491460204526156e-07, 'epoch': 0.9} + 90%|████████▉ | 681/759 [1:37:41<09:37, 7.41s/it][2024-12-31 18:47:46,821] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.20 | bwd_microstep: 311.55 | bwd_inner_microstep: 311.19 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:47:47,519] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 254.24 | bwd_microstep: 425.78 | bwd_inner_microstep: 425.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:47:48,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.45 | bwd_microstep: 288.15 | bwd_inner_microstep: 288.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:47:48,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.11 | bwd_microstep: 262.89 | bwd_inner_microstep: 262.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:47:48,924] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.08 | bwd_microstep: 257.64 | bwd_inner_microstep: 257.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:49,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.48 | bwd_microstep: 260.59 | bwd_inner_microstep: 260.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:49,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.58 | bwd_microstep: 250.09 | bwd_inner_microstep: 250.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:50,257] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.88 | bwd_microstep: 249.81 | bwd_inner_microstep: 249.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:50,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 246.26 | bwd_inner_microstep: 246.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:51,126] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 245.84 | bwd_inner_microstep: 245.42 | bwd_allreduce_microstep: 0.24 | step_microstep: 0.28 +[2024-12-31 18:47:51,577] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.23 | bwd_microstep: 261.71 | bwd_inner_microstep: 261.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:47:52,010] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.21 | bwd_microstep: 244.55 | bwd_inner_microstep: 244.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:47:52,444] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.76 | bwd_microstep: 243.48 | bwd_inner_microstep: 243.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:52,882] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.35 | bwd_microstep: 243.87 | bwd_inner_microstep: 243.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.09 +[2024-12-31 18:47:53,312] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.29 | bwd_microstep: 244.66 | bwd_inner_microstep: 244.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:47:53,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.04 | optimizer_gradients: 0.79 | optimizer_step: 3.23 +[2024-12-31 18:47:53,940] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 159.98 | bwd_microstep: 437.40 | bwd_inner_microstep: 228.99 | bwd_allreduce_microstep: 208.36 | step_microstep: 14.37 +[2024-12-31 18:47:53,941] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2884.19 | bwd: 4474.44 | bwd_inner: 4264.72 | bwd_allreduce: 208.87 | step: 17.31 + 90%|████████▉ | 682/759 [1:37:48<09:36, 7.48s/it] {'loss': 1.1979, 'learning_rate': 5.352818023187167e-07, 'epoch': 0.9} + 90%|████████▉ | 682/759 [1:37:48<09:36, 7.48s/it][2024-12-31 18:47:54,472] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.49 | bwd_microstep: 314.00 | bwd_inner_microstep: 313.65 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:47:54,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.05 | bwd_microstep: 282.89 | bwd_inner_microstep: 282.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:55,469] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 219.86 | bwd_microstep: 267.91 | bwd_inner_microstep: 267.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:55,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.42 | bwd_microstep: 256.35 | bwd_inner_microstep: 256.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:47:56,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.74 | bwd_microstep: 254.67 | bwd_inner_microstep: 254.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:47:56,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.09 | bwd_microstep: 267.27 | bwd_inner_microstep: 267.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:47:57,260] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.19 | bwd_microstep: 247.13 | bwd_inner_microstep: 247.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:47:57,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.64 | bwd_microstep: 249.18 | bwd_inner_microstep: 249.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:47:58,134] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.85 | bwd_microstep: 245.02 | bwd_inner_microstep: 244.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:47:58,581] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.78 | bwd_microstep: 246.00 | bwd_inner_microstep: 245.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:47:59,012] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.42 | bwd_microstep: 242.88 | bwd_inner_microstep: 242.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:47:59,443] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.25 | bwd_microstep: 244.12 | bwd_inner_microstep: 243.94 | bwd_allreduce_microstep: 0.08 | step_microstep: 0.23 +[2024-12-31 18:47:59,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.91 | bwd_microstep: 243.09 | bwd_inner_microstep: 243.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:00,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.02 | bwd_microstep: 241.23 | bwd_inner_microstep: 241.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:48:00,740] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.22 | bwd_microstep: 243.40 | bwd_inner_microstep: 243.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:48:01,475] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.86 | optimizer_gradients: 0.64 | optimizer_step: 3.12 +[2024-12-31 18:48:01,476] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.56 | bwd_microstep: 537.65 | bwd_inner_microstep: 244.92 | bwd_allreduce_microstep: 292.68 | step_microstep: 11.62 +[2024-12-31 18:48:01,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2850.95 | bwd: 4382.92 | bwd_inner: 4089.19 | bwd_allreduce: 293.05 | step: 14.79 + 90%|████████▉ | 683/759 [1:37:56<09:29, 7.50s/it] {'loss': 1.233, 'learning_rate': 5.215900293986431e-07, 'epoch': 0.9} + 90%|████████▉ | 683/759 [1:37:56<09:29, 7.50s/it][2024-12-31 18:48:02,051] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 217.82 | bwd_microstep: 343.45 | bwd_inner_microstep: 343.23 | bwd_allreduce_microstep: 0.09 | step_microstep: 0.12 +[2024-12-31 18:48:02,861] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 288.47 | bwd_microstep: 508.44 | bwd_inner_microstep: 508.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:48:03,325] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.95 | bwd_microstep: 267.69 | bwd_inner_microstep: 267.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:03,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.09 | bwd_microstep: 262.68 | bwd_inner_microstep: 262.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:48:04,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.65 | bwd_microstep: 252.94 | bwd_inner_microstep: 252.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:48:04,664] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.68 | bwd_microstep: 245.13 | bwd_inner_microstep: 245.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 18:48:05,115] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.62 | bwd_microstep: 262.32 | bwd_inner_microstep: 262.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:48:05,550] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.38 | bwd_microstep: 247.13 | bwd_inner_microstep: 247.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:48:05,990] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.91 | bwd_microstep: 252.82 | bwd_inner_microstep: 252.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:48:06,421] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.71 | bwd_microstep: 243.40 | bwd_inner_microstep: 243.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:06,850] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.60 | bwd_microstep: 242.51 | bwd_inner_microstep: 242.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:48:07,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 244.14 | bwd_inner_microstep: 244.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:48:07,716] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.52 | bwd_microstep: 247.85 | bwd_inner_microstep: 247.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:48:08,146] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.22 | bwd_microstep: 243.30 | bwd_inner_microstep: 243.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:48:08,601] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.93 | bwd_microstep: 256.98 | bwd_inner_microstep: 256.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:48:09,069] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.77 | optimizer_gradients: 0.62 | optimizer_step: 3.37 +[2024-12-31 18:48:09,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.83 | bwd_microstep: 257.24 | bwd_inner_microstep: 243.63 | bwd_allreduce_microstep: 13.52 | step_microstep: 11.78 +[2024-12-31 18:48:09,072] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2928.14 | bwd: 4378.07 | bwd_inner: 4363.83 | bwd_allreduce: 13.75 | step: 14.69 + 90%|█████████ | 684/759 [1:38:04<09:24, 7.53s/it] {'loss': 1.2375, 'learning_rate': 5.08070951153673e-07, 'epoch': 0.9} + 90%|█████████ | 684/759 [1:38:04<09:24, 7.53s/it][2024-12-31 18:48:09,648] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 218.59 | bwd_microstep: 344.11 | bwd_inner_microstep: 343.75 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.19 +[2024-12-31 18:48:10,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 220.97 | bwd_microstep: 348.08 | bwd_inner_microstep: 348.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:48:10,722] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 196.74 | bwd_microstep: 281.36 | bwd_inner_microstep: 281.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:48:11,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.12 | bwd_microstep: 256.72 | bwd_inner_microstep: 256.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:48:11,622] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.89 | bwd_microstep: 254.50 | bwd_inner_microstep: 254.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:12,057] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.37 | bwd_microstep: 247.47 | bwd_inner_microstep: 247.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:48:12,500] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.24 | bwd_microstep: 250.97 | bwd_inner_microstep: 250.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:48:12,936] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.74 | bwd_microstep: 245.27 | bwd_inner_microstep: 245.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:48:13,373] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.33 | bwd_microstep: 248.83 | bwd_inner_microstep: 247.81 | bwd_allreduce_microstep: 0.32 | step_microstep: 0.62 +[2024-12-31 18:48:13,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.21 | bwd_microstep: 245.72 | bwd_inner_microstep: 245.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:48:14,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.36 | bwd_microstep: 244.83 | bwd_inner_microstep: 244.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:48:14,663] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.89 | bwd_microstep: 243.64 | bwd_inner_microstep: 243.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:15,094] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.86 | bwd_microstep: 243.66 | bwd_inner_microstep: 243.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:15,539] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.48 | bwd_microstep: 258.32 | bwd_inner_microstep: 258.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:48:15,964] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.10 | bwd_microstep: 240.85 | bwd_inner_microstep: 240.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:48:16,421] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.79 | optimizer_gradients: 0.64 | optimizer_step: 3.30 +[2024-12-31 18:48:16,422] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.78 | bwd_microstep: 263.52 | bwd_inner_microstep: 249.68 | bwd_allreduce_microstep: 13.71 | step_microstep: 11.24 +[2024-12-31 18:48:16,424] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2863.09 | bwd: 4218.01 | bwd_inner: 4202.58 | bwd_allreduce: 14.30 | step: 14.32 + 90%|█████████ | 685/759 [1:38:11<09:13, 7.47s/it] {'loss': 1.2333, 'learning_rate': 4.947248138986249e-07, 'epoch': 0.9} + 90%|█████████ | 685/759 [1:38:11<09:13, 7.47s/it][2024-12-31 18:48:17,018] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 225.14 | bwd_microstep: 353.11 | bwd_inner_microstep: 352.75 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:48:17,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.77 | bwd_microstep: 292.11 | bwd_inner_microstep: 292.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:17,986] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.21 | bwd_microstep: 267.56 | bwd_inner_microstep: 267.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:18,457] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.47 | bwd_microstep: 267.41 | bwd_inner_microstep: 267.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:48:18,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.86 | bwd_microstep: 261.47 | bwd_inner_microstep: 261.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:48:19,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.03 | bwd_microstep: 260.21 | bwd_inner_microstep: 260.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:48:19,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.84 | bwd_microstep: 248.08 | bwd_inner_microstep: 248.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:48:20,242] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.90 | bwd_microstep: 248.69 | bwd_inner_microstep: 248.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:48:20,685] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.48 | bwd_microstep: 246.60 | bwd_inner_microstep: 246.24 | bwd_allreduce_microstep: 0.25 | step_microstep: 0.27 +[2024-12-31 18:48:21,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.74 | bwd_microstep: 246.08 | bwd_inner_microstep: 246.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:21,556] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.71 | bwd_microstep: 244.96 | bwd_inner_microstep: 244.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:48:21,990] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 247.50 | bwd_inner_microstep: 247.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:48:22,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.17 | bwd_microstep: 244.32 | bwd_inner_microstep: 244.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:48:22,856] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 245.44 | bwd_inner_microstep: 245.26 | bwd_allreduce_microstep: 0.07 | step_microstep: 0.21 +[2024-12-31 18:48:23,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.90 | bwd_microstep: 245.02 | bwd_inner_microstep: 244.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:23,838] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.56 | optimizer_gradients: 0.73 | optimizer_step: 3.26 +[2024-12-31 18:48:23,839] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 158.30 | bwd_microstep: 366.84 | bwd_inner_microstep: 225.75 | bwd_allreduce_microstep: 141.05 | step_microstep: 10.64 +[2024-12-31 18:48:23,840] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2844.81 | bwd: 4285.53 | bwd_inner: 4143.15 | bwd_allreduce: 141.65 | step: 13.66 + 90%|█████████ | 686/759 [1:38:18<09:04, 7.46s/it] {'loss': 1.2321, 'learning_rate': 4.81551860797369e-07, 'epoch': 0.9} + 90%|█████████ | 686/759 [1:38:18<09:04, 7.46s/it][2024-12-31 18:48:24,370] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.65 | bwd_microstep: 312.71 | bwd_inner_microstep: 312.35 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:48:24,890] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 216.75 | bwd_microstep: 286.00 | bwd_inner_microstep: 285.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:48:25,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.54 | bwd_microstep: 266.98 | bwd_inner_microstep: 266.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:48:25,797] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.72 | bwd_microstep: 250.89 | bwd_inner_microstep: 250.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:48:26,247] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.71 | bwd_microstep: 250.27 | bwd_inner_microstep: 250.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:48:26,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.42 | bwd_microstep: 247.24 | bwd_inner_microstep: 247.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:27,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 195.05 | bwd_microstep: 253.37 | bwd_inner_microstep: 253.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:48:27,584] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.83 | bwd_microstep: 245.58 | bwd_inner_microstep: 245.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:48:28,016] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 244.99 | bwd_inner_microstep: 244.85 | bwd_allreduce_microstep: 0.03 | step_microstep: 0.25 +[2024-12-31 18:48:28,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.28 | bwd_microstep: 243.50 | bwd_inner_microstep: 243.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:48:28,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.79 | bwd_microstep: 242.94 | bwd_inner_microstep: 242.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:48:29,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.46 | bwd_microstep: 243.59 | bwd_inner_microstep: 243.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:29,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 244.32 | bwd_inner_microstep: 244.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:30,175] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.81 | bwd_microstep: 246.74 | bwd_inner_microstep: 246.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:48:30,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.64 | bwd_microstep: 241.13 | bwd_inner_microstep: 241.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:48:31,142] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.68 | optimizer_step: 3.16 +[2024-12-31 18:48:31,142] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.30 | bwd_microstep: 341.34 | bwd_inner_microstep: 244.31 | bwd_allreduce_microstep: 96.99 | step_microstep: 10.49 +[2024-12-31 18:48:31,143] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2852.36 | bwd: 4161.70 | bwd_inner: 4063.66 | bwd_allreduce: 97.31 | step: 13.68 + 91%|█████████ | 687/759 [1:38:26<08:53, 7.41s/it] {'loss': 1.2235, 'learning_rate': 4.6855233185839175e-07, 'epoch': 0.91} + 91%|█████████ | 687/759 [1:38:26<08:53, 7.41s/it][2024-12-31 18:48:31,674] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.61 | bwd_microstep: 314.75 | bwd_inner_microstep: 314.39 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:48:32,171] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.78 | bwd_microstep: 286.61 | bwd_inner_microstep: 286.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:48:32,678] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.82 | bwd_microstep: 309.16 | bwd_inner_microstep: 309.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 18:48:33,129] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.26 | bwd_microstep: 257.22 | bwd_inner_microstep: 257.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:48:33,566] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.35 | bwd_microstep: 247.95 | bwd_inner_microstep: 247.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:48:34,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.43 | bwd_microstep: 246.67 | bwd_inner_microstep: 246.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:34,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.82 | bwd_microstep: 245.65 | bwd_inner_microstep: 245.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:34,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.39 | bwd_microstep: 245.30 | bwd_inner_microstep: 245.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:48:35,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.87 | bwd_microstep: 243.99 | bwd_inner_microstep: 243.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:35,736] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.03 | bwd_microstep: 242.91 | bwd_inner_microstep: 242.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:48:36,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.55 | bwd_microstep: 257.70 | bwd_inner_microstep: 257.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:48:36,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.37 | bwd_microstep: 243.73 | bwd_inner_microstep: 243.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:48:37,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 242.94 | bwd_inner_microstep: 242.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:48:37,474] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 244.89 | bwd_inner_microstep: 244.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:37,910] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.24 | bwd_microstep: 249.54 | bwd_inner_microstep: 249.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:48:38,375] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.51 | optimizer_gradients: 0.59 | optimizer_step: 3.27 +[2024-12-31 18:48:38,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 264.84 | bwd_inner_microstep: 245.00 | bwd_allreduce_microstep: 19.75 | step_microstep: 21.79 +[2024-12-31 18:48:38,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2795.59 | bwd: 4143.92 | bwd_inner: 4123.34 | bwd_allreduce: 20.02 | step: 24.62 + 91%|█████████ | 688/759 [1:38:33<08:42, 7.36s/it] {'loss': 1.2285, 'learning_rate': 4.557264639304315e-07, 'epoch': 0.91} + 91%|█████████ | 688/759 [1:38:33<08:42, 7.36s/it][2024-12-31 18:48:38,949] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 216.70 | bwd_microstep: 342.88 | bwd_inner_microstep: 342.54 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:48:39,445] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.59 | bwd_microstep: 287.00 | bwd_inner_microstep: 286.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:48:39,903] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.08 | bwd_microstep: 263.21 | bwd_inner_microstep: 263.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:40,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.88 | bwd_microstep: 256.20 | bwd_inner_microstep: 256.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:48:40,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.89 | bwd_microstep: 254.79 | bwd_inner_microstep: 254.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:41,243] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.44 | bwd_microstep: 248.84 | bwd_inner_microstep: 248.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:41,691] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.13 | bwd_microstep: 254.26 | bwd_inner_microstep: 254.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:48:42,122] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.85 | bwd_microstep: 244.69 | bwd_inner_microstep: 244.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:42,563] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.31 | bwd_microstep: 252.46 | bwd_inner_microstep: 252.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:48:43,007] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.43 | bwd_microstep: 243.98 | bwd_inner_microstep: 243.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:43,438] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.06 | bwd_microstep: 243.32 | bwd_inner_microstep: 243.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:48:43,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.66 | bwd_microstep: 244.39 | bwd_inner_microstep: 244.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:48:44,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.39 | bwd_microstep: 244.11 | bwd_inner_microstep: 244.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:48:44,746] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.67 | bwd_microstep: 246.30 | bwd_inner_microstep: 246.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:45,172] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.99 | bwd_microstep: 240.61 | bwd_inner_microstep: 240.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:48:45,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.74 | optimizer_step: 7.00 +[2024-12-31 18:48:45,903] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.32 | bwd_microstep: 535.71 | bwd_inner_microstep: 243.76 | bwd_allreduce_microstep: 291.91 | step_microstep: 16.85 +[2024-12-31 18:48:45,904] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2832.84 | bwd: 4402.80 | bwd_inner: 4110.12 | bwd_allreduce: 292.15 | step: 19.64 + 91%|█████████ | 689/759 [1:38:40<08:38, 7.41s/it] {'loss': 1.2414, 'learning_rate': 4.430744906981577e-07, 'epoch': 0.91} + 91%|█████████ | 689/759 [1:38:40<08:38, 7.41s/it][2024-12-31 18:48:46,581] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 249.08 | bwd_microstep: 413.46 | bwd_inner_microstep: 413.11 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:48:47,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.35 | bwd_microstep: 267.58 | bwd_inner_microstep: 267.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:47,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.41 | bwd_microstep: 267.86 | bwd_inner_microstep: 267.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:47,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.37 | bwd_microstep: 256.84 | bwd_inner_microstep: 256.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:48,424] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.75 | bwd_microstep: 254.48 | bwd_inner_microstep: 254.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:48:48,858] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.09 | bwd_microstep: 245.89 | bwd_inner_microstep: 245.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:49,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.98 | bwd_microstep: 245.35 | bwd_inner_microstep: 245.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:49,723] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.23 | bwd_microstep: 245.31 | bwd_inner_microstep: 245.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:50,156] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.42 | bwd_microstep: 244.99 | bwd_inner_microstep: 244.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:48:50,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.94 | bwd_microstep: 244.64 | bwd_inner_microstep: 244.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:48:51,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.08 | bwd_microstep: 243.76 | bwd_inner_microstep: 243.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:48:51,463] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.69 | bwd_microstep: 250.99 | bwd_inner_microstep: 250.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:48:51,892] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.17 | bwd_microstep: 242.43 | bwd_inner_microstep: 242.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:52,322] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.72 | bwd_microstep: 242.85 | bwd_inner_microstep: 242.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:52,755] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.86 | bwd_microstep: 242.22 | bwd_inner_microstep: 242.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:48:53,210] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.81 | optimizer_gradients: 0.69 | optimizer_step: 3.32 +[2024-12-31 18:48:53,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 257.64 | bwd_inner_microstep: 243.95 | bwd_allreduce_microstep: 13.58 | step_microstep: 11.16 +[2024-12-31 18:48:53,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2846.54 | bwd: 4166.40 | bwd_inner: 4151.92 | bwd_allreduce: 13.86 | step: 14.15 + 91%|█████████ | 690/759 [1:38:48<08:29, 7.38s/it] {'loss': 1.2246, 'learning_rate': 4.305966426779118e-07, 'epoch': 0.91} + 91%|█████████ | 690/759 [1:38:48<08:29, 7.38s/it][2024-12-31 18:48:53,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 222.09 | bwd_microstep: 354.30 | bwd_inner_microstep: 353.93 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 18:48:54,267] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.15 | bwd_microstep: 265.52 | bwd_inner_microstep: 265.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:54,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.19 | bwd_microstep: 268.54 | bwd_inner_microstep: 268.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:55,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.65 | bwd_microstep: 264.39 | bwd_inner_microstep: 264.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:55,638] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.53 | bwd_microstep: 255.09 | bwd_inner_microstep: 255.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:56,075] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.50 | bwd_microstep: 248.12 | bwd_inner_microstep: 248.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:56,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.66 | bwd_microstep: 253.64 | bwd_inner_microstep: 253.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:56,954] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.68 | bwd_microstep: 246.43 | bwd_inner_microstep: 246.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:57,386] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 244.87 | bwd_inner_microstep: 244.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:48:57,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.98 | bwd_microstep: 243.61 | bwd_inner_microstep: 243.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:48:58,252] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 244.77 | bwd_inner_microstep: 244.51 | bwd_allreduce_microstep: 0.16 | step_microstep: 0.24 +[2024-12-31 18:48:58,686] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.76 | bwd_microstep: 244.45 | bwd_inner_microstep: 244.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:59,119] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.21 | bwd_microstep: 245.45 | bwd_inner_microstep: 245.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:48:59,549] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.49 | bwd_microstep: 244.07 | bwd_inner_microstep: 244.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:48:59,984] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.42 | bwd_microstep: 246.89 | bwd_inner_microstep: 246.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:00,530] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.20 | optimizer_gradients: 0.65 | optimizer_step: 3.13 +[2024-12-31 18:49:00,531] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.65 | bwd_microstep: 348.25 | bwd_inner_microstep: 242.02 | bwd_allreduce_microstep: 106.18 | step_microstep: 11.60 +[2024-12-31 18:49:00,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2815.93 | bwd: 4218.53 | bwd_inner: 4111.22 | bwd_allreduce: 106.62 | step: 14.53 + 91%|█████████ | 691/759 [1:38:55<08:20, 7.36s/it] {'loss': 1.23, 'learning_rate': 4.1829314721351213e-07, 'epoch': 0.91} + 91%|█████████ | 691/759 [1:38:55<08:20, 7.36s/it][2024-12-31 18:49:01,137] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 227.00 | bwd_microstep: 362.66 | bwd_inner_microstep: 362.54 | bwd_allreduce_microstep: 0.05 | step_microstep: 0.08 +[2024-12-31 18:49:01,635] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 195.76 | bwd_microstep: 291.26 | bwd_inner_microstep: 291.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:02,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.99 | bwd_microstep: 256.00 | bwd_inner_microstep: 255.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:49:02,519] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.84 | bwd_microstep: 248.95 | bwd_inner_microstep: 248.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:49:02,955] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.34 | bwd_microstep: 246.97 | bwd_inner_microstep: 246.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:49:03,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.67 | bwd_microstep: 247.29 | bwd_inner_microstep: 247.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:49:03,826] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.33 | bwd_microstep: 245.58 | bwd_inner_microstep: 245.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:49:04,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.84 | bwd_microstep: 245.20 | bwd_inner_microstep: 245.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:49:04,688] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.57 | bwd_microstep: 244.64 | bwd_inner_microstep: 244.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:05,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.02 | bwd_microstep: 246.17 | bwd_inner_microstep: 246.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:49:05,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.88 | bwd_microstep: 243.96 | bwd_inner_microstep: 243.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:49:05,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.94 | bwd_microstep: 242.56 | bwd_inner_microstep: 242.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:06,412] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.73 | bwd_microstep: 241.88 | bwd_inner_microstep: 241.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:49:06,837] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.79 | bwd_microstep: 240.57 | bwd_inner_microstep: 240.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:49:07,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.10 | bwd_microstep: 242.67 | bwd_inner_microstep: 242.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:49:07,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.64 | optimizer_gradients: 0.56 | optimizer_step: 3.09 +[2024-12-31 18:49:07,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.15 | bwd_microstep: 315.13 | bwd_inner_microstep: 243.26 | bwd_allreduce_microstep: 71.82 | step_microstep: 12.73 +[2024-12-31 18:49:07,788] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2829.46 | bwd: 4161.60 | bwd_inner: 4089.03 | bwd_allreduce: 72.00 | step: 15.29 + 91%|█████████ | 692/759 [1:39:02<08:11, 7.33s/it] {'loss': 1.2381, 'learning_rate': 4.0616422847211013e-07, 'epoch': 0.91} + 91%|█████████ | 692/759 [1:39:02<08:11, 7.33s/it][2024-12-31 18:49:08,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 224.45 | bwd_microstep: 356.98 | bwd_inner_microstep: 356.62 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:49:08,909] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 199.23 | bwd_microstep: 299.05 | bwd_inner_microstep: 299.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:49:09,405] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.56 | bwd_microstep: 290.07 | bwd_inner_microstep: 290.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:49:09,887] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.99 | bwd_microstep: 284.00 | bwd_inner_microstep: 283.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:49:10,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.59 | bwd_microstep: 266.58 | bwd_inner_microstep: 266.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:49:10,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.60 | bwd_microstep: 255.05 | bwd_inner_microstep: 255.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:11,260] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.27 | bwd_microstep: 262.43 | bwd_inner_microstep: 262.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:49:11,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.30 | bwd_microstep: 252.73 | bwd_inner_microstep: 252.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:49:12,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.23 | bwd_microstep: 247.06 | bwd_inner_microstep: 247.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:49:12,548] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.45 | bwd_microstep: 246.05 | bwd_inner_microstep: 246.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:49:12,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.54 | bwd_microstep: 244.41 | bwd_inner_microstep: 244.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:49:13,419] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.82 | bwd_microstep: 243.11 | bwd_inner_microstep: 243.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:13,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.08 | bwd_microstep: 244.73 | bwd_inner_microstep: 244.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:14,283] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.66 | bwd_microstep: 243.56 | bwd_inner_microstep: 243.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:49:14,716] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 245.13 | bwd_inner_microstep: 245.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:15,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.50 | optimizer_gradients: 0.60 | optimizer_step: 3.19 +[2024-12-31 18:49:15,182] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 267.56 | bwd_inner_microstep: 244.04 | bwd_allreduce_microstep: 23.45 | step_microstep: 12.42 +[2024-12-31 18:49:15,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2882.58 | bwd: 4248.64 | bwd_inner: 4224.25 | bwd_allreduce: 23.74 | step: 14.95 + 91%|█████████▏| 693/759 [1:39:10<08:05, 7.35s/it] {'loss': 1.2074, 'learning_rate': 3.942101074401028e-07, 'epoch': 0.91} + 91%|█████████▏| 693/759 [1:39:10<08:05, 7.35s/it][2024-12-31 18:49:15,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 222.32 | bwd_microstep: 355.12 | bwd_inner_microstep: 354.76 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:49:16,275] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.30 | bwd_microstep: 291.54 | bwd_inner_microstep: 291.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:16,754] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.18 | bwd_microstep: 278.38 | bwd_inner_microstep: 278.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:49:17,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.28 | bwd_microstep: 255.82 | bwd_inner_microstep: 255.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:49:17,645] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.28 | bwd_microstep: 254.99 | bwd_inner_microstep: 254.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:49:18,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.10 | bwd_microstep: 247.53 | bwd_inner_microstep: 247.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:18,522] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.60 | bwd_microstep: 253.03 | bwd_inner_microstep: 253.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:18,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.90 | bwd_microstep: 245.78 | bwd_inner_microstep: 245.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:19,391] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.85 | bwd_microstep: 245.68 | bwd_inner_microstep: 245.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:49:19,826] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.26 | bwd_microstep: 245.06 | bwd_inner_microstep: 245.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:49:20,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.06 | bwd_microstep: 243.85 | bwd_inner_microstep: 243.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:49:20,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.21 | bwd_microstep: 244.89 | bwd_inner_microstep: 244.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:21,142] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.81 | bwd_microstep: 245.01 | bwd_inner_microstep: 244.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:21,577] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.69 | bwd_microstep: 245.97 | bwd_inner_microstep: 245.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:49:22,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.31 | bwd_microstep: 241.89 | bwd_inner_microstep: 241.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:49:23,194] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.56 | optimizer_gradients: 0.77 | optimizer_step: 3.20 +[2024-12-31 18:49:23,194] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.01 | bwd_microstep: 1007.79 | bwd_inner_microstep: 242.25 | bwd_allreduce_microstep: 765.49 | step_microstep: 10.22 +[2024-12-31 18:49:23,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2864.65 | bwd: 4902.40 | bwd_inner: 4136.13 | bwd_allreduce: 765.73 | step: 12.40 + 91%|█████████▏| 694/759 [1:39:18<08:10, 7.55s/it] {'loss': 1.1853, 'learning_rate': 3.824310019191102e-07, 'epoch': 0.91} + 91%|█████████▏| 694/759 [1:39:18<08:10, 7.55s/it][2024-12-31 18:49:23,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 196.74 | bwd_microstep: 296.91 | bwd_inner_microstep: 296.55 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:49:24,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.14 | bwd_microstep: 291.83 | bwd_inner_microstep: 291.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:24,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.48 | bwd_microstep: 262.77 | bwd_inner_microstep: 262.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:25,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.27 | bwd_microstep: 276.75 | bwd_inner_microstep: 276.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:25,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.25 | bwd_microstep: 250.02 | bwd_inner_microstep: 249.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:26,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.15 | bwd_microstep: 247.17 | bwd_inner_microstep: 247.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:49:26,461] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.99 | bwd_microstep: 246.55 | bwd_inner_microstep: 246.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:49:26,892] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.62 | bwd_microstep: 243.44 | bwd_inner_microstep: 243.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:27,361] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.00 | bwd_microstep: 280.15 | bwd_inner_microstep: 280.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:49:27,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.87 | bwd_microstep: 253.02 | bwd_inner_microstep: 252.62 | bwd_allreduce_microstep: 0.18 | step_microstep: 0.36 +[2024-12-31 18:49:28,237] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.33 | bwd_microstep: 244.35 | bwd_inner_microstep: 244.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:28,670] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.51 | bwd_microstep: 245.34 | bwd_inner_microstep: 245.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:49:29,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.37 | bwd_microstep: 244.55 | bwd_inner_microstep: 244.33 | bwd_allreduce_microstep: 0.05 | step_microstep: 0.32 +[2024-12-31 18:49:29,551] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.64 | bwd_microstep: 243.31 | bwd_inner_microstep: 242.94 | bwd_allreduce_microstep: 0.18 | step_microstep: 0.34 +[2024-12-31 18:49:29,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.81 | bwd_microstep: 243.80 | bwd_inner_microstep: 243.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:49:30,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.08 | optimizer_gradients: 0.67 | optimizer_step: 3.18 +[2024-12-31 18:49:30,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.43 | bwd_microstep: 458.23 | bwd_inner_microstep: 245.64 | bwd_allreduce_microstep: 212.53 | step_microstep: 11.42 +[2024-12-31 18:49:30,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2789.00 | bwd: 4328.49 | bwd_inner: 4114.00 | bwd_allreduce: 213.39 | step: 14.75 + 92%|█████████▏| 695/759 [1:39:25<08:01, 7.52s/it] {'loss': 1.2447, 'learning_rate': 3.708271265220087e-07, 'epoch': 0.92} + 92%|█████████▏| 695/759 [1:39:25<08:01, 7.52s/it][2024-12-31 18:49:31,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 218.79 | bwd_microstep: 349.64 | bwd_inner_microstep: 349.28 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:49:31,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.53 | bwd_microstep: 294.32 | bwd_inner_microstep: 294.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:49:32,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.29 | bwd_microstep: 266.83 | bwd_inner_microstep: 266.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:49:32,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.02 | bwd_microstep: 263.73 | bwd_inner_microstep: 263.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:49:33,092] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.08 | bwd_microstep: 251.01 | bwd_inner_microstep: 250.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:33,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.86 | bwd_microstep: 250.37 | bwd_inner_microstep: 250.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:49:33,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.10 | bwd_microstep: 247.23 | bwd_inner_microstep: 247.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:34,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 204.39 | bwd_microstep: 245.54 | bwd_inner_microstep: 245.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:34,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.70 | bwd_microstep: 246.93 | bwd_inner_microstep: 246.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:49:35,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.37 | bwd_microstep: 246.25 | bwd_inner_microstep: 246.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:49:35,752] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.43 | bwd_microstep: 246.33 | bwd_inner_microstep: 246.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:36,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 243.45 | bwd_inner_microstep: 243.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:36,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.48 | bwd_microstep: 243.08 | bwd_inner_microstep: 243.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:49:37,043] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.08 | bwd_microstep: 243.93 | bwd_inner_microstep: 243.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:37,473] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.35 | bwd_microstep: 243.17 | bwd_inner_microstep: 243.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:49:37,936] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.72 | optimizer_step: 3.79 +[2024-12-31 18:49:37,937] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.41 | bwd_microstep: 265.22 | bwd_inner_microstep: 250.77 | bwd_allreduce_microstep: 14.33 | step_microstep: 12.26 +[2024-12-31 18:49:37,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2857.45 | bwd: 4147.12 | bwd_inner: 4131.91 | bwd_allreduce: 14.62 | step: 15.41 + 92%|█████████▏| 696/759 [1:39:32<07:49, 7.45s/it] {'loss': 1.2242, 'learning_rate': 3.5939869266901073e-07, 'epoch': 0.92} + 92%|█████████▏| 696/759 [1:39:32<07:49, 7.45s/it][2024-12-31 18:49:38,523] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 220.69 | bwd_microstep: 346.92 | bwd_inner_microstep: 346.51 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:49:39,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.36 | bwd_microstep: 297.79 | bwd_inner_microstep: 297.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:39,523] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.45 | bwd_microstep: 281.55 | bwd_inner_microstep: 281.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:49:39,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.98 | bwd_microstep: 268.70 | bwd_inner_microstep: 268.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:49:40,439] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.75 | bwd_microstep: 255.61 | bwd_inner_microstep: 255.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:49:40,880] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.66 | bwd_microstep: 249.49 | bwd_inner_microstep: 249.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:49:41,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.01 | bwd_microstep: 247.36 | bwd_inner_microstep: 247.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:41,752] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.15 | bwd_microstep: 246.41 | bwd_inner_microstep: 246.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:42,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.90 | bwd_microstep: 244.19 | bwd_inner_microstep: 244.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:42,617] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.42 | bwd_microstep: 244.74 | bwd_inner_microstep: 244.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:43,048] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.53 | bwd_microstep: 243.98 | bwd_inner_microstep: 243.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:43,494] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.24 | bwd_microstep: 247.10 | bwd_inner_microstep: 247.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:49:43,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.28 | bwd_microstep: 245.26 | bwd_inner_microstep: 245.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:44,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 244.61 | bwd_inner_microstep: 244.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:49:44,789] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.90 | bwd_microstep: 243.12 | bwd_inner_microstep: 243.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:49:45,371] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.07 | optimizer_gradients: 0.84 | optimizer_step: 3.15 +[2024-12-31 18:49:45,371] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.08 | bwd_microstep: 384.26 | bwd_inner_microstep: 275.13 | bwd_allreduce_microstep: 109.08 | step_microstep: 10.85 +[2024-12-31 18:49:45,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2851.79 | bwd: 4291.20 | bwd_inner: 4181.22 | bwd_allreduce: 109.33 | step: 13.92 + 92%|█████████▏| 697/759 [1:39:40<07:41, 7.45s/it] {'loss': 1.2555, 'learning_rate': 3.481459085838268e-07, 'epoch': 0.92} + 92%|█████████▏| 697/759 [1:39:40<07:41, 7.45s/it][2024-12-31 18:49:45,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 227.59 | bwd_microstep: 371.31 | bwd_inner_microstep: 370.95 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 18:49:46,500] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 198.81 | bwd_microstep: 297.65 | bwd_inner_microstep: 297.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:46,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.66 | bwd_microstep: 269.02 | bwd_inner_microstep: 268.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:49:47,427] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.06 | bwd_microstep: 263.18 | bwd_inner_microstep: 263.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:49:47,867] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.17 | bwd_microstep: 250.97 | bwd_inner_microstep: 250.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:48,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.02 | bwd_microstep: 248.62 | bwd_inner_microstep: 248.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:49:48,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.07 | bwd_microstep: 247.24 | bwd_inner_microstep: 247.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:49,175] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.28 | bwd_microstep: 245.29 | bwd_inner_microstep: 245.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:49:49,617] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.79 | bwd_microstep: 256.56 | bwd_inner_microstep: 256.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:49:50,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.13 | bwd_microstep: 243.70 | bwd_inner_microstep: 243.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:50,481] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.22 | bwd_microstep: 253.23 | bwd_inner_microstep: 253.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:50,911] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.86 | bwd_microstep: 243.16 | bwd_inner_microstep: 243.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:51,341] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.72 | bwd_microstep: 243.93 | bwd_inner_microstep: 243.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:49:51,789] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.67 | bwd_microstep: 260.31 | bwd_inner_microstep: 260.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:49:52,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.44 | bwd_microstep: 243.32 | bwd_inner_microstep: 243.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:49:52,670] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.80 | optimizer_gradients: 0.71 | optimizer_step: 3.39 +[2024-12-31 18:49:52,671] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.70 | bwd_microstep: 258.53 | bwd_inner_microstep: 244.84 | bwd_allreduce_microstep: 13.58 | step_microstep: 11.15 +[2024-12-31 18:49:52,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2829.65 | bwd: 4196.15 | bwd_inner: 4181.63 | bwd_allreduce: 13.88 | step: 14.00 + 92%|█████████▏| 698/759 [1:39:47<07:31, 7.40s/it] {'loss': 1.2489, 'learning_rate': 3.370689792898618e-07, 'epoch': 0.92} + 92%|█████████▏| 698/759 [1:39:47<07:31, 7.40s/it][2024-12-31 18:49:53,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 212.02 | bwd_microstep: 335.70 | bwd_inner_microstep: 335.33 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.19 +[2024-12-31 18:49:53,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 186.98 | bwd_microstep: 280.95 | bwd_inner_microstep: 280.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 18:49:54,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.53 | bwd_microstep: 266.87 | bwd_inner_microstep: 266.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:49:54,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.30 | bwd_microstep: 268.02 | bwd_inner_microstep: 267.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:49:55,090] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.11 | bwd_microstep: 247.22 | bwd_inner_microstep: 247.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:49:55,527] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.69 | bwd_microstep: 248.60 | bwd_inner_microstep: 248.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:49:55,971] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.41 | bwd_microstep: 248.00 | bwd_inner_microstep: 247.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:49:56,403] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.29 | bwd_microstep: 245.09 | bwd_inner_microstep: 245.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:49:56,838] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.58 | bwd_microstep: 247.21 | bwd_inner_microstep: 247.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:49:57,271] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.29 | bwd_microstep: 244.81 | bwd_inner_microstep: 244.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:49:57,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.03 | bwd_microstep: 243.27 | bwd_inner_microstep: 243.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:49:58,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.14 | bwd_microstep: 243.45 | bwd_inner_microstep: 243.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:58,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.14 | bwd_microstep: 244.73 | bwd_inner_microstep: 244.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:49:59,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.75 | bwd_microstep: 243.39 | bwd_inner_microstep: 243.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:49:59,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.81 | bwd_microstep: 244.15 | bwd_inner_microstep: 244.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:00,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.03 | optimizer_gradients: 0.63 | optimizer_step: 3.15 +[2024-12-31 18:50:00,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.96 | bwd_microstep: 1106.81 | bwd_inner_microstep: 242.68 | bwd_allreduce_microstep: 864.08 | step_microstep: 10.89 +[2024-12-31 18:50:00,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2835.41 | bwd: 4958.36 | bwd_inner: 4093.44 | bwd_allreduce: 864.33 | step: 13.91 + 92%|█████████▏| 699/759 [1:39:55<07:36, 7.61s/it] {'loss': 1.242, 'learning_rate': 3.261681066064859e-07, 'epoch': 0.92} + 92%|█████████▏| 699/759 [1:39:55<07:36, 7.61s/it][2024-12-31 18:50:01,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 231.68 | bwd_microstep: 371.00 | bwd_inner_microstep: 370.65 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:50:01,880] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.61 | bwd_microstep: 291.69 | bwd_inner_microstep: 291.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:02,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.83 | bwd_microstep: 284.36 | bwd_inner_microstep: 284.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:02,828] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.17 | bwd_microstep: 262.69 | bwd_inner_microstep: 262.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:03,279] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.85 | bwd_microstep: 257.34 | bwd_inner_microstep: 257.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:50:03,719] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.40 | bwd_microstep: 247.60 | bwd_inner_microstep: 247.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:50:04,144] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 246.23 | bwd_inner_microstep: 246.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:50:04,580] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.41 | bwd_microstep: 248.48 | bwd_inner_microstep: 248.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:50:05,022] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.22 | bwd_microstep: 252.91 | bwd_inner_microstep: 252.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:05,455] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.52 | bwd_microstep: 245.52 | bwd_inner_microstep: 245.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:50:05,896] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.68 | bwd_microstep: 251.65 | bwd_inner_microstep: 251.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:50:06,329] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.36 | bwd_microstep: 245.09 | bwd_inner_microstep: 245.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:50:06,760] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 243.21 | bwd_inner_microstep: 243.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:07,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.18 | bwd_microstep: 242.92 | bwd_inner_microstep: 242.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:50:07,622] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.70 | bwd_microstep: 243.98 | bwd_inner_microstep: 243.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:50:08,089] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.77 | optimizer_gradients: 0.95 | optimizer_step: 17.12 +[2024-12-31 18:50:08,090] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.39 | bwd_microstep: 258.19 | bwd_inner_microstep: 244.47 | bwd_allreduce_microstep: 13.60 | step_microstep: 25.28 +[2024-12-31 18:50:08,093] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2848.03 | bwd: 4192.94 | bwd_inner: 4178.46 | bwd_allreduce: 13.88 | step: 28.21 + 92%|█████████▏| 700/759 [1:40:03<07:24, 7.53s/it] {'loss': 1.2281, 'learning_rate': 3.154434891453473e-07, 'epoch': 0.92} + 92%|█████████▏| 700/759 [1:40:03<07:24, 7.53s/it][2024-12-31 18:50:08,635] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 204.75 | bwd_microstep: 317.10 | bwd_inner_microstep: 316.69 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.27 +[2024-12-31 18:50:09,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 241.88 | bwd_microstep: 401.80 | bwd_inner_microstep: 401.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:50:09,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.26 | bwd_microstep: 291.40 | bwd_inner_microstep: 291.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:50:10,252] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.60 | bwd_microstep: 262.59 | bwd_inner_microstep: 262.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:50:10,688] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.07 | bwd_microstep: 249.65 | bwd_inner_microstep: 249.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:50:11,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.09 | bwd_microstep: 247.92 | bwd_inner_microstep: 247.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:50:11,567] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.53 | bwd_microstep: 251.87 | bwd_inner_microstep: 251.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:12,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.13 | bwd_microstep: 244.67 | bwd_inner_microstep: 244.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:50:12,432] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.44 | bwd_microstep: 244.81 | bwd_inner_microstep: 244.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:50:12,878] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.04 | bwd_microstep: 259.01 | bwd_inner_microstep: 258.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:50:13,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.63 | bwd_microstep: 248.51 | bwd_inner_microstep: 248.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:13,746] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.32 | bwd_microstep: 244.00 | bwd_inner_microstep: 243.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:14,176] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.68 | bwd_microstep: 243.59 | bwd_inner_microstep: 243.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:50:14,607] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.88 | bwd_microstep: 241.63 | bwd_inner_microstep: 241.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:15,096] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.03 | bwd_microstep: 303.85 | bwd_inner_microstep: 303.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:50:15,551] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 7.73 | optimizer_gradients: 0.67 | optimizer_step: 6.29 +[2024-12-31 18:50:15,552] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.52 | bwd_microstep: 254.90 | bwd_inner_microstep: 241.15 | bwd_allreduce_microstep: 13.64 | step_microstep: 16.90 +[2024-12-31 18:50:15,553] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2865.26 | bwd: 4307.44 | bwd_inner: 4292.83 | bwd_allreduce: 13.96 | step: 19.70 + 92%|█████████▏| 701/759 [1:40:10<07:15, 7.51s/it] {'loss': 1.1931, 'learning_rate': 3.0489532230676744e-07, 'epoch': 0.92} + 92%|█████████▏| 701/759 [1:40:10<07:15, 7.51s/it][2024-12-31 18:50:16,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 224.93 | bwd_microstep: 361.24 | bwd_inner_microstep: 360.89 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:50:16,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.58 | bwd_microstep: 282.33 | bwd_inner_microstep: 282.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:50:17,093] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.86 | bwd_microstep: 262.65 | bwd_inner_microstep: 262.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:50:17,548] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.37 | bwd_microstep: 258.42 | bwd_inner_microstep: 258.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:50:17,998] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.27 | bwd_microstep: 254.41 | bwd_inner_microstep: 254.23 | bwd_allreduce_microstep: 0.08 | step_microstep: 0.23 +[2024-12-31 18:50:18,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.56 | bwd_microstep: 247.81 | bwd_inner_microstep: 247.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:18,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.65 | bwd_microstep: 249.64 | bwd_inner_microstep: 249.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:50:19,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.63 | bwd_microstep: 245.98 | bwd_inner_microstep: 245.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:19,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 244.52 | bwd_inner_microstep: 244.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:20,187] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.04 | bwd_microstep: 244.96 | bwd_inner_microstep: 244.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:50:20,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.67 | bwd_microstep: 244.37 | bwd_inner_microstep: 244.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:21,090] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.56 | bwd_microstep: 282.45 | bwd_inner_microstep: 282.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:50:21,523] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.41 | bwd_microstep: 244.56 | bwd_inner_microstep: 244.10 | bwd_allreduce_microstep: 0.25 | step_microstep: 0.35 +[2024-12-31 18:50:21,959] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 243.60 | bwd_inner_microstep: 243.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:50:22,385] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 243.08 | bwd_inner_microstep: 243.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:22,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.53 | optimizer_gradients: 0.83 | optimizer_step: 3.10 +[2024-12-31 18:50:22,901] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.98 | bwd_microstep: 316.83 | bwd_inner_microstep: 246.61 | bwd_allreduce_microstep: 70.18 | step_microstep: 11.95 +[2024-12-31 18:50:22,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2823.11 | bwd: 4227.07 | bwd_inner: 4155.39 | bwd_allreduce: 70.80 | step: 14.93 + 92%|█████████▏| 702/759 [1:40:17<07:05, 7.46s/it] {'loss': 1.189, 'learning_rate': 2.945237982761706e-07, 'epoch': 0.92} + 92%|█████████▏| 702/759 [1:40:17<07:05, 7.46s/it][2024-12-31 18:50:23,484] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 219.79 | bwd_microstep: 346.86 | bwd_inner_microstep: 346.50 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:50:24,019] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 199.05 | bwd_microstep: 318.28 | bwd_inner_microstep: 318.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:50:24,486] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.07 | bwd_microstep: 267.73 | bwd_inner_microstep: 267.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:24,955] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.96 | bwd_microstep: 266.52 | bwd_inner_microstep: 266.11 | bwd_allreduce_microstep: 0.25 | step_microstep: 0.28 +[2024-12-31 18:50:25,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 199.23 | bwd_microstep: 260.61 | bwd_inner_microstep: 260.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:50:25,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.83 | bwd_microstep: 248.58 | bwd_inner_microstep: 248.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:26,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.30 | bwd_microstep: 247.46 | bwd_inner_microstep: 247.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:26,747] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.49 | bwd_microstep: 247.76 | bwd_inner_microstep: 247.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:27,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 245.32 | bwd_inner_microstep: 245.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:27,622] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.23 | bwd_microstep: 245.28 | bwd_inner_microstep: 245.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:50:28,054] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.54 | bwd_microstep: 244.49 | bwd_inner_microstep: 244.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:50:28,486] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 243.27 | bwd_inner_microstep: 243.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:50:28,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.00 | bwd_microstep: 254.78 | bwd_inner_microstep: 254.57 | bwd_allreduce_microstep: 0.11 | step_microstep: 0.23 +[2024-12-31 18:50:29,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 243.66 | bwd_inner_microstep: 243.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:50:29,790] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.63 | bwd_microstep: 244.02 | bwd_inner_microstep: 243.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:50:30,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.78 | optimizer_step: 3.69 +[2024-12-31 18:50:30,424] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.42 | bwd_microstep: 434.53 | bwd_inner_microstep: 243.95 | bwd_allreduce_microstep: 190.46 | step_microstep: 12.24 +[2024-12-31 18:50:30,428] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2870.20 | bwd: 4359.32 | bwd_inner: 4167.34 | bwd_allreduce: 191.14 | step: 15.31 + 93%|█████████▎| 703/759 [1:40:25<06:58, 7.48s/it] {'loss': 1.2069, 'learning_rate': 2.843291060205855e-07, 'epoch': 0.93} + 93%|█████████▎| 703/759 [1:40:25<06:58, 7.48s/it][2024-12-31 18:50:30,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.01 | bwd_microstep: 290.38 | bwd_inner_microstep: 290.04 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:50:31,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.02 | bwd_microstep: 294.64 | bwd_inner_microstep: 294.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:50:31,904] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.09 | bwd_microstep: 268.26 | bwd_inner_microstep: 268.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:32,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.31 | bwd_microstep: 262.69 | bwd_inner_microstep: 262.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:50:32,813] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.06 | bwd_microstep: 256.11 | bwd_inner_microstep: 256.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:33,263] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.22 | bwd_microstep: 256.03 | bwd_inner_microstep: 255.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:50:33,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.40 | bwd_microstep: 248.14 | bwd_inner_microstep: 248.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:34,135] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.80 | bwd_microstep: 245.73 | bwd_inner_microstep: 245.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:50:34,569] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.13 | bwd_microstep: 245.16 | bwd_inner_microstep: 245.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:50:35,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 244.02 | bwd_inner_microstep: 243.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:50:35,441] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.63 | bwd_microstep: 253.76 | bwd_inner_microstep: 253.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:50:35,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.06 | bwd_microstep: 244.89 | bwd_inner_microstep: 244.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:36,318] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 260.29 | bwd_inner_microstep: 260.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:50:36,749] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.56 | bwd_microstep: 243.51 | bwd_inner_microstep: 243.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:50:37,178] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.31 | bwd_microstep: 243.14 | bwd_inner_microstep: 243.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:50:37,740] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.30 | optimizer_gradients: 0.56 | optimizer_step: 3.12 +[2024-12-31 18:50:37,740] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.65 | bwd_microstep: 367.07 | bwd_inner_microstep: 244.49 | bwd_allreduce_microstep: 122.53 | step_microstep: 11.20 +[2024-12-31 18:50:37,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2796.47 | bwd: 4223.89 | bwd_inner: 4100.55 | bwd_allreduce: 122.77 | step: 14.18 + 93%|█████████▎| 704/759 [1:40:32<06:48, 7.43s/it] {'loss': 1.1773, 'learning_rate': 2.7431143128520243e-07, 'epoch': 0.93} + 93%|█████████▎| 704/759 [1:40:32<06:48, 7.43s/it][2024-12-31 18:50:38,352] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 225.62 | bwd_microstep: 370.37 | bwd_inner_microstep: 370.02 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:50:38,841] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.49 | bwd_microstep: 282.22 | bwd_inner_microstep: 282.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:39,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.59 | bwd_microstep: 265.22 | bwd_inner_microstep: 265.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:50:39,770] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 186.74 | bwd_microstep: 260.14 | bwd_inner_microstep: 260.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:40,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.51 | bwd_microstep: 269.74 | bwd_inner_microstep: 266.72 | bwd_allreduce_microstep: 2.89 | step_microstep: 0.20 +[2024-12-31 18:50:40,666] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.86 | bwd_microstep: 246.66 | bwd_inner_microstep: 246.16 | bwd_allreduce_microstep: 0.21 | step_microstep: 0.22 +[2024-12-31 18:50:41,145] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.49 | bwd_microstep: 287.99 | bwd_inner_microstep: 287.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:41,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.24 | bwd_microstep: 245.05 | bwd_inner_microstep: 245.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:50:42,011] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.90 | bwd_microstep: 244.46 | bwd_inner_microstep: 244.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:50:42,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.80 | bwd_microstep: 247.41 | bwd_inner_microstep: 247.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:42,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.10 | bwd_microstep: 243.49 | bwd_inner_microstep: 243.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:50:43,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 245.18 | bwd_inner_microstep: 245.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:43,738] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.44 | bwd_microstep: 246.58 | bwd_inner_microstep: 246.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:44,169] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.05 | bwd_microstep: 244.34 | bwd_inner_microstep: 244.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:50:44,597] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.55 | bwd_microstep: 241.16 | bwd_inner_microstep: 241.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:50:45,053] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.16 | optimizer_gradients: 0.61 | optimizer_step: 3.25 +[2024-12-31 18:50:45,054] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.40 | bwd_microstep: 257.44 | bwd_inner_microstep: 243.73 | bwd_allreduce_microstep: 13.59 | step_microstep: 11.91 +[2024-12-31 18:50:45,056] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2833.41 | bwd: 4197.65 | bwd_inner: 4179.67 | bwd_allreduce: 17.00 | step: 14.65 + 93%|█████████▎| 705/759 [1:40:40<06:39, 7.40s/it] {'loss': 1.2184, 'learning_rate': 2.6447095658999054e-07, 'epoch': 0.93} + 93%|█████████▎| 705/759 [1:40:40<06:39, 7.40s/it][2024-12-31 18:50:45,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.62 | bwd_microstep: 319.51 | bwd_inner_microstep: 319.16 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:50:46,098] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.25 | bwd_microstep: 290.96 | bwd_inner_microstep: 290.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:50:46,586] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.61 | bwd_microstep: 282.04 | bwd_inner_microstep: 282.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:50:47,044] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.70 | bwd_microstep: 262.92 | bwd_inner_microstep: 262.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:50:47,500] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.13 | bwd_microstep: 260.38 | bwd_inner_microstep: 260.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:50:47,933] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.76 | bwd_microstep: 248.70 | bwd_inner_microstep: 248.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:48,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.52 | bwd_microstep: 247.04 | bwd_inner_microstep: 247.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:50:48,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.15 | bwd_microstep: 246.96 | bwd_inner_microstep: 246.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 18:50:49,236] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.29 | bwd_microstep: 243.94 | bwd_inner_microstep: 243.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:49,680] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.88 | bwd_microstep: 257.51 | bwd_inner_microstep: 257.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:50,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 243.73 | bwd_inner_microstep: 243.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:50:50,540] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.05 | bwd_microstep: 242.96 | bwd_inner_microstep: 242.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:50,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.06 | bwd_microstep: 249.24 | bwd_inner_microstep: 249.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:51,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.49 | bwd_microstep: 244.19 | bwd_inner_microstep: 244.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:50:51,844] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.17 | bwd_microstep: 249.24 | bwd_inner_microstep: 249.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:50:52,322] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.56 | optimizer_gradients: 0.66 | optimizer_step: 3.29 +[2024-12-31 18:50:52,323] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.02 | bwd_microstep: 282.43 | bwd_inner_microstep: 268.73 | bwd_allreduce_microstep: 13.58 | step_microstep: 11.12 +[2024-12-31 18:50:52,325] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2810.15 | bwd: 4171.83 | bwd_inner: 4157.38 | bwd_allreduce: 13.86 | step: 14.10 + 93%|█████████▎| 706/759 [1:40:47<06:29, 7.36s/it] {'loss': 1.2064, 'learning_rate': 2.5480786122636713e-07, 'epoch': 0.93} + 93%|█████████▎| 706/759 [1:40:47<06:29, 7.36s/it][2024-12-31 18:50:52,905] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 220.14 | bwd_microstep: 344.95 | bwd_inner_microstep: 344.79 | bwd_allreduce_microstep: 0.06 | step_microstep: 0.10 +[2024-12-31 18:50:53,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.95 | bwd_microstep: 268.38 | bwd_inner_microstep: 268.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:50:53,857] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.75 | bwd_microstep: 299.85 | bwd_inner_microstep: 299.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:50:54,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.48 | bwd_microstep: 261.30 | bwd_inner_microstep: 261.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:50:54,750] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.06 | bwd_microstep: 256.93 | bwd_inner_microstep: 256.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:50:55,178] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.24 | bwd_microstep: 247.03 | bwd_inner_microstep: 247.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:50:55,605] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.68 | bwd_microstep: 245.90 | bwd_inner_microstep: 245.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:50:56,056] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.28 | bwd_microstep: 272.70 | bwd_inner_microstep: 272.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:56,488] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.36 | bwd_microstep: 243.61 | bwd_inner_microstep: 243.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:56,921] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.77 | bwd_microstep: 244.43 | bwd_inner_microstep: 244.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:57,364] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.85 | bwd_microstep: 254.24 | bwd_inner_microstep: 254.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:50:57,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 242.82 | bwd_inner_microstep: 242.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:58,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.12 | bwd_microstep: 243.09 | bwd_inner_microstep: 243.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:58,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.83 | bwd_microstep: 243.29 | bwd_inner_microstep: 243.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:50:59,086] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.97 | bwd_microstep: 243.29 | bwd_inner_microstep: 243.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:50:59,623] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.96 | optimizer_gradients: 0.58 | optimizer_step: 3.09 +[2024-12-31 18:50:59,623] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.82 | bwd_microstep: 337.34 | bwd_inner_microstep: 244.98 | bwd_allreduce_microstep: 92.32 | step_microstep: 11.81 +[2024-12-31 18:50:59,624] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2814.03 | bwd: 4249.18 | bwd_inner: 4156.27 | bwd_allreduce: 92.49 | step: 13.96 + 93%|█████████▎| 707/759 [1:40:54<06:21, 7.34s/it] {'loss': 1.2445, 'learning_rate': 2.453223212539391e-07, 'epoch': 0.93} + 93%|█████████▎| 707/759 [1:40:54<06:21, 7.34s/it][2024-12-31 18:51:00,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 208.96 | bwd_microstep: 314.02 | bwd_inner_microstep: 313.66 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:51:00,703] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.24 | bwd_microstep: 290.33 | bwd_inner_microstep: 290.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:51:01,169] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.94 | bwd_microstep: 267.28 | bwd_inner_microstep: 267.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:51:01,625] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.90 | bwd_microstep: 261.77 | bwd_inner_microstep: 261.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:51:02,073] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.33 | bwd_microstep: 257.83 | bwd_inner_microstep: 257.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.10 +[2024-12-31 18:51:02,510] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.66 | bwd_microstep: 248.68 | bwd_inner_microstep: 248.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:02,948] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.22 | bwd_microstep: 248.39 | bwd_inner_microstep: 248.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:51:03,385] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.73 | bwd_microstep: 247.02 | bwd_inner_microstep: 246.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:51:03,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.33 | bwd_microstep: 246.26 | bwd_inner_microstep: 246.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:51:04,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.32 | bwd_microstep: 244.71 | bwd_inner_microstep: 244.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:04,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.72 | bwd_microstep: 244.63 | bwd_inner_microstep: 244.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:05,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.88 | bwd_microstep: 252.18 | bwd_inner_microstep: 252.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.69 +[2024-12-31 18:51:05,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.13 | bwd_microstep: 244.40 | bwd_inner_microstep: 244.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:05,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.40 | bwd_microstep: 243.69 | bwd_inner_microstep: 243.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:51:06,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.74 | bwd_microstep: 240.79 | bwd_inner_microstep: 240.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:51:07,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.94 | optimizer_gradients: 0.57 | optimizer_step: 3.12 +[2024-12-31 18:51:07,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.79 | bwd_microstep: 624.38 | bwd_inner_microstep: 256.09 | bwd_allreduce_microstep: 368.24 | step_microstep: 11.25 +[2024-12-31 18:51:07,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2815.75 | bwd: 4476.43 | bwd_inner: 4107.40 | bwd_allreduce: 368.51 | step: 14.53 + 93%|█████████▎| 708/759 [1:41:02<06:18, 7.42s/it] {'loss': 1.1938, 'learning_rate': 2.3601450949728876e-07, 'epoch': 0.93} + 93%|█████████▎| 708/759 [1:41:02<06:18, 7.42s/it][2024-12-31 18:51:07,854] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 234.70 | bwd_microstep: 368.72 | bwd_inner_microstep: 368.35 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.19 +[2024-12-31 18:51:08,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 198.10 | bwd_microstep: 299.17 | bwd_inner_microstep: 299.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:51:08,861] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.72 | bwd_microstep: 285.62 | bwd_inner_microstep: 285.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:51:09,326] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.31 | bwd_microstep: 265.28 | bwd_inner_microstep: 265.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:51:09,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.68 | bwd_microstep: 255.53 | bwd_inner_microstep: 255.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:10,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.99 | bwd_microstep: 256.76 | bwd_inner_microstep: 256.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:51:10,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.60 | bwd_microstep: 257.43 | bwd_inner_microstep: 257.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.09 +[2024-12-31 18:51:11,110] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.99 | bwd_microstep: 246.78 | bwd_inner_microstep: 246.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:51:11,548] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.48 | bwd_microstep: 249.02 | bwd_inner_microstep: 248.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:11,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.65 | bwd_microstep: 257.38 | bwd_inner_microstep: 257.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:12,424] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 244.37 | bwd_inner_microstep: 244.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:51:12,856] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.54 | bwd_microstep: 243.41 | bwd_inner_microstep: 243.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:13,287] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.36 | bwd_microstep: 243.88 | bwd_inner_microstep: 243.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:13,715] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.88 | bwd_microstep: 240.95 | bwd_inner_microstep: 240.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:14,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.81 | bwd_microstep: 241.52 | bwd_inner_microstep: 241.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:51:14,610] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.70 | optimizer_gradients: 0.78 | optimizer_step: 3.29 +[2024-12-31 18:51:14,611] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 185.93 | bwd_microstep: 256.79 | bwd_inner_microstep: 243.16 | bwd_allreduce_microstep: 13.53 | step_microstep: 11.34 +[2024-12-31 18:51:14,613] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2878.29 | bwd: 4212.74 | bwd_inner: 4198.30 | bwd_allreduce: 13.82 | step: 14.32 + 93%|█████████▎| 709/759 [1:41:09<06:10, 7.41s/it] {'loss': 1.1684, 'learning_rate': 2.2688459554282673e-07, 'epoch': 0.93} + 93%|█████████▎| 709/759 [1:41:09<06:10, 7.41s/it][2024-12-31 18:51:15,176] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.28 | bwd_microstep: 335.09 | bwd_inner_microstep: 334.74 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:51:15,677] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.64 | bwd_microstep: 291.41 | bwd_inner_microstep: 291.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:51:16,142] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.36 | bwd_microstep: 265.70 | bwd_inner_microstep: 265.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:16,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.63 | bwd_microstep: 254.09 | bwd_inner_microstep: 254.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:51:17,035] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.45 | bwd_microstep: 254.72 | bwd_inner_microstep: 254.37 | bwd_allreduce_microstep: 0.24 | step_microstep: 0.27 +[2024-12-31 18:51:17,475] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.90 | bwd_microstep: 245.85 | bwd_inner_microstep: 245.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:51:17,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.79 | bwd_microstep: 254.51 | bwd_inner_microstep: 254.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:51:18,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.76 | bwd_microstep: 244.55 | bwd_inner_microstep: 244.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:18,768] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.56 | bwd_microstep: 245.84 | bwd_inner_microstep: 245.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:51:19,199] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.01 | bwd_microstep: 243.53 | bwd_inner_microstep: 243.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:19,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.39 | bwd_microstep: 253.10 | bwd_inner_microstep: 253.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:51:20,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.99 | bwd_microstep: 243.34 | bwd_inner_microstep: 243.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:51:20,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.40 | bwd_microstep: 243.22 | bwd_inner_microstep: 243.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:51:20,933] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.66 | bwd_microstep: 242.51 | bwd_inner_microstep: 242.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:51:21,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.68 | bwd_microstep: 242.65 | bwd_inner_microstep: 242.28 | bwd_allreduce_microstep: 0.18 | step_microstep: 0.35 +[2024-12-31 18:51:21,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.56 | optimizer_gradients: 1.09 | optimizer_step: 3.56 +[2024-12-31 18:51:21,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.67 | bwd_microstep: 257.59 | bwd_inner_microstep: 243.19 | bwd_allreduce_microstep: 14.29 | step_microstep: 14.64 +[2024-12-31 18:51:21,821] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2816.60 | bwd: 4117.90 | bwd_inner: 4101.96 | bwd_allreduce: 15.08 | step: 17.50 + 94%|█████████▎| 710/759 [1:41:16<06:00, 7.35s/it] {'loss': 1.2272, 'learning_rate': 2.1793274573570166e-07, 'epoch': 0.94} + 94%|█████████▎| 710/759 [1:41:16<06:00, 7.35s/it][2024-12-31 18:51:22,405] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 217.05 | bwd_microstep: 349.45 | bwd_inner_microstep: 349.04 | bwd_allreduce_microstep: 0.18 | step_microstep: 0.20 +[2024-12-31 18:51:22,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.32 | bwd_microstep: 296.25 | bwd_inner_microstep: 296.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:51:23,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.97 | bwd_microstep: 267.91 | bwd_inner_microstep: 267.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:51:23,836] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.22 | bwd_microstep: 261.36 | bwd_inner_microstep: 261.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:51:24,277] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.42 | bwd_microstep: 249.86 | bwd_inner_microstep: 249.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:24,715] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.72 | bwd_microstep: 250.21 | bwd_inner_microstep: 250.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:51:25,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.31 | bwd_microstep: 247.54 | bwd_inner_microstep: 247.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:25,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.38 | bwd_microstep: 255.94 | bwd_inner_microstep: 255.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:51:26,035] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.36 | bwd_microstep: 248.86 | bwd_inner_microstep: 248.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:51:26,499] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.18 | bwd_microstep: 245.84 | bwd_inner_microstep: 245.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:51:26,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.90 | bwd_microstep: 245.15 | bwd_inner_microstep: 245.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:51:27,366] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.11 | bwd_microstep: 244.36 | bwd_inner_microstep: 244.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:51:27,830] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.74 | bwd_microstep: 243.09 | bwd_inner_microstep: 243.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:28,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.59 | bwd_microstep: 244.31 | bwd_inner_microstep: 243.49 | bwd_allreduce_microstep: 0.40 | step_microstep: 0.72 +[2024-12-31 18:51:28,691] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.75 | bwd_microstep: 242.13 | bwd_inner_microstep: 242.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:51:29,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.74 | optimizer_gradients: 0.58 | optimizer_step: 13.77 +[2024-12-31 18:51:29,650] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.09 | bwd_microstep: 762.94 | bwd_inner_microstep: 242.46 | bwd_allreduce_microstep: 520.43 | step_microstep: 21.94 +[2024-12-31 18:51:29,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2846.60 | bwd: 4655.38 | bwd_inner: 4133.17 | bwd_allreduce: 521.15 | step: 25.36 + 94%|█████████▎| 711/759 [1:41:24<05:59, 7.49s/it] {'loss': 1.2135, 'learning_rate': 2.091591231767709e-07, 'epoch': 0.94} + 94%|█████████▎| 711/759 [1:41:24<05:59, 7.49s/it][2024-12-31 18:51:30,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 228.41 | bwd_microstep: 363.01 | bwd_inner_microstep: 362.67 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:51:30,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.94 | bwd_microstep: 282.53 | bwd_inner_microstep: 282.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:31,250] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.85 | bwd_microstep: 290.79 | bwd_inner_microstep: 290.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:51:31,699] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.45 | bwd_microstep: 261.53 | bwd_inner_microstep: 261.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:51:32,130] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.15 | bwd_microstep: 249.48 | bwd_inner_microstep: 249.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:51:32,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.67 | bwd_microstep: 248.44 | bwd_inner_microstep: 248.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:33,007] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.84 | bwd_microstep: 249.02 | bwd_inner_microstep: 248.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:51:33,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.50 | bwd_microstep: 245.74 | bwd_inner_microstep: 245.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:51:33,879] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.29 | bwd_microstep: 243.98 | bwd_inner_microstep: 243.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:34,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.68 | bwd_microstep: 246.34 | bwd_inner_microstep: 246.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:34,747] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 247.28 | bwd_inner_microstep: 247.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:51:35,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.86 | bwd_microstep: 244.32 | bwd_inner_microstep: 244.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:35,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.04 | bwd_microstep: 243.07 | bwd_inner_microstep: 243.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:36,054] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.43 | bwd_microstep: 253.07 | bwd_inner_microstep: 253.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:51:36,480] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.06 | bwd_microstep: 241.15 | bwd_inner_microstep: 241.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:36,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.70 | optimizer_gradients: 0.85 | optimizer_step: 3.35 +[2024-12-31 18:51:36,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.94 | bwd_microstep: 255.84 | bwd_inner_microstep: 242.18 | bwd_allreduce_microstep: 13.55 | step_microstep: 11.36 +[2024-12-31 18:51:36,935] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2838.91 | bwd: 4165.76 | bwd_inner: 4151.23 | bwd_allreduce: 13.82 | step: 14.05 + 94%|█████████▍| 712/759 [1:41:31<05:49, 7.43s/it] {'loss': 1.2497, 'learning_rate': 2.005638877196303e-07, 'epoch': 0.94} + 94%|█████████▍| 712/759 [1:41:31<05:49, 7.43s/it][2024-12-31 18:51:37,550] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 230.60 | bwd_microstep: 369.94 | bwd_inner_microstep: 369.59 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.29 +[2024-12-31 18:51:38,064] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.47 | bwd_microstep: 297.63 | bwd_inner_microstep: 297.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:51:38,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.79 | bwd_microstep: 317.06 | bwd_inner_microstep: 317.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:51:39,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.24 | bwd_microstep: 280.09 | bwd_inner_microstep: 280.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:51:39,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.68 | bwd_microstep: 261.62 | bwd_inner_microstep: 261.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:51:39,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.35 | bwd_microstep: 255.54 | bwd_inner_microstep: 255.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.09 +[2024-12-31 18:51:40,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.17 | bwd_microstep: 257.04 | bwd_inner_microstep: 257.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:40,867] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.32 | bwd_microstep: 248.13 | bwd_inner_microstep: 248.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:51:41,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.33 | bwd_microstep: 245.64 | bwd_inner_microstep: 245.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:41,770] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.87 | bwd_microstep: 274.81 | bwd_inner_microstep: 274.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.29 +[2024-12-31 18:51:42,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.54 | bwd_microstep: 306.61 | bwd_inner_microstep: 306.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:51:42,699] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 244.82 | bwd_inner_microstep: 244.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:43,129] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.73 | bwd_microstep: 243.28 | bwd_inner_microstep: 243.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:51:43,563] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.58 | bwd_microstep: 246.60 | bwd_inner_microstep: 245.76 | bwd_allreduce_microstep: 0.38 | step_microstep: 0.24 +[2024-12-31 18:51:43,991] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.39 | bwd_microstep: 245.80 | bwd_inner_microstep: 245.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:51:44,440] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.94 | optimizer_gradients: 0.59 | optimizer_step: 3.23 +[2024-12-31 18:51:44,441] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.13 | bwd_microstep: 257.39 | bwd_inner_microstep: 243.81 | bwd_allreduce_microstep: 13.52 | step_microstep: 11.33 +[2024-12-31 18:51:44,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2872.87 | bwd: 4352.26 | bwd_inner: 4336.91 | bwd_allreduce: 14.22 | step: 14.48 + 94%|█████████▍| 713/759 [1:41:39<05:42, 7.45s/it] {'loss': 1.1679, 'learning_rate': 1.921471959676957e-07, 'epoch': 0.94} + 94%|█████████▍| 713/759 [1:41:39<05:42, 7.45s/it][2024-12-31 18:51:45,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 288.26 | bwd_microstep: 502.74 | bwd_inner_microstep: 502.38 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.22 +[2024-12-31 18:51:45,844] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 222.68 | bwd_microstep: 356.92 | bwd_inner_microstep: 356.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:51:46,350] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.04 | bwd_microstep: 299.26 | bwd_inner_microstep: 299.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:46,811] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.33 | bwd_microstep: 262.92 | bwd_inner_microstep: 262.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:51:47,252] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.47 | bwd_microstep: 249.56 | bwd_inner_microstep: 249.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:51:47,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.88 | bwd_microstep: 249.81 | bwd_inner_microstep: 249.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:48,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.41 | bwd_microstep: 250.06 | bwd_inner_microstep: 250.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:51:48,547] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.83 | bwd_microstep: 245.45 | bwd_inner_microstep: 245.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:48,996] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 185.40 | bwd_microstep: 246.44 | bwd_inner_microstep: 246.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:49,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.25 | bwd_microstep: 245.97 | bwd_inner_microstep: 245.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:49,866] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.79 | bwd_microstep: 245.82 | bwd_inner_microstep: 245.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:51:50,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.63 | bwd_microstep: 243.03 | bwd_inner_microstep: 243.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:51:50,746] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.75 | bwd_microstep: 242.99 | bwd_inner_microstep: 242.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:51:51,177] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.64 | bwd_microstep: 243.24 | bwd_inner_microstep: 243.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:51:51,610] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.08 | bwd_microstep: 244.51 | bwd_inner_microstep: 244.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:51:52,062] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.46 | optimizer_gradients: 0.77 | optimizer_step: 8.05 +[2024-12-31 18:51:52,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 258.01 | bwd_inner_microstep: 244.29 | bwd_allreduce_microstep: 13.62 | step_microstep: 16.38 +[2024-12-31 18:51:52,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2970.64 | bwd: 4386.85 | bwd_inner: 4372.27 | bwd_allreduce: 13.90 | step: 18.92 + 94%|█████████▍| 714/759 [1:41:47<05:37, 7.50s/it] {'loss': 1.2072, 'learning_rate': 1.8390920127135613e-07, 'epoch': 0.94} + 94%|█████████▍| 714/759 [1:41:47<05:37, 7.50s/it][2024-12-31 18:51:52,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 229.29 | bwd_microstep: 369.58 | bwd_inner_microstep: 369.22 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:51:53,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 228.49 | bwd_microstep: 410.95 | bwd_inner_microstep: 410.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:53,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 198.33 | bwd_microstep: 298.80 | bwd_inner_microstep: 298.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:51:54,328] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.00 | bwd_microstep: 280.00 | bwd_inner_microstep: 279.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:54,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.06 | bwd_microstep: 266.43 | bwd_inner_microstep: 266.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:51:55,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.04 | bwd_microstep: 263.00 | bwd_inner_microstep: 262.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 18:51:55,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.68 | bwd_microstep: 254.94 | bwd_inner_microstep: 254.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:51:56,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.38 | bwd_microstep: 247.10 | bwd_inner_microstep: 247.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:51:56,584] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.91 | bwd_microstep: 245.57 | bwd_inner_microstep: 245.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:57,019] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.85 | bwd_microstep: 246.88 | bwd_inner_microstep: 246.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:51:57,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.68 | bwd_microstep: 248.48 | bwd_inner_microstep: 248.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:57,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.28 | bwd_microstep: 246.32 | bwd_inner_microstep: 246.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:58,323] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.65 | bwd_microstep: 245.31 | bwd_inner_microstep: 245.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:51:58,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.45 | bwd_microstep: 242.79 | bwd_inner_microstep: 242.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:51:59,191] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.99 | bwd_microstep: 253.21 | bwd_inner_microstep: 253.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:51:59,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.02 | optimizer_gradients: 0.61 | optimizer_step: 3.21 +[2024-12-31 18:51:59,653] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.59 | bwd_microstep: 265.42 | bwd_inner_microstep: 243.86 | bwd_allreduce_microstep: 21.49 | step_microstep: 11.32 +[2024-12-31 18:51:59,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2935.14 | bwd: 4384.86 | bwd_inner: 4362.55 | bwd_allreduce: 21.75 | step: 14.00 + 94%|█████████▍| 715/759 [1:41:54<05:31, 7.53s/it] {'loss': 1.2149, 'learning_rate': 1.7585005372517504e-07, 'epoch': 0.94} + 94%|█████████▍| 715/759 [1:41:54<05:31, 7.53s/it][2024-12-31 18:52:00,199] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.88 | bwd_microstep: 327.14 | bwd_inner_microstep: 326.80 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:52:00,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 221.63 | bwd_microstep: 349.34 | bwd_inner_microstep: 349.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:52:01,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.22 | bwd_microstep: 263.10 | bwd_inner_microstep: 263.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:52:01,699] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.20 | bwd_microstep: 254.98 | bwd_inner_microstep: 254.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:52:02,137] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.14 | bwd_microstep: 248.86 | bwd_inner_microstep: 248.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:52:02,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.07 | bwd_microstep: 247.76 | bwd_inner_microstep: 247.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:03,009] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.30 | bwd_microstep: 245.54 | bwd_inner_microstep: 245.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:03,443] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.95 | bwd_microstep: 245.39 | bwd_inner_microstep: 245.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:52:03,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.69 | bwd_microstep: 245.94 | bwd_inner_microstep: 245.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:04,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.44 | bwd_microstep: 243.99 | bwd_inner_microstep: 243.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:04,740] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.71 | bwd_microstep: 243.77 | bwd_inner_microstep: 243.52 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:52:05,172] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 243.56 | bwd_inner_microstep: 243.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:52:05,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.70 | bwd_microstep: 240.69 | bwd_inner_microstep: 240.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:52:06,034] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.71 | bwd_microstep: 242.52 | bwd_inner_microstep: 242.19 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.37 +[2024-12-31 18:52:06,485] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.25 | bwd_microstep: 243.43 | bwd_inner_microstep: 243.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:52:06,931] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.69 | optimizer_gradients: 0.87 | optimizer_step: 3.42 +[2024-12-31 18:52:06,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.77 | bwd_microstep: 255.52 | bwd_inner_microstep: 241.02 | bwd_allreduce_microstep: 14.40 | step_microstep: 11.39 +[2024-12-31 18:52:06,934] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2850.60 | bwd: 4141.70 | bwd_inner: 4125.83 | bwd_allreduce: 15.03 | step: 14.51 + 94%|█████████▍| 716/759 [1:42:01<05:20, 7.45s/it] {'loss': 1.2245, 'learning_rate': 1.6796990016515914e-07, 'epoch': 0.94} + 94%|█████████▍| 716/759 [1:42:01<05:20, 7.45s/it][2024-12-31 18:52:07,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 219.05 | bwd_microstep: 345.30 | bwd_inner_microstep: 344.95 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:52:08,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.20 | bwd_microstep: 287.80 | bwd_inner_microstep: 287.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:52:08,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.10 | bwd_microstep: 285.83 | bwd_inner_microstep: 285.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:52:08,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.30 | bwd_microstep: 261.24 | bwd_inner_microstep: 261.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:52:09,416] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.55 | bwd_microstep: 262.61 | bwd_inner_microstep: 262.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:52:09,864] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.80 | bwd_microstep: 255.63 | bwd_inner_microstep: 255.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:52:10,300] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.27 | bwd_microstep: 248.45 | bwd_inner_microstep: 248.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:52:10,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.85 | bwd_microstep: 248.25 | bwd_inner_microstep: 248.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:52:11,189] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.45 | bwd_microstep: 258.68 | bwd_inner_microstep: 258.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:52:11,635] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.72 | bwd_microstep: 247.48 | bwd_inner_microstep: 247.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:12,068] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.59 | bwd_microstep: 244.90 | bwd_inner_microstep: 244.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:52:12,500] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 243.33 | bwd_inner_microstep: 243.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:12,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.09 | bwd_microstep: 243.52 | bwd_inner_microstep: 243.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:52:13,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.08 | bwd_microstep: 243.99 | bwd_inner_microstep: 243.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:13,770] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 159.92 | bwd_microstep: 228.60 | bwd_inner_microstep: 228.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:52:14,257] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.44 | optimizer_gradients: 0.59 | optimizer_step: 3.32 +[2024-12-31 18:52:14,257] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.18 | bwd_microstep: 290.74 | bwd_inner_microstep: 251.09 | bwd_allreduce_microstep: 39.60 | step_microstep: 11.42 +[2024-12-31 18:52:14,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2838.07 | bwd: 4196.44 | bwd_inner: 4156.02 | bwd_allreduce: 39.84 | step: 14.47 + 94%|█████████▍| 717/759 [1:42:09<05:11, 7.42s/it] {'loss': 1.2107, 'learning_rate': 1.6026888416608267e-07, 'epoch': 0.94} + 94%|█████████▍| 717/759 [1:42:09<05:11, 7.42s/it][2024-12-31 18:52:14,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.90 | bwd_microstep: 342.02 | bwd_inner_microstep: 341.68 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:52:15,330] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.44 | bwd_microstep: 290.76 | bwd_inner_microstep: 290.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:52:15,838] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.44 | bwd_microstep: 304.16 | bwd_inner_microstep: 304.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:52:16,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.18 | bwd_microstep: 257.42 | bwd_inner_microstep: 257.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:16,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.31 | bwd_microstep: 249.19 | bwd_inner_microstep: 249.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:52:17,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.66 | bwd_microstep: 247.46 | bwd_inner_microstep: 247.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:52:17,646] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.59 | bwd_microstep: 286.73 | bwd_inner_microstep: 286.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:18,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.72 | bwd_microstep: 246.46 | bwd_inner_microstep: 246.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:52:18,503] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 244.83 | bwd_inner_microstep: 244.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:18,940] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 248.39 | bwd_inner_microstep: 248.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:52:19,370] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.48 | bwd_microstep: 243.90 | bwd_inner_microstep: 243.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:52:19,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 243.82 | bwd_inner_microstep: 243.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:20,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.15 | bwd_microstep: 243.28 | bwd_inner_microstep: 243.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:20,664] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 245.96 | bwd_inner_microstep: 245.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:52:21,095] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.10 | bwd_microstep: 243.26 | bwd_inner_microstep: 243.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:21,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.44 | optimizer_gradients: 0.61 | optimizer_step: 3.15 +[2024-12-31 18:52:21,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.81 | bwd_microstep: 305.35 | bwd_inner_microstep: 241.75 | bwd_allreduce_microstep: 63.56 | step_microstep: 11.53 +[2024-12-31 18:52:21,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2817.38 | bwd: 4243.05 | bwd_inner: 4178.74 | bwd_allreduce: 63.80 | step: 14.31 + 95%|█████████▍| 718/759 [1:42:16<05:03, 7.39s/it] {'loss': 1.2172, 'learning_rate': 1.5274714603886742e-07, 'epoch': 0.95} + 95%|█████████▍| 718/759 [1:42:16<05:03, 7.39s/it][2024-12-31 18:52:22,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.94 | bwd_microstep: 318.45 | bwd_inner_microstep: 318.09 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.19 +[2024-12-31 18:52:22,638] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.41 | bwd_microstep: 297.65 | bwd_inner_microstep: 297.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:52:23,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 240.12 | bwd_microstep: 265.13 | bwd_inner_microstep: 265.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:23,613] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.65 | bwd_microstep: 256.64 | bwd_inner_microstep: 256.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:52:24,064] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.89 | bwd_microstep: 256.43 | bwd_inner_microstep: 256.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:24,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.19 | bwd_microstep: 248.78 | bwd_inner_microstep: 248.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:52:24,938] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.37 | bwd_microstep: 248.31 | bwd_inner_microstep: 248.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:25,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.87 | bwd_microstep: 246.05 | bwd_inner_microstep: 246.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:52:25,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.00 | bwd_microstep: 249.06 | bwd_inner_microstep: 249.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:26,260] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.48 | bwd_microstep: 254.62 | bwd_inner_microstep: 254.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:26,692] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.71 | bwd_microstep: 244.46 | bwd_inner_microstep: 244.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:52:27,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 245.03 | bwd_inner_microstep: 245.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:52:27,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.90 | bwd_microstep: 244.07 | bwd_inner_microstep: 244.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:52:27,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.54 | bwd_microstep: 243.82 | bwd_inner_microstep: 243.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:28,416] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.69 | bwd_microstep: 242.58 | bwd_inner_microstep: 242.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:52:29,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.60 | optimizer_gradients: 0.69 | optimizer_step: 3.16 +[2024-12-31 18:52:29,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.62 | bwd_microstep: 609.30 | bwd_inner_microstep: 243.81 | bwd_allreduce_microstep: 365.44 | step_microstep: 10.54 +[2024-12-31 18:52:29,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2865.58 | bwd: 4470.51 | bwd_inner: 4104.08 | bwd_allreduce: 365.75 | step: 13.41 + 95%|█████████▍| 719/759 [1:42:24<04:58, 7.46s/it] {'loss': 1.2217, 'learning_rate': 1.4540482282803136e-07, 'epoch': 0.95} + 95%|█████████▍| 719/759 [1:42:24<04:58, 7.46s/it][2024-12-31 18:52:29,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 243.49 | bwd_microstep: 398.65 | bwd_inner_microstep: 398.28 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:52:30,373] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.24 | bwd_microstep: 291.93 | bwd_inner_microstep: 291.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:52:30,840] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.03 | bwd_microstep: 267.72 | bwd_inner_microstep: 267.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:31,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.04 | bwd_microstep: 262.45 | bwd_inner_microstep: 262.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:31,761] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.59 | bwd_microstep: 255.36 | bwd_inner_microstep: 255.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:52:32,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.08 | bwd_microstep: 256.80 | bwd_inner_microstep: 256.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:32,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.29 | bwd_microstep: 247.02 | bwd_inner_microstep: 246.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:52:33,083] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.59 | bwd_microstep: 246.36 | bwd_inner_microstep: 246.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:52:33,521] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.91 | bwd_microstep: 246.66 | bwd_inner_microstep: 246.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:52:33,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.91 | bwd_microstep: 248.22 | bwd_inner_microstep: 248.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:52:34,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.38 | bwd_microstep: 245.00 | bwd_inner_microstep: 244.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:52:34,823] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.91 | bwd_microstep: 243.31 | bwd_inner_microstep: 243.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:35,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.22 | bwd_microstep: 243.53 | bwd_inner_microstep: 243.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:35,694] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.65 | bwd_microstep: 246.36 | bwd_inner_microstep: 246.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:36,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.91 | bwd_microstep: 244.28 | bwd_inner_microstep: 244.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:36,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.67 | optimizer_gradients: 0.66 | optimizer_step: 3.27 +[2024-12-31 18:52:36,577] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.93 | bwd_microstep: 257.08 | bwd_inner_microstep: 243.38 | bwd_allreduce_microstep: 13.57 | step_microstep: 11.19 +[2024-12-31 18:52:36,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2870.57 | bwd: 4200.87 | bwd_inner: 4186.24 | bwd_allreduce: 13.89 | step: 14.14 + 95%|████████��▍| 720/759 [1:42:31<04:49, 7.43s/it] {'loss': 1.235, 'learning_rate': 1.3824204830918952e-07, 'epoch': 0.95} + 95%|█████████▍| 720/759 [1:42:31<04:49, 7.43s/it][2024-12-31 18:52:37,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 237.90 | bwd_microstep: 389.37 | bwd_inner_microstep: 389.02 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:52:37,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.46 | bwd_microstep: 301.98 | bwd_inner_microstep: 301.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:52:38,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.87 | bwd_microstep: 292.72 | bwd_inner_microstep: 292.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.10 +[2024-12-31 18:52:38,716] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.63 | bwd_microstep: 268.32 | bwd_inner_microstep: 268.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:39,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.18 | bwd_microstep: 267.50 | bwd_inner_microstep: 267.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:39,633] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.02 | bwd_microstep: 254.72 | bwd_inner_microstep: 254.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:52:40,071] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.82 | bwd_microstep: 248.73 | bwd_inner_microstep: 248.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:40,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.10 | bwd_microstep: 248.11 | bwd_inner_microstep: 248.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:40,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.62 | bwd_microstep: 245.68 | bwd_inner_microstep: 245.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:52:41,390] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 254.93 | bwd_inner_microstep: 254.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:52:41,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.10 | bwd_microstep: 253.63 | bwd_inner_microstep: 253.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:52:42,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.07 | bwd_microstep: 244.96 | bwd_inner_microstep: 244.74 | bwd_allreduce_microstep: 0.12 | step_microstep: 0.20 +[2024-12-31 18:52:42,698] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.58 | bwd_microstep: 245.18 | bwd_inner_microstep: 245.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:43,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.09 | bwd_microstep: 241.37 | bwd_inner_microstep: 241.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:52:43,558] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.96 | bwd_microstep: 240.95 | bwd_inner_microstep: 240.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:44,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.73 | optimizer_gradients: 0.63 | optimizer_step: 3.32 +[2024-12-31 18:52:44,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.59 | bwd_microstep: 255.78 | bwd_inner_microstep: 242.16 | bwd_allreduce_microstep: 13.53 | step_microstep: 10.56 +[2024-12-31 18:52:44,010] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2874.48 | bwd: 4254.10 | bwd_inner: 4239.34 | bwd_allreduce: 13.96 | step: 13.58 + 95%|█████████▍| 721/759 [1:42:38<04:42, 7.43s/it] {'loss': 1.228, 'learning_rate': 1.3125895298661705e-07, 'epoch': 0.95} + 95%|█████████▍| 721/759 [1:42:38<04:42, 7.43s/it][2024-12-31 18:52:44,583] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 218.03 | bwd_microstep: 340.48 | bwd_inner_microstep: 340.10 | bwd_allreduce_microstep: 0.16 | step_microstep: 0.19 +[2024-12-31 18:52:45,275] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 252.82 | bwd_microstep: 422.13 | bwd_inner_microstep: 422.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:52:45,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.99 | bwd_microstep: 287.19 | bwd_inner_microstep: 287.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:52:46,242] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.85 | bwd_microstep: 267.27 | bwd_inner_microstep: 267.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:46,695] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.15 | bwd_microstep: 259.01 | bwd_inner_microstep: 258.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:47,146] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.00 | bwd_microstep: 261.58 | bwd_inner_microstep: 261.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:52:47,583] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.60 | bwd_microstep: 249.12 | bwd_inner_microstep: 249.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:52:48,017] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.73 | bwd_microstep: 245.70 | bwd_inner_microstep: 245.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:48,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 253.36 | bwd_inner_microstep: 253.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:48,903] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.23 | bwd_microstep: 251.79 | bwd_inner_microstep: 251.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:49,334] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 243.28 | bwd_inner_microstep: 243.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:52:49,766] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 243.64 | bwd_inner_microstep: 243.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:50,199] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.77 | bwd_microstep: 245.00 | bwd_inner_microstep: 244.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:52:50,636] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.88 | bwd_microstep: 250.20 | bwd_inner_microstep: 250.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:51,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.38 | bwd_microstep: 242.78 | bwd_inner_microstep: 242.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:51,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.72 | optimizer_step: 6.36 +[2024-12-31 18:52:51,521] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.58 | bwd_microstep: 255.17 | bwd_inner_microstep: 241.34 | bwd_allreduce_microstep: 13.66 | step_microstep: 14.42 +[2024-12-31 18:52:51,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2900.42 | bwd: 4317.80 | bwd_inner: 4303.16 | bwd_allreduce: 14.01 | step: 17.46 + 95%|█████████▌| 722/759 [1:42:46<04:35, 7.46s/it] {'loss': 1.2113, 'learning_rate': 1.244556640908712e-07, 'epoch': 0.95} + 95%|█████████▌| 722/759 [1:42:46<04:35, 7.46s/it][2024-12-31 18:52:52,096] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.19 | bwd_microstep: 340.26 | bwd_inner_microstep: 339.89 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:52:52,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.02 | bwd_microstep: 287.34 | bwd_inner_microstep: 287.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:52:53,062] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.53 | bwd_microstep: 269.76 | bwd_inner_microstep: 269.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:52:53,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.06 | bwd_microstep: 257.30 | bwd_inner_microstep: 257.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:53,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.79 | bwd_microstep: 268.08 | bwd_inner_microstep: 268.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:52:54,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.93 | bwd_microstep: 247.50 | bwd_inner_microstep: 247.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:52:54,851] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.92 | bwd_microstep: 248.65 | bwd_inner_microstep: 248.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:52:55,283] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.43 | bwd_microstep: 247.55 | bwd_inner_microstep: 247.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:55,723] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.61 | bwd_microstep: 246.34 | bwd_inner_microstep: 246.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:52:56,157] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.76 | bwd_microstep: 246.97 | bwd_inner_microstep: 246.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:52:56,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.92 | bwd_microstep: 258.64 | bwd_inner_microstep: 258.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:52:57,043] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.86 | bwd_microstep: 243.98 | bwd_inner_microstep: 243.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:52:57,476] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.92 | bwd_microstep: 244.88 | bwd_inner_microstep: 244.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:52:57,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.25 | bwd_microstep: 244.71 | bwd_inner_microstep: 244.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 18:52:58,476] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.40 | bwd_microstep: 385.30 | bwd_inner_microstep: 385.11 | bwd_allreduce_microstep: 0.08 | step_microstep: 0.22 +[2024-12-31 18:52:58,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.83 | optimizer_gradients: 1.12 | optimizer_step: 8.49 +[2024-12-31 18:52:58,934] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.95 | bwd_microstep: 256.00 | bwd_inner_microstep: 242.11 | bwd_allreduce_microstep: 13.73 | step_microstep: 18.57 +[2024-12-31 18:52:58,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2822.94 | bwd: 4293.43 | bwd_inner: 4278.53 | bwd_allreduce: 14.15 | step: 21.24 + 95%|█████████▌| 723/759 [1:42:53<04:27, 7.44s/it] {'loss': 1.1956, 'learning_rate': 1.1783230557647075e-07, 'epoch': 0.95} + 95%|█████████▌| 723/759 [1:42:53<04:27, 7.44s/it][2024-12-31 18:52:59,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 219.64 | bwd_microstep: 345.55 | bwd_inner_microstep: 345.19 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 18:53:00,132] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 226.27 | bwd_microstep: 362.96 | bwd_inner_microstep: 362.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:53:00,601] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.23 | bwd_microstep: 269.46 | bwd_inner_microstep: 269.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:01,066] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.73 | bwd_microstep: 266.24 | bwd_inner_microstep: 266.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:53:01,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.33 | bwd_microstep: 248.37 | bwd_inner_microstep: 248.22 | bwd_allreduce_microstep: 0.04 | step_microstep: 0.20 +[2024-12-31 18:53:01,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.93 | bwd_microstep: 263.93 | bwd_inner_microstep: 263.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:53:02,405] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 245.07 | bwd_inner_microstep: 245.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:53:02,827] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.44 | bwd_microstep: 245.47 | bwd_inner_microstep: 245.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:53:03,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.05 | bwd_microstep: 244.15 | bwd_inner_microstep: 244.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:53:03,668] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.27 | bwd_microstep: 243.46 | bwd_inner_microstep: 243.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:53:04,089] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.16 | bwd_microstep: 243.59 | bwd_inner_microstep: 243.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:53:04,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.76 | bwd_microstep: 312.24 | bwd_inner_microstep: 312.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:53:04,998] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.48 | bwd_microstep: 242.71 | bwd_inner_microstep: 242.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:53:05,416] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.12 | bwd_microstep: 242.97 | bwd_inner_microstep: 242.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:53:05,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.02 | bwd_microstep: 241.18 | bwd_inner_microstep: 241.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:53:06,394] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.64 | optimizer_gradients: 1.30 | optimizer_step: 3.13 +[2024-12-31 18:53:06,395] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.96 | bwd_microstep: 376.50 | bwd_inner_microstep: 242.12 | bwd_allreduce_microstep: 134.34 | step_microstep: 11.42 +[2024-12-31 18:53:06,396] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2874.17 | bwd: 4394.01 | bwd_inner: 4258.58 | bwd_allreduce: 134.68 | step: 12.81 + 95%|█████████▌| 724/759 [1:43:01<04:20, 7.45s/it] {'loss': 1.1961, 'learning_rate': 1.1138899811964477e-07, 'epoch': 0.95} + 95%|█████████▌| 724/759 [1:43:01<04:20, 7.45s/it][2024-12-31 18:53:06,973] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 218.06 | bwd_microstep: 344.08 | bwd_inner_microstep: 343.97 | bwd_allreduce_microstep: 0.05 | step_microstep: 0.07 +[2024-12-31 18:53:07,484] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 199.44 | bwd_microstep: 305.43 | bwd_inner_microstep: 305.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:53:07,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 186.67 | bwd_microstep: 306.12 | bwd_inner_microstep: 306.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:53:08,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.82 | bwd_microstep: 261.77 | bwd_inner_microstep: 261.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:53:08,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.53 | bwd_microstep: 261.49 | bwd_inner_microstep: 261.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:53:09,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.48 | bwd_microstep: 255.34 | bwd_inner_microstep: 255.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:53:09,754] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.26 | bwd_microstep: 249.15 | bwd_inner_microstep: 249.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:53:10,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.53 | bwd_microstep: 246.42 | bwd_inner_microstep: 246.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:53:10,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.40 | bwd_microstep: 245.14 | bwd_inner_microstep: 245.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:53:11,031] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.94 | bwd_microstep: 244.75 | bwd_inner_microstep: 244.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:53:11,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.79 | bwd_microstep: 244.38 | bwd_inner_microstep: 244.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:53:11,888] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.21 | bwd_microstep: 254.54 | bwd_inner_microstep: 254.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:53:12,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.66 | bwd_microstep: 242.72 | bwd_inner_microstep: 242.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:53:12,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.80 | bwd_microstep: 243.30 | bwd_inner_microstep: 243.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:53:13,153] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.81 | bwd_microstep: 243.72 | bwd_inner_microstep: 243.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:53:14,099] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.67 | optimizer_gradients: 0.64 | optimizer_step: 3.09 +[2024-12-31 18:53:14,099] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.44 | bwd_microstep: 726.72 | bwd_inner_microstep: 243.95 | bwd_allreduce_microstep: 482.73 | step_microstep: 11.74 +[2024-12-31 18:53:14,101] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2903.64 | bwd: 4675.09 | bwd_inner: 4191.81 | bwd_allreduce: 482.88 | step: 12.39 + 96%|█████████▌| 725/759 [1:43:09<04:15, 7.52s/it] {'loss': 1.2284, 'learning_rate': 1.0512585911612416e-07, 'epoch': 0.96} + 96%|█████████▌| 725/759 [1:43:09<04:15, 7.52s/it][2024-12-31 18:53:14,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 219.56 | bwd_microstep: 345.22 | bwd_inner_microstep: 345.05 | bwd_allreduce_microstep: 0.07 | step_microstep: 0.08 +[2024-12-31 18:53:15,180] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.35 | bwd_microstep: 295.88 | bwd_inner_microstep: 295.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:53:15,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.66 | bwd_microstep: 268.44 | bwd_inner_microstep: 268.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:53:16,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.99 | bwd_microstep: 262.63 | bwd_inner_microstep: 262.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:53:16,559] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.09 | bwd_microstep: 257.11 | bwd_inner_microstep: 257.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:16,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.93 | bwd_microstep: 248.73 | bwd_inner_microstep: 248.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:53:17,443] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.43 | bwd_microstep: 249.17 | bwd_inner_microstep: 249.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:53:17,878] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.71 | bwd_microstep: 247.47 | bwd_inner_microstep: 247.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:18,316] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.33 | bwd_microstep: 247.68 | bwd_inner_microstep: 247.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:18,749] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.83 | bwd_microstep: 245.35 | bwd_inner_microstep: 245.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:19,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.24 | bwd_microstep: 245.43 | bwd_inner_microstep: 245.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:53:19,613] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.52 | bwd_microstep: 244.28 | bwd_inner_microstep: 244.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:53:20,059] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.25 | bwd_microstep: 257.27 | bwd_inner_microstep: 257.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:20,489] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 243.46 | bwd_inner_microstep: 243.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:20,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.07 | bwd_microstep: 247.15 | bwd_inner_microstep: 247.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:21,613] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.08 | optimizer_gradients: 0.60 | optimizer_step: 3.19 +[2024-12-31 18:53:21,613] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.22 | bwd_microstep: 493.07 | bwd_inner_microstep: 247.54 | bwd_allreduce_microstep: 245.48 | step_microstep: 11.03 +[2024-12-31 18:53:21,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2842.45 | bwd: 4398.38 | bwd_inner: 4152.27 | bwd_allreduce: 245.67 | step: 13.82 + 96%|█████████▌| 726/759 [1:43:16<04:08, 7.52s/it] {'loss': 1.2261, 'learning_rate': 9.904300267901012e-08, 'epoch': 0.96} + 96%|█████████▌| 726/759 [1:43:16<04:08, 7.52s/it][2024-12-31 18:53:22,420] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 289.80 | bwd_microstep: 502.72 | bwd_inner_microstep: 502.38 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.27 +[2024-12-31 18:53:22,923] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 193.63 | bwd_microstep: 292.84 | bwd_inner_microstep: 292.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:53:23,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.75 | bwd_microstep: 283.63 | bwd_inner_microstep: 283.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:53:23,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.65 | bwd_microstep: 277.16 | bwd_inner_microstep: 277.14 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:53:24,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.40 | bwd_microstep: 255.01 | bwd_inner_microstep: 254.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:53:24,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.58 | bwd_microstep: 246.12 | bwd_inner_microstep: 246.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:53:25,218] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.91 | bwd_microstep: 248.26 | bwd_inner_microstep: 248.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:25,661] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.56 | bwd_microstep: 255.57 | bwd_inner_microstep: 255.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:53:26,094] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.33 | bwd_microstep: 244.58 | bwd_inner_microstep: 244.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:26,546] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.94 | bwd_microstep: 262.42 | bwd_inner_microstep: 262.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:26,980] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.03 | bwd_microstep: 247.20 | bwd_inner_microstep: 247.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:27,418] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.93 | bwd_microstep: 249.01 | bwd_inner_microstep: 248.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:27,851] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.58 | bwd_microstep: 245.02 | bwd_inner_microstep: 245.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:53:28,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.49 | bwd_microstep: 243.44 | bwd_inner_microstep: 243.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:28,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.33 | bwd_microstep: 242.41 | bwd_inner_microstep: 242.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:53:29,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.60 | optimizer_gradients: 0.65 | optimizer_step: 3.59 +[2024-12-31 18:53:29,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 257.21 | bwd_inner_microstep: 243.60 | bwd_allreduce_microstep: 13.52 | step_microstep: 11.13 +[2024-12-31 18:53:29,176] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2917.48 | bwd: 4352.68 | bwd_inner: 4338.31 | bwd_allreduce: 13.78 | step: 14.37 + 96%|█████████▌| 727/759 [1:43:24<04:01, 7.53s/it] {'loss': 1.2033, 'learning_rate': 9.314053963669245e-08, 'epoch': 0.96} + 96%|█████████▌| 727/759 [1:43:24<04:01, 7.53s/it][2024-12-31 18:53:29,759] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 219.42 | bwd_microstep: 349.11 | bwd_inner_microstep: 348.77 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:53:30,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.92 | bwd_microstep: 292.05 | bwd_inner_microstep: 292.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:53:30,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.51 | bwd_microstep: 282.46 | bwd_inner_microstep: 282.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:53:31,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.96 | bwd_microstep: 258.06 | bwd_inner_microstep: 258.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:53:31,656] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.28 | bwd_microstep: 252.93 | bwd_inner_microstep: 252.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:53:32,096] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.46 | bwd_microstep: 250.44 | bwd_inner_microstep: 250.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:53:32,531] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.16 | bwd_microstep: 246.65 | bwd_inner_microstep: 246.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:53:32,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.63 | bwd_microstep: 245.91 | bwd_inner_microstep: 245.88 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:53:33,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.27 | bwd_microstep: 247.37 | bwd_inner_microstep: 247.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:53:33,837] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.31 | bwd_microstep: 247.57 | bwd_inner_microstep: 247.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:53:34,280] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.12 | bwd_microstep: 244.82 | bwd_inner_microstep: 244.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:34,715] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.72 | bwd_microstep: 247.61 | bwd_inner_microstep: 247.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:35,146] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.76 | bwd_microstep: 245.08 | bwd_inner_microstep: 245.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:53:35,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.77 | bwd_microstep: 242.45 | bwd_inner_microstep: 242.42 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:53:36,021] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.54 | bwd_microstep: 243.75 | bwd_inner_microstep: 243.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:36,475] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.59 | optimizer_gradients: 0.67 | optimizer_step: 3.37 +[2024-12-31 18:53:36,476] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 257.54 | bwd_inner_microstep: 243.78 | bwd_allreduce_microstep: 13.59 | step_microstep: 11.03 +[2024-12-31 18:53:36,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2848.94 | bwd: 4153.97 | bwd_inner: 4139.35 | bwd_allreduce: 13.92 | step: 14.08 + 96%|█████████▌| 728/759 [1:43:31<03:51, 7.46s/it] {'loss': 1.228, 'learning_rate': 8.741857753083228e-08, 'epoch': 0.96} + 96%|█████████▌| 728/759 [1:43:31<03:51, 7.46s/it][2024-12-31 18:53:37,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 221.43 | bwd_microstep: 350.70 | bwd_inner_microstep: 350.36 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:53:37,632] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.21 | bwd_microstep: 357.37 | bwd_inner_microstep: 357.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:53:38,117] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.40 | bwd_microstep: 281.68 | bwd_inner_microstep: 281.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:53:38,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.99 | bwd_microstep: 257.36 | bwd_inner_microstep: 257.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:53:39,009] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.52 | bwd_microstep: 251.21 | bwd_inner_microstep: 250.88 | bwd_allreduce_microstep: 0.20 | step_microstep: 0.26 +[2024-12-31 18:53:39,449] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.57 | bwd_microstep: 247.64 | bwd_inner_microstep: 247.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.10 +[2024-12-31 18:53:39,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.10 | bwd_microstep: 247.53 | bwd_inner_microstep: 247.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:40,330] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.00 | bwd_microstep: 250.56 | bwd_inner_microstep: 250.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:53:40,766] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.96 | bwd_microstep: 245.81 | bwd_inner_microstep: 245.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:41,199] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.13 | bwd_microstep: 244.29 | bwd_inner_microstep: 244.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:41,631] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.08 | bwd_microstep: 243.28 | bwd_inner_microstep: 243.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:53:42,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.14 | bwd_microstep: 243.23 | bwd_inner_microstep: 243.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:42,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.82 | bwd_microstep: 242.93 | bwd_inner_microstep: 242.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:53:42,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.02 | bwd_microstep: 244.14 | bwd_inner_microstep: 244.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:53:43,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.92 | bwd_microstep: 243.44 | bwd_inner_microstep: 243.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:43,843] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.08 | optimizer_gradients: 0.57 | optimizer_step: 3.15 +[2024-12-31 18:53:43,844] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.24 | bwd_microstep: 283.83 | bwd_inner_microstep: 242.56 | bwd_allreduce_microstep: 41.22 | step_microstep: 17.76 +[2024-12-31 18:53:43,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2829.96 | bwd: 4235.15 | bwd_inner: 4192.75 | bwd_allreduce: 41.72 | step: 20.88 + 96%|█████████▌| 729/759 [1:43:38<03:43, 7.43s/it] {'loss': 1.2217, 'learning_rate': 8.187722061439806e-08, 'epoch': 0.96} + 96%|█████████▌| 729/759 [1:43:38<03:43, 7.43s/it][2024-12-31 18:53:44,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 212.57 | bwd_microstep: 334.84 | bwd_inner_microstep: 334.51 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:53:45,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 242.92 | bwd_microstep: 400.89 | bwd_inner_microstep: 400.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:53:45,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 195.76 | bwd_microstep: 283.89 | bwd_inner_microstep: 283.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:53:46,016] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.96 | bwd_microstep: 254.31 | bwd_inner_microstep: 254.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:53:46,468] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.06 | bwd_microstep: 256.16 | bwd_inner_microstep: 256.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:53:46,919] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 264.46 | bwd_inner_microstep: 264.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.03 +[2024-12-31 18:53:47,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.99 | bwd_microstep: 256.61 | bwd_inner_microstep: 256.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:53:47,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 244.60 | bwd_inner_microstep: 244.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:53:48,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.33 | bwd_microstep: 245.41 | bwd_inner_microstep: 245.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:48,647] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 244.66 | bwd_inner_microstep: 244.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:53:49,083] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.62 | bwd_microstep: 249.73 | bwd_inner_microstep: 249.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:53:49,512] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.00 | bwd_microstep: 244.03 | bwd_inner_microstep: 244.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:53:49,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.29 | bwd_microstep: 243.23 | bwd_inner_microstep: 243.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:50,370] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.90 | bwd_microstep: 241.52 | bwd_inner_microstep: 241.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:53:50,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.96 | bwd_microstep: 243.43 | bwd_inner_microstep: 243.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:51,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.16 | optimizer_gradients: 0.64 | optimizer_step: 3.24 +[2024-12-31 18:53:51,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.27 | bwd_microstep: 265.86 | bwd_inner_microstep: 241.49 | bwd_allreduce_microstep: 24.28 | step_microstep: 11.07 +[2024-12-31 18:53:51,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2869.45 | bwd: 4273.72 | bwd_inner: 4248.55 | bwd_allreduce: 24.54 | step: 13.78 + 96%|█████████▌| 730/759 [1:43:46<03:35, 7.43s/it] {'loss': 1.2252, 'learning_rate': 7.651656984977051e-08, 'epoch': 0.96} + 96%|█████████▌| 730/759 [1:43:46<03:35, 7.43s/it][2024-12-31 18:53:51,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.35 | bwd_microstep: 313.37 | bwd_inner_microstep: 313.23 | bwd_allreduce_microstep: 0.06 | step_microstep: 0.09 +[2024-12-31 18:53:52,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.01 | bwd_microstep: 288.05 | bwd_inner_microstep: 288.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:53:52,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.36 | bwd_microstep: 270.27 | bwd_inner_microstep: 270.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:53:53,204] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.87 | bwd_microstep: 254.86 | bwd_inner_microstep: 254.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:53:53,656] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.28 | bwd_microstep: 256.60 | bwd_inner_microstep: 256.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:54,094] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.71 | bwd_microstep: 248.59 | bwd_inner_microstep: 248.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:53:54,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 245.62 | bwd_inner_microstep: 245.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:53:54,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.51 | bwd_microstep: 245.79 | bwd_inner_microstep: 245.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:55,405] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.91 | bwd_microstep: 248.90 | bwd_inner_microstep: 248.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:53:55,838] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.46 | bwd_microstep: 244.71 | bwd_inner_microstep: 244.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:53:56,271] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.28 | bwd_microstep: 244.76 | bwd_inner_microstep: 244.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:56,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.79 | bwd_microstep: 243.53 | bwd_inner_microstep: 243.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:57,128] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.59 | bwd_microstep: 241.53 | bwd_inner_microstep: 241.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:53:57,558] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.54 | bwd_microstep: 240.85 | bwd_inner_microstep: 240.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:53:57,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.55 | bwd_microstep: 240.76 | bwd_inner_microstep: 240.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:53:58,444] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.58 | optimizer_gradients: 0.69 | optimizer_step: 3.55 +[2024-12-31 18:53:58,444] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.17 | bwd_microstep: 257.67 | bwd_inner_microstep: 243.94 | bwd_allreduce_microstep: 13.61 | step_microstep: 11.54 +[2024-12-31 18:53:58,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2794.43 | bwd: 4086.01 | bwd_inner: 4071.57 | bwd_allreduce: 13.82 | step: 14.32 + 96%|█████████▋| 731/759 [1:43:53<03:25, 7.36s/it] {'loss': 1.2155, 'learning_rate': 7.133672290690064e-08, 'epoch': 0.96} + 96%|█████████▋| 731/759 [1:43:53<03:25, 7.36s/it][2024-12-31 18:53:59,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 220.55 | bwd_microstep: 347.80 | bwd_inner_microstep: 347.36 | bwd_allreduce_microstep: 0.17 | step_microstep: 0.26 +[2024-12-31 18:53:59,540] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.37 | bwd_microstep: 301.57 | bwd_inner_microstep: 301.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:53:59,991] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.54 | bwd_microstep: 257.40 | bwd_inner_microstep: 257.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:00,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.62 | bwd_microstep: 259.78 | bwd_inner_microstep: 259.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:00,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.86 | bwd_microstep: 248.42 | bwd_inner_microstep: 248.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:54:01,323] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.82 | bwd_microstep: 249.31 | bwd_inner_microstep: 249.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:01,770] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.68 | bwd_microstep: 245.85 | bwd_inner_microstep: 245.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:54:02,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.91 | bwd_microstep: 250.59 | bwd_inner_microstep: 250.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:54:02,645] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.03 | bwd_microstep: 245.04 | bwd_inner_microstep: 244.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:54:03,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.55 | bwd_microstep: 245.28 | bwd_inner_microstep: 245.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:03,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.52 | bwd_microstep: 247.70 | bwd_inner_microstep: 247.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:03,949] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.96 | bwd_microstep: 246.04 | bwd_inner_microstep: 246.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:54:04,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 244.44 | bwd_inner_microstep: 244.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:54:04,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.02 | bwd_microstep: 243.34 | bwd_inner_microstep: 242.80 | bwd_allreduce_microstep: 0.30 | step_microstep: 0.27 +[2024-12-31 18:54:05,247] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.19 | bwd_microstep: 245.11 | bwd_inner_microstep: 245.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:54:05,780] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 7.42 | optimizer_gradients: 0.56 | optimizer_step: 3.09 +[2024-12-31 18:54:05,780] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.19 | bwd_microstep: 335.49 | bwd_inner_microstep: 242.35 | bwd_allreduce_microstep: 93.10 | step_microstep: 13.85 +[2024-12-31 18:54:05,782] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2821.98 | bwd: 4213.43 | bwd_inner: 4118.65 | bwd_allreduce: 93.59 | step: 17.10 + 96%|█████████▋| 732/759 [1:44:00<03:18, 7.35s/it] {'loss': 1.2393, 'learning_rate': 6.633777416153232e-08, 'epoch': 0.96} + 96%|█████████▋| 732/759 [1:44:00<03:18, 7.35s/it][2024-12-31 18:54:06,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 212.01 | bwd_microstep: 336.30 | bwd_inner_microstep: 335.93 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 18:54:06,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.16 | bwd_microstep: 316.31 | bwd_inner_microstep: 316.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:54:07,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.56 | bwd_microstep: 269.12 | bwd_inner_microstep: 269.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:07,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.20 | bwd_microstep: 256.73 | bwd_inner_microstep: 256.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:08,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.29 | bwd_microstep: 247.67 | bwd_inner_microstep: 247.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:54:08,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.72 | bwd_microstep: 247.36 | bwd_inner_microstep: 247.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:54:09,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.31 | bwd_microstep: 247.25 | bwd_inner_microstep: 247.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:09,626] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.08 | bwd_microstep: 256.15 | bwd_inner_microstep: 256.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:10,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.82 | bwd_microstep: 244.21 | bwd_inner_microstep: 244.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:54:10,506] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.01 | bwd_microstep: 256.63 | bwd_inner_microstep: 256.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:10,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.91 | bwd_microstep: 244.42 | bwd_inner_microstep: 244.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:54:11,370] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.10 | bwd_microstep: 244.58 | bwd_inner_microstep: 244.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:11,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.94 | bwd_microstep: 245.25 | bwd_inner_microstep: 245.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:12,265] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.50 | bwd_microstep: 274.04 | bwd_inner_microstep: 273.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:54:12,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.86 | bwd_microstep: 252.77 | bwd_inner_microstep: 252.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:54:13,575] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.41 | optimizer_gradients: 0.65 | optimizer_step: 3.46 +[2024-12-31 18:54:13,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.03 | bwd_microstep: 673.14 | bwd_inner_microstep: 241.61 | bwd_allreduce_microstep: 431.47 | step_microstep: 13.00 +[2024-12-31 18:54:13,577] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2824.83 | bwd: 4612.12 | bwd_inner: 4179.63 | bwd_allreduce: 431.75 | step: 16.19 + 97%|█████████▋| 733/759 [1:44:08<03:14, 7.48s/it] {'loss': 1.2009, 'learning_rate': 6.151981469348034e-08, 'epoch': 0.97} + 97%|█████████▋| 733/759 [1:44:08<03:14, 7.48s/it][2024-12-31 18:54:14,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.69 | bwd_microstep: 313.50 | bwd_inner_microstep: 313.17 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:54:14,611] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.41 | bwd_microstep: 293.06 | bwd_inner_microstep: 293.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:54:15,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.81 | bwd_microstep: 291.02 | bwd_inner_microstep: 291.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:54:15,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.82 | bwd_microstep: 267.37 | bwd_inner_microstep: 267.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:16,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.36 | bwd_microstep: 270.41 | bwd_inner_microstep: 270.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:54:16,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.76 | bwd_microstep: 249.48 | bwd_inner_microstep: 249.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:16,913] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.44 | bwd_microstep: 245.99 | bwd_inner_microstep: 245.96 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.09 +[2024-12-31 18:54:17,350] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.68 | bwd_microstep: 247.63 | bwd_inner_microstep: 247.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:54:17,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 244.37 | bwd_inner_microstep: 244.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:18,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.17 | bwd_microstep: 248.18 | bwd_inner_microstep: 248.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:18,659] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.87 | bwd_microstep: 246.22 | bwd_inner_microstep: 246.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:19,092] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.94 | bwd_microstep: 245.16 | bwd_inner_microstep: 245.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:19,545] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.72 | bwd_microstep: 265.07 | bwd_inner_microstep: 265.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:54:19,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 243.23 | bwd_inner_microstep: 243.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:20,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.14 | bwd_microstep: 245.08 | bwd_inner_microstep: 245.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:20,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.88 | optimizer_gradients: 0.71 | optimizer_step: 3.18 +[2024-12-31 18:54:20,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.88 | bwd_microstep: 317.45 | bwd_inner_microstep: 241.71 | bwd_allreduce_microstep: 75.70 | step_microstep: 11.07 +[2024-12-31 18:54:20,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2818.60 | bwd: 4233.43 | bwd_inner: 4156.73 | bwd_allreduce: 75.94 | step: 14.12 + 97%|█████████▋| 734/759 [1:44:15<03:06, 7.44s/it] {'loss': 1.2215, 'learning_rate': 5.688293228497399e-08, 'epoch': 0.97} + 97%|█████████▋| 734/759 [1:44:15<03:06, 7.44s/it][2024-12-31 18:54:21,456] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 201.71 | bwd_microstep: 311.73 | bwd_inner_microstep: 311.35 | bwd_allreduce_microstep: 0.17 | step_microstep: 0.18 +[2024-12-31 18:54:21,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.77 | bwd_microstep: 282.25 | bwd_inner_microstep: 282.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:54:22,397] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.65 | bwd_microstep: 257.19 | bwd_inner_microstep: 257.12 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:22,849] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.48 | bwd_microstep: 255.69 | bwd_inner_microstep: 255.50 | bwd_allreduce_microstep: 0.03 | step_microstep: 0.20 +[2024-12-31 18:54:23,291] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.96 | bwd_microstep: 250.14 | bwd_inner_microstep: 250.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:54:23,735] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.54 | bwd_microstep: 251.97 | bwd_inner_microstep: 251.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:24,172] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.43 | bwd_microstep: 244.97 | bwd_inner_microstep: 244.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:54:24,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.17 | bwd_microstep: 246.02 | bwd_inner_microstep: 245.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:25,042] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.94 | bwd_microstep: 244.10 | bwd_inner_microstep: 244.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:25,478] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.95 | bwd_microstep: 246.35 | bwd_inner_microstep: 246.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:25,917] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 248.19 | bwd_inner_microstep: 248.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:26,351] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 246.34 | bwd_inner_microstep: 246.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:54:26,783] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.89 | bwd_microstep: 243.90 | bwd_inner_microstep: 243.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:27,209] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.00 | bwd_microstep: 240.88 | bwd_inner_microstep: 240.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:27,638] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.89 | bwd_microstep: 241.52 | bwd_inner_microstep: 241.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:54:28,119] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.33 | optimizer_gradients: 0.62 | optimizer_step: 13.14 +[2024-12-31 18:54:28,119] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.19 | bwd_microstep: 271.91 | bwd_inner_microstep: 245.82 | bwd_allreduce_microstep: 26.02 | step_microstep: 22.14 +[2024-12-31 18:54:28,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2793.70 | bwd: 4083.37 | bwd_inner: 4056.03 | bwd_allreduce: 26.45 | step: 25.37 + 97%|█████████▋| 735/759 [1:44:23<02:56, 7.37s/it] {'loss': 1.2348, 'learning_rate': 5.2427211419051605e-08, 'epoch': 0.97} + 97%|█████████▋| 735/759 [1:44:23<02:56, 7.37s/it][2024-12-31 18:54:28,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 241.83 | bwd_microstep: 315.67 | bwd_inner_microstep: 315.34 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:54:29,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.29 | bwd_microstep: 290.61 | bwd_inner_microstep: 290.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:54:29,662] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.85 | bwd_microstep: 266.48 | bwd_inner_microstep: 266.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.10 +[2024-12-31 18:54:30,110] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.69 | bwd_microstep: 254.18 | bwd_inner_microstep: 254.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:54:30,549] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.62 | bwd_microstep: 249.50 | bwd_inner_microstep: 249.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:30,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 246.11 | bwd_inner_microstep: 246.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:31,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.86 | bwd_microstep: 245.84 | bwd_inner_microstep: 245.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:31,854] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.27 | bwd_microstep: 248.20 | bwd_inner_microstep: 248.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:54:32,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.99 | bwd_microstep: 248.03 | bwd_inner_microstep: 248.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:54:32,727] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.56 | bwd_microstep: 244.69 | bwd_inner_microstep: 244.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:33,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.20 | bwd_microstep: 246.67 | bwd_inner_microstep: 246.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:33,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.87 | bwd_microstep: 244.99 | bwd_inner_microstep: 244.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:34,027] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.70 | bwd_microstep: 243.79 | bwd_inner_microstep: 243.43 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:54:34,463] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.91 | bwd_microstep: 246.84 | bwd_inner_microstep: 246.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:34,923] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 195.67 | bwd_microstep: 246.25 | bwd_inner_microstep: 246.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:35,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.73 | optimizer_step: 3.45 +[2024-12-31 18:54:35,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.11 | bwd_microstep: 255.23 | bwd_inner_microstep: 241.58 | bwd_allreduce_microstep: 13.56 | step_microstep: 11.98 +[2024-12-31 18:54:35,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2858.93 | bwd: 4093.19 | bwd_inner: 4078.50 | bwd_allreduce: 13.95 | step: 14.98 + 97%|█████████▋| 736/759 [1:44:30<02:48, 7.34s/it] {'loss': 1.2593, 'learning_rate': 4.815273327803183e-08, 'epoch': 0.97} + 97%|█████████▋| 736/759 [1:44:30<02:48, 7.34s/it][2024-12-31 18:54:35,951] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 214.42 | bwd_microstep: 338.23 | bwd_inner_microstep: 337.87 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:54:36,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.13 | bwd_microstep: 287.02 | bwd_inner_microstep: 286.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:36,919] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.52 | bwd_microstep: 272.06 | bwd_inner_microstep: 272.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:37,370] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.81 | bwd_microstep: 255.60 | bwd_inner_microstep: 255.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:54:37,810] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.43 | bwd_microstep: 250.90 | bwd_inner_microstep: 250.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.07 +[2024-12-31 18:54:38,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.17 | bwd_microstep: 255.01 | bwd_inner_microstep: 254.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:54:38,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.38 | bwd_microstep: 249.96 | bwd_inner_microstep: 249.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:54:39,137] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.90 | bwd_microstep: 245.66 | bwd_inner_microstep: 245.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:39,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.74 | bwd_microstep: 243.72 | bwd_inner_microstep: 243.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:54:39,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.02 | bwd_microstep: 241.37 | bwd_inner_microstep: 241.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:40,425] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 244.12 | bwd_inner_microstep: 243.68 | bwd_allreduce_microstep: 0.24 | step_microstep: 0.27 +[2024-12-31 18:54:40,859] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.41 | bwd_microstep: 243.34 | bwd_inner_microstep: 243.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:54:41,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.66 | bwd_microstep: 242.85 | bwd_inner_microstep: 242.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:54:41,717] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.77 | bwd_microstep: 242.08 | bwd_inner_microstep: 242.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:42,146] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.52 | bwd_microstep: 242.51 | bwd_inner_microstep: 242.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:42,678] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.78 | optimizer_gradients: 0.57 | optimizer_step: 3.24 +[2024-12-31 18:54:42,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.70 | bwd_microstep: 308.02 | bwd_inner_microstep: 244.26 | bwd_allreduce_microstep: 63.72 | step_microstep: 40.95 +[2024-12-31 18:54:42,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2809.25 | bwd: 4162.58 | bwd_inner: 4097.62 | bwd_allreduce: 64.20 | step: 43.77 + 97%|█████████▋| 737/759 [1:44:37<02:41, 7.32s/it] {'loss': 1.2245, 'learning_rate': 4.405957574202147e-08, 'epoch': 0.97} + 97%|█████████▋| 737/759 [1:44:37<02:41, 7.32s/it][2024-12-31 18:54:43,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 218.40 | bwd_microstep: 348.25 | bwd_inner_microstep: 347.85 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:54:43,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 202.41 | bwd_microstep: 313.26 | bwd_inner_microstep: 313.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:44,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.05 | bwd_microstep: 286.51 | bwd_inner_microstep: 286.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:44,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 184.29 | bwd_microstep: 269.43 | bwd_inner_microstep: 269.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:54:45,229] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.25 | bwd_microstep: 265.49 | bwd_inner_microstep: 265.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:54:45,688] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.13 | bwd_microstep: 262.68 | bwd_inner_microstep: 262.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:46,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.26 | bwd_microstep: 267.37 | bwd_inner_microstep: 267.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:54:46,586] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.61 | bwd_microstep: 248.29 | bwd_inner_microstep: 248.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:47,022] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.88 | bwd_microstep: 247.36 | bwd_inner_microstep: 247.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:47,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.42 | bwd_microstep: 245.93 | bwd_inner_microstep: 245.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:47,895] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 244.58 | bwd_inner_microstep: 244.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:48,329] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 245.33 | bwd_inner_microstep: 245.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:48,761] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.91 | bwd_microstep: 243.87 | bwd_inner_microstep: 243.84 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:49,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.98 | bwd_microstep: 247.66 | bwd_inner_microstep: 247.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:54:49,627] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.98 | bwd_microstep: 249.58 | bwd_inner_microstep: 249.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:50,105] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.98 | optimizer_gradients: 0.58 | optimizer_step: 3.68 +[2024-12-31 18:54:50,106] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.22 | bwd_microstep: 282.19 | bwd_inner_microstep: 243.69 | bwd_allreduce_microstep: 38.45 | step_microstep: 11.16 +[2024-12-31 18:54:50,107] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2867.97 | bwd: 4267.93 | bwd_inner: 4228.55 | bwd_allreduce: 38.70 | step: 14.18 + 97%|█████████▋| 738/759 [1:44:45<02:34, 7.35s/it] {'loss': 1.1899, 'learning_rate': 4.014781338751106e-08, 'epoch': 0.97} + 97%|█████████▋| 738/759 [1:44:45<02:34, 7.35s/it][2024-12-31 18:54:50,637] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.80 | bwd_microstep: 311.53 | bwd_inner_microstep: 311.16 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.19 +[2024-12-31 18:54:51,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 277.83 | bwd_microstep: 384.76 | bwd_inner_microstep: 384.73 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:51,794] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.19 | bwd_microstep: 275.39 | bwd_inner_microstep: 275.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:52,253] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.42 | bwd_microstep: 262.49 | bwd_inner_microstep: 262.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:52,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.74 | bwd_microstep: 255.20 | bwd_inner_microstep: 255.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:53,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.29 | bwd_microstep: 255.59 | bwd_inner_microstep: 255.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:53,627] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 208.57 | bwd_microstep: 247.80 | bwd_inner_microstep: 247.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:54,067] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.38 | bwd_microstep: 249.68 | bwd_inner_microstep: 249.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:54,502] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.11 | bwd_microstep: 245.93 | bwd_inner_microstep: 245.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:54:54,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.57 | bwd_microstep: 249.74 | bwd_inner_microstep: 249.72 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:54:55,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.76 | bwd_microstep: 244.96 | bwd_inner_microstep: 244.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:54:55,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.81 | bwd_microstep: 243.50 | bwd_inner_microstep: 243.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:56,242] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.25 | bwd_microstep: 244.12 | bwd_inner_microstep: 244.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:54:56,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.45 | bwd_microstep: 243.92 | bwd_inner_microstep: 243.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:54:57,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.01 | bwd_microstep: 243.53 | bwd_inner_microstep: 243.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:54:57,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 6.05 | optimizer_gradients: 0.87 | optimizer_step: 3.29 +[2024-12-31 18:54:57,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.73 | bwd_microstep: 258.70 | bwd_inner_microstep: 244.99 | bwd_allreduce_microstep: 13.57 | step_microstep: 12.72 +[2024-12-31 18:54:57,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2952.34 | bwd: 4217.03 | bwd_inner: 4202.25 | bwd_allreduce: 13.95 | step: 15.81 + 97%|█████████▋| 739/759 [1:44:52<02:27, 7.39s/it] {'loss': 1.2334, 'learning_rate': 3.641751748600042e-08, 'epoch': 0.97} + 97%|█████████▋| 739/759 [1:44:52<02:27, 7.39s/it][2024-12-31 18:54:58,162] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 220.37 | bwd_microstep: 347.08 | bwd_inner_microstep: 346.73 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.23 +[2024-12-31 18:54:58,663] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.78 | bwd_microstep: 290.42 | bwd_inner_microstep: 290.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:54:59,157] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 190.15 | bwd_microstep: 286.16 | bwd_inner_microstep: 286.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:54:59,625] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 183.37 | bwd_microstep: 265.63 | bwd_inner_microstep: 265.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:55:00,102] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.10 | bwd_microstep: 255.56 | bwd_inner_microstep: 255.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:55:00,543] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.56 | bwd_microstep: 249.89 | bwd_inner_microstep: 249.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.27 +[2024-12-31 18:55:00,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.96 | bwd_microstep: 248.32 | bwd_inner_microstep: 248.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:55:01,424] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.75 | bwd_microstep: 250.65 | bwd_inner_microstep: 249.60 | bwd_allreduce_microstep: 0.33 | step_microstep: 0.65 +[2024-12-31 18:55:01,856] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.42 | bwd_microstep: 246.52 | bwd_inner_microstep: 246.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:55:02,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 249.36 | bwd_inner_microstep: 249.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:02,727] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 245.97 | bwd_inner_microstep: 245.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:55:03,160] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.24 | bwd_microstep: 244.04 | bwd_inner_microstep: 243.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:55:03,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.64 | bwd_microstep: 248.02 | bwd_inner_microstep: 247.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:55:04,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.02 | bwd_microstep: 248.10 | bwd_inner_microstep: 248.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:55:04,455] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.79 | bwd_microstep: 243.43 | bwd_inner_microstep: 243.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:05,210] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.22 | optimizer_gradients: 0.57 | optimizer_step: 3.16 +[2024-12-31 18:55:05,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.71 | bwd_microstep: 558.97 | bwd_inner_microstep: 245.10 | bwd_allreduce_microstep: 313.82 | step_microstep: 11.19 +[2024-12-31 18:55:05,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2863.61 | bwd: 4478.29 | bwd_inner: 4162.75 | bwd_allreduce: 314.38 | step: 14.56 + 97%|█████████▋| 740/759 [1:45:00<02:21, 7.46s/it] {'loss': 1.2141, 'learning_rate': 3.2868756002712997e-08, 'epoch': 0.97} + 97%|█████████▋| 740/759 [1:45:00<02:21, 7.46s/it][2024-12-31 18:55:05,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 226.37 | bwd_microstep: 364.54 | bwd_inner_microstep: 364.20 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:55:06,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.00 | bwd_microstep: 282.93 | bwd_inner_microstep: 282.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:55:06,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 184.73 | bwd_microstep: 265.31 | bwd_inner_microstep: 265.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:55:07,225] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.88 | bwd_microstep: 257.51 | bwd_inner_microstep: 257.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:55:07,674] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.36 | bwd_microstep: 260.18 | bwd_inner_microstep: 260.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:55:08,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.84 | bwd_microstep: 247.35 | bwd_inner_microstep: 247.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:55:08,547] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.50 | bwd_microstep: 247.40 | bwd_inner_microstep: 247.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:08,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.78 | bwd_microstep: 246.47 | bwd_inner_microstep: 246.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:09,416] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.70 | bwd_microstep: 246.07 | bwd_inner_microstep: 246.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:55:09,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.22 | bwd_microstep: 247.08 | bwd_inner_microstep: 247.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:55:10,285] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.47 | bwd_microstep: 244.84 | bwd_inner_microstep: 244.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:55:10,715] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.28 | bwd_microstep: 243.44 | bwd_inner_microstep: 243.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:11,147] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.10 | bwd_microstep: 243.20 | bwd_inner_microstep: 243.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:55:11,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.90 | bwd_microstep: 243.93 | bwd_inner_microstep: 243.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:12,007] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.75 | bwd_microstep: 243.08 | bwd_inner_microstep: 243.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:12,459] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.61 | optimizer_gradients: 1.06 | optimizer_step: 3.43 +[2024-12-31 18:55:12,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.04 | bwd_microstep: 256.57 | bwd_inner_microstep: 242.58 | bwd_allreduce_microstep: 13.89 | step_microstep: 12.30 +[2024-12-31 18:55:12,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2820.31 | bwd: 4139.99 | bwd_inner: 4125.24 | bwd_allreduce: 14.16 | step: 15.28 + 98%|█████████▊| 741/759 [1:45:07<02:13, 7.40s/it] {'loss': 1.2478, 'learning_rate': 2.950159359535132e-08, 'epoch': 0.98} + 98%|█████████▊| 741/759 [1:45:07<02:13, 7.40s/it][2024-12-31 18:55:13,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 227.57 | bwd_microstep: 376.25 | bwd_inner_microstep: 375.88 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.20 +[2024-12-31 18:55:13,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 224.40 | bwd_microstep: 348.60 | bwd_inner_microstep: 348.58 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:14,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.10 | bwd_microstep: 263.19 | bwd_inner_microstep: 263.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:14,587] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.68 | bwd_microstep: 262.25 | bwd_inner_microstep: 262.22 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:55:15,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.08 | bwd_microstep: 247.95 | bwd_inner_microstep: 247.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:15,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.35 | bwd_microstep: 246.62 | bwd_inner_microstep: 246.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:55:15,898] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.27 | bwd_microstep: 248.93 | bwd_inner_microstep: 248.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:55:16,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.12 | bwd_microstep: 247.68 | bwd_inner_microstep: 247.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:16,766] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.77 | bwd_microstep: 244.26 | bwd_inner_microstep: 244.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:55:17,207] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.20 | bwd_microstep: 253.05 | bwd_inner_microstep: 253.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:55:17,635] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.50 | bwd_microstep: 243.24 | bwd_inner_microstep: 243.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:55:18,067] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.89 | bwd_microstep: 244.51 | bwd_inner_microstep: 244.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:55:18,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.53 | bwd_microstep: 243.78 | bwd_inner_microstep: 243.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:55:18,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.48 | bwd_microstep: 247.60 | bwd_inner_microstep: 247.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:19,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.71 | bwd_microstep: 241.62 | bwd_inner_microstep: 241.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:55:19,842] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.59 | optimizer_gradients: 0.87 | optimizer_step: 3.22 +[2024-12-31 18:55:19,842] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 286.09 | bwd_inner_microstep: 244.34 | bwd_allreduce_microstep: 41.70 | step_microstep: 10.73 +[2024-12-31 18:55:19,843] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2847.92 | bwd: 4245.68 | bwd_inner: 4203.19 | bwd_allreduce: 41.96 | step: 13.51 + 98%|█████████▊| 742/759 [1:45:14<02:05, 7.39s/it] {'loss': 1.2436, 'learning_rate': 2.6316091612920146e-08, 'epoch': 0.98} + 98%|█████████▊| 742/759 [1:45:14<02:05, 7.39s/it][2024-12-31 18:55:20,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 230.64 | bwd_microstep: 369.60 | bwd_inner_microstep: 369.24 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:55:21,051] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 221.89 | bwd_microstep: 351.24 | bwd_inner_microstep: 351.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:55:21,510] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.86 | bwd_microstep: 263.01 | bwd_inner_microstep: 262.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:55:21,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.07 | bwd_microstep: 261.51 | bwd_inner_microstep: 261.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:55:22,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.63 | bwd_microstep: 255.54 | bwd_inner_microstep: 255.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:55:22,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.44 | bwd_microstep: 249.36 | bwd_inner_microstep: 249.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:55:23,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.44 | bwd_microstep: 245.51 | bwd_inner_microstep: 245.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:55:23,724] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.97 | bwd_microstep: 246.71 | bwd_inner_microstep: 246.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:55:24,157] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.12 | bwd_microstep: 246.64 | bwd_inner_microstep: 246.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:24,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.22 | bwd_microstep: 244.89 | bwd_inner_microstep: 244.71 | bwd_allreduce_microstep: 0.02 | step_microstep: 0.28 +[2024-12-31 18:55:25,022] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.05 | bwd_microstep: 244.81 | bwd_inner_microstep: 244.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:55:25,459] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.43 | bwd_microstep: 248.74 | bwd_inner_microstep: 248.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:55:25,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.26 | bwd_microstep: 244.12 | bwd_inner_microstep: 244.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:26,318] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.80 | bwd_microstep: 241.80 | bwd_inner_microstep: 241.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:26,744] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.77 | bwd_microstep: 241.28 | bwd_inner_microstep: 241.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:27,210] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.59 | optimizer_gradients: 0.64 | optimizer_step: 3.37 +[2024-12-31 18:55:27,210] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.00 | bwd_microstep: 270.04 | bwd_inner_microstep: 256.28 | bwd_allreduce_microstep: 13.64 | step_microstep: 11.28 +[2024-12-31 18:55:27,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2849.01 | bwd: 4225.08 | bwd_inner: 4210.12 | bwd_allreduce: 14.03 | step: 14.45 + 98%|█████████▊| 743/759 [1:45:22<01:58, 7.39s/it] {'loss': 1.2141, 'learning_rate': 2.3312308094607382e-08, 'epoch': 0.98} + 98%|█████████▊| 743/759 [1:45:22<01:58, 7.39s/it][2024-12-31 18:55:28,028] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 290.54 | bwd_microstep: 506.31 | bwd_inner_microstep: 505.97 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:55:28,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.10 | bwd_microstep: 283.67 | bwd_inner_microstep: 283.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:28,984] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.99 | bwd_microstep: 267.32 | bwd_inner_microstep: 267.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:29,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.92 | bwd_microstep: 287.39 | bwd_inner_microstep: 287.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:29,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.32 | bwd_microstep: 257.60 | bwd_inner_microstep: 257.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:30,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.82 | bwd_microstep: 248.02 | bwd_inner_microstep: 247.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:55:30,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.17 | bwd_microstep: 248.54 | bwd_inner_microstep: 248.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:55:31,246] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.48 | bwd_microstep: 253.80 | bwd_inner_microstep: 253.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:31,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.40 | bwd_microstep: 245.88 | bwd_inner_microstep: 245.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:55:32,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.97 | bwd_microstep: 244.93 | bwd_inner_microstep: 244.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:55:32,546] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.43 | bwd_microstep: 244.79 | bwd_inner_microstep: 244.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:33,022] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.69 | bwd_microstep: 287.70 | bwd_inner_microstep: 287.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:33,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.76 | bwd_microstep: 245.00 | bwd_inner_microstep: 244.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:55:33,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.58 | bwd_microstep: 243.56 | bwd_inner_microstep: 243.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:34,319] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.33 | bwd_microstep: 242.52 | bwd_inner_microstep: 242.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:34,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.26 | optimizer_gradients: 0.81 | optimizer_step: 3.16 +[2024-12-31 18:55:34,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.61 | bwd_microstep: 289.84 | bwd_inner_microstep: 241.61 | bwd_allreduce_microstep: 48.19 | step_microstep: 11.70 +[2024-12-31 18:55:34,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2904.51 | bwd: 4396.99 | bwd_inner: 4347.87 | bwd_allreduce: 48.46 | step: 14.70 + 98%|█████████▊| 744/759 [1:45:29<01:51, 7.45s/it] {'loss': 1.1924, 'learning_rate': 2.049029776873268e-08, 'epoch': 0.98} + 98%|█████████▊| 744/759 [1:45:29<01:51, 7.45s/it][2024-12-31 18:55:35,529] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 364.23 | bwd_microstep: 343.30 | bwd_inner_microstep: 342.92 | bwd_allreduce_microstep: 0.16 | step_microstep: 0.25 +[2024-12-31 18:55:36,024] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.02 | bwd_microstep: 286.52 | bwd_inner_microstep: 286.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:55:36,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.39 | bwd_microstep: 291.78 | bwd_inner_microstep: 291.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:55:36,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.57 | bwd_microstep: 262.31 | bwd_inner_microstep: 262.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:55:37,440] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.76 | bwd_microstep: 265.80 | bwd_inner_microstep: 265.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.13 +[2024-12-31 18:55:37,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 249.14 | bwd_inner_microstep: 249.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:55:38,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.01 | bwd_microstep: 251.59 | bwd_inner_microstep: 251.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:55:38,749] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.65 | bwd_microstep: 245.51 | bwd_inner_microstep: 245.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:55:39,182] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.59 | bwd_microstep: 245.05 | bwd_inner_microstep: 245.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:55:39,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.87 | bwd_microstep: 244.57 | bwd_inner_microstep: 244.54 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:40,053] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.27 | bwd_microstep: 249.66 | bwd_inner_microstep: 249.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:40,483] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.66 | bwd_microstep: 243.03 | bwd_inner_microstep: 243.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:40,919] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.53 | bwd_microstep: 245.55 | bwd_inner_microstep: 245.32 | bwd_allreduce_microstep: 0.12 | step_microstep: 0.20 +[2024-12-31 18:55:41,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.18 | bwd_microstep: 243.45 | bwd_inner_microstep: 243.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:41,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.62 | bwd_microstep: 253.19 | bwd_inner_microstep: 253.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:55:42,247] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.67 | optimizer_gradients: 0.66 | optimizer_step: 3.41 +[2024-12-31 18:55:42,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.29 | bwd_microstep: 255.52 | bwd_inner_microstep: 241.89 | bwd_allreduce_microstep: 13.53 | step_microstep: 11.93 +[2024-12-31 18:55:42,251] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2981.59 | bwd: 4176.11 | bwd_inner: 4161.44 | bwd_allreduce: 13.95 | step: 14.86 + 98%|█████████▊| 745/759 [1:45:37<01:44, 7.45s/it] {'loss': 1.2596, 'learning_rate': 1.7850112051738255e-08, 'epoch': 0.98} + 98%|█████████▊| 745/759 [1:45:37<01:44, 7.45s/it][2024-12-31 18:55:42,859] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 225.47 | bwd_microstep: 363.14 | bwd_inner_microstep: 362.77 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:55:43,370] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.21 | bwd_microstep: 296.48 | bwd_inner_microstep: 296.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:43,840] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.56 | bwd_microstep: 269.76 | bwd_inner_microstep: 269.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:44,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.14 | bwd_microstep: 267.53 | bwd_inner_microstep: 267.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:44,796] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 220.06 | bwd_microstep: 250.52 | bwd_inner_microstep: 250.50 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:45,240] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.50 | bwd_microstep: 250.82 | bwd_inner_microstep: 250.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:55:45,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.28 | bwd_microstep: 247.92 | bwd_inner_microstep: 247.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:55:46,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.46 | bwd_microstep: 251.00 | bwd_inner_microstep: 250.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:55:46,553] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.03 | bwd_microstep: 247.54 | bwd_inner_microstep: 247.51 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:55:46,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.29 | bwd_microstep: 246.36 | bwd_inner_microstep: 246.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:47,421] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.99 | bwd_microstep: 244.67 | bwd_inner_microstep: 244.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:55:47,855] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.50 | bwd_microstep: 246.26 | bwd_inner_microstep: 246.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:48,285] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 243.82 | bwd_inner_microstep: 243.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:48,714] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.67 | bwd_microstep: 242.72 | bwd_inner_microstep: 242.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:55:49,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.36 | bwd_microstep: 241.26 | bwd_inner_microstep: 241.23 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.04 +[2024-12-31 18:55:49,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.90 | optimizer_gradients: 0.60 | optimizer_step: 3.22 +[2024-12-31 18:55:49,610] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.02 | bwd_microstep: 278.23 | bwd_inner_microstep: 247.46 | bwd_allreduce_microstep: 30.71 | step_microstep: 10.83 +[2024-12-31 18:55:49,611] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2881.73 | bwd: 4188.13 | bwd_inner: 4156.59 | bwd_allreduce: 30.97 | step: 13.67 + 98%|█████████▊| 746/759 [1:45:44<01:36, 7.42s/it] {'loss': 1.1749, 'learning_rate': 1.5391799047266287e-08, 'epoch': 0.98} + 98%|█████████▊| 746/759 [1:45:44<01:36, 7.42s/it][2024-12-31 18:55:50,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.14 | bwd_microstep: 335.06 | bwd_inner_microstep: 334.71 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:55:50,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.48 | bwd_microstep: 266.64 | bwd_inner_microstep: 266.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:55:51,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.17 | bwd_microstep: 263.11 | bwd_inner_microstep: 263.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:51,551] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.99 | bwd_microstep: 256.94 | bwd_inner_microstep: 256.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:55:51,991] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.93 | bwd_microstep: 249.18 | bwd_inner_microstep: 249.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:55:52,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.56 | bwd_microstep: 250.50 | bwd_inner_microstep: 250.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:55:52,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.82 | bwd_microstep: 247.13 | bwd_inner_microstep: 247.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:53,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.54 | bwd_microstep: 244.94 | bwd_inner_microstep: 244.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:55:53,771] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.66 | bwd_microstep: 278.89 | bwd_inner_microstep: 278.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:55:54,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.64 | bwd_microstep: 244.71 | bwd_inner_microstep: 244.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:55:54,638] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.22 | bwd_microstep: 244.89 | bwd_inner_microstep: 244.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:55:55,071] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.56 | bwd_microstep: 244.09 | bwd_inner_microstep: 244.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:55,502] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.95 | bwd_microstep: 243.79 | bwd_inner_microstep: 243.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:55,931] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.06 | bwd_microstep: 243.52 | bwd_inner_microstep: 243.49 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:55:56,362] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.51 | bwd_microstep: 243.06 | bwd_inner_microstep: 243.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:55:56,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.44 | optimizer_gradients: 0.79 | optimizer_step: 3.13 +[2024-12-31 18:55:56,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.45 | bwd_microstep: 309.04 | bwd_inner_microstep: 244.06 | bwd_allreduce_microstep: 64.94 | step_microstep: 12.46 +[2024-12-31 18:55:56,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2795.13 | bwd: 4165.63 | bwd_inner: 4099.82 | bwd_allreduce: 65.18 | step: 15.54 + 98%|█████████▊| 747/759 [1:45:51<01:28, 7.37s/it] {'loss': 1.2327, 'learning_rate': 1.3115403545270744e-08, 'epoch': 0.98} + 98%|█████████▊| 747/759 [1:45:51<01:28, 7.37s/it][2024-12-31 18:55:57,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 215.71 | bwd_microstep: 341.75 | bwd_inner_microstep: 341.40 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.19 +[2024-12-31 18:55:58,054] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 225.18 | bwd_microstep: 360.85 | bwd_inner_microstep: 360.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:55:58,543] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.48 | bwd_microstep: 282.35 | bwd_inner_microstep: 282.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:55:59,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.93 | bwd_microstep: 262.33 | bwd_inner_microstep: 262.31 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:55:59,455] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.43 | bwd_microstep: 256.81 | bwd_inner_microstep: 256.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:55:59,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.37 | bwd_microstep: 246.08 | bwd_inner_microstep: 246.06 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:00,322] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.11 | bwd_microstep: 244.01 | bwd_inner_microstep: 243.98 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:56:00,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.73 | bwd_microstep: 247.24 | bwd_inner_microstep: 247.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:56:01,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.72 | bwd_microstep: 249.73 | bwd_inner_microstep: 249.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:56:01,635] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.51 | bwd_microstep: 246.77 | bwd_inner_microstep: 246.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:02,067] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.41 | bwd_microstep: 243.80 | bwd_inner_microstep: 243.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:56:02,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.68 | bwd_microstep: 243.78 | bwd_inner_microstep: 243.75 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:02,927] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.11 | bwd_microstep: 243.71 | bwd_inner_microstep: 243.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:56:03,375] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.36 | bwd_microstep: 253.69 | bwd_inner_microstep: 253.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:56:03,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.85 | bwd_microstep: 241.34 | bwd_inner_microstep: 241.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:04,255] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.56 | optimizer_gradients: 0.67 | optimizer_step: 3.41 +[2024-12-31 18:56:04,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.40 | bwd_microstep: 256.86 | bwd_inner_microstep: 243.10 | bwd_allreduce_microstep: 13.63 | step_microstep: 11.09 +[2024-12-31 18:56:04,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2868.39 | bwd: 4221.19 | bwd_inner: 4206.68 | bwd_allreduce: 13.92 | step: 14.20 + 99%|█████████▊| 748/759 [1:45:59<01:21, 7.38s/it] {'loss': 1.2044, 'learning_rate': 1.1020967021210249e-08, 'epoch': 0.99} + 99%|█████████▊| 748/759 [1:45:59<01:21, 7.38s/it][2024-12-31 18:56:04,790] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.13 | bwd_microstep: 311.75 | bwd_inner_microstep: 311.41 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.21 +[2024-12-31 18:56:05,300] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 196.42 | bwd_microstep: 296.48 | bwd_inner_microstep: 296.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:56:05,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.84 | bwd_microstep: 283.19 | bwd_inner_microstep: 283.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:56:06,239] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.13 | bwd_microstep: 255.32 | bwd_inner_microstep: 255.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.10 +[2024-12-31 18:56:06,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.71 | bwd_microstep: 261.10 | bwd_inner_microstep: 261.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:07,139] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.01 | bwd_microstep: 255.68 | bwd_inner_microstep: 255.65 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 18:56:07,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.91 | bwd_microstep: 248.94 | bwd_inner_microstep: 248.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.08 +[2024-12-31 18:56:08,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.98 | bwd_microstep: 243.67 | bwd_inner_microstep: 243.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:56:08,438] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.48 | bwd_microstep: 244.32 | bwd_inner_microstep: 244.30 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:56:08,888] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 189.34 | bwd_microstep: 243.59 | bwd_inner_microstep: 243.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:56:09,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.89 | bwd_microstep: 243.18 | bwd_inner_microstep: 243.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:56:09,768] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.23 | bwd_microstep: 259.83 | bwd_inner_microstep: 259.80 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:56:10,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.02 | bwd_microstep: 243.97 | bwd_inner_microstep: 243.94 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:56:10,636] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.56 | bwd_microstep: 252.17 | bwd_inner_microstep: 252.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:56:11,064] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.47 | bwd_microstep: 241.38 | bwd_inner_microstep: 241.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:56:11,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.90 | optimizer_gradients: 0.58 | optimizer_step: 3.10 +[2024-12-31 18:56:11,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.00 | bwd_microstep: 538.56 | bwd_inner_microstep: 272.65 | bwd_allreduce_microstep: 265.86 | step_microstep: 16.40 +[2024-12-31 18:56:11,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2828.55 | bwd: 4423.22 | bwd_inner: 4156.57 | bwd_allreduce: 266.10 | step: 19.17 + 99%|█████████▊| 749/759 [1:46:06<01:14, 7.43s/it] {'loss': 1.2405, 'learning_rate': 9.108527635284248e-09, 'epoch': 0.99} + 99%|█████████▊| 749/759 [1:46:06<01:14, 7.43s/it][2024-12-31 18:56:12,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 204.41 | bwd_microstep: 314.77 | bwd_inner_microstep: 314.41 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.19 +[2024-12-31 18:56:12,941] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 227.04 | bwd_microstep: 358.00 | bwd_inner_microstep: 357.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:56:13,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.13 | bwd_microstep: 279.74 | bwd_inner_microstep: 279.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:56:13,918] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.95 | bwd_microstep: 277.98 | bwd_inner_microstep: 277.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:56:14,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.64 | bwd_microstep: 261.69 | bwd_inner_microstep: 261.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.28 +[2024-12-31 18:56:14,833] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.92 | bwd_microstep: 248.84 | bwd_inner_microstep: 248.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:15,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.37 | bwd_microstep: 250.32 | bwd_inner_microstep: 250.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:15,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.71 | bwd_microstep: 249.06 | bwd_inner_microstep: 249.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:56:16,158] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.22 | bwd_microstep: 256.92 | bwd_inner_microstep: 256.89 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.25 +[2024-12-31 18:56:16,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.13 | bwd_microstep: 246.18 | bwd_inner_microstep: 246.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:56:17,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.03 | bwd_microstep: 249.62 | bwd_inner_microstep: 249.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:56:17,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 182.60 | bwd_microstep: 243.64 | bwd_inner_microstep: 243.61 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:56:17,912] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.98 | bwd_microstep: 244.21 | bwd_inner_microstep: 244.19 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:56:18,646] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 264.22 | bwd_microstep: 452.49 | bwd_inner_microstep: 452.16 | bwd_allreduce_microstep: 0.22 | step_microstep: 0.27 +[2024-12-31 18:56:19,102] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 194.00 | bwd_microstep: 241.66 | bwd_inner_microstep: 241.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:19,564] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.56 | optimizer_gradients: 0.81 | optimizer_step: 4.62 +[2024-12-31 18:56:19,564] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.74 | bwd_microstep: 259.79 | bwd_inner_microstep: 245.45 | bwd_allreduce_microstep: 14.01 | step_microstep: 16.77 +[2024-12-31 18:56:19,566] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3012.49 | bwd: 4435.34 | bwd_inner: 4419.39 | bwd_allreduce: 14.69 | step: 19.89 + 99%|█████████▉| 750/759 [1:46:14<01:07, 7.53s/it] {'loss': 1.1996, 'learning_rate': 7.378120231745778e-09, 'epoch': 0.99} + 99%|█████████▉| 750/759 [1:46:14<01:07, 7.53s/it][2024-12-31 18:56:20,209] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 239.72 | bwd_microstep: 390.91 | bwd_inner_microstep: 390.56 | bwd_allreduce_microstep: 0.14 | step_microstep: 0.22 +[2024-12-31 18:56:20,697] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.82 | bwd_microstep: 280.39 | bwd_inner_microstep: 280.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:56:21,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 188.68 | bwd_microstep: 281.66 | bwd_inner_microstep: 281.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:56:21,646] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.05 | bwd_microstep: 264.10 | bwd_inner_microstep: 264.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:56:22,096] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.75 | bwd_microstep: 255.30 | bwd_inner_microstep: 255.27 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:56:22,531] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.03 | bwd_microstep: 246.85 | bwd_inner_microstep: 246.83 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:22,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.20 | bwd_microstep: 244.88 | bwd_inner_microstep: 244.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:56:23,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.53 | bwd_microstep: 245.67 | bwd_inner_microstep: 245.64 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:56:23,835] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.68 | bwd_microstep: 244.45 | bwd_inner_microstep: 244.43 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.24 +[2024-12-31 18:56:24,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.48 | bwd_microstep: 244.62 | bwd_inner_microstep: 244.59 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:24,703] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.25 | bwd_microstep: 245.80 | bwd_inner_microstep: 245.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:56:25,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.85 | bwd_microstep: 250.50 | bwd_inner_microstep: 250.47 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:25,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.79 | bwd_microstep: 243.36 | bwd_inner_microstep: 243.33 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:56:26,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.03 | bwd_microstep: 243.42 | bwd_inner_microstep: 243.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:26,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.58 | bwd_microstep: 242.54 | bwd_inner_microstep: 242.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:26,887] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.90 | optimizer_step: 3.48 +[2024-12-31 18:56:26,888] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.11 | bwd_microstep: 255.25 | bwd_inner_microstep: 241.39 | bwd_allreduce_microstep: 13.77 | step_microstep: 12.04 +[2024-12-31 18:56:26,890] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2845.96 | bwd: 4179.78 | bwd_inner: 4165.16 | bwd_allreduce: 14.04 | step: 15.29 + 99%|█████████▉| 751/759 [1:46:21<00:59, 7.47s/it] {'loss': 1.2085, 'learning_rate': 5.8297763382597625e-09, 'epoch': 0.99} + 99%|█████████▉| 751/759 [1:46:21<00:59, 7.47s/it][2024-12-31 18:56:27,482] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 222.50 | bwd_microstep: 354.14 | bwd_inner_microstep: 353.80 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:56:27,995] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.44 | bwd_microstep: 298.94 | bwd_inner_microstep: 298.91 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:56:28,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.14 | bwd_microstep: 262.66 | bwd_inner_microstep: 262.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:28,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.84 | bwd_microstep: 281.29 | bwd_inner_microstep: 281.26 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:56:29,390] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.33 | bwd_microstep: 256.98 | bwd_inner_microstep: 256.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:56:29,839] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.76 | bwd_microstep: 255.07 | bwd_inner_microstep: 255.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:56:30,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.62 | bwd_microstep: 267.16 | bwd_inner_microstep: 267.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:56:30,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.21 | bwd_microstep: 246.18 | bwd_inner_microstep: 246.15 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:56:31,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.82 | bwd_microstep: 246.60 | bwd_inner_microstep: 246.57 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:31,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.92 | bwd_microstep: 246.27 | bwd_inner_microstep: 246.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:56:32,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.45 | bwd_microstep: 243.02 | bwd_inner_microstep: 242.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:56:32,468] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.92 | bwd_microstep: 243.10 | bwd_inner_microstep: 243.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:56:32,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.58 | bwd_microstep: 247.40 | bwd_inner_microstep: 247.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:56:33,341] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.66 | bwd_microstep: 250.49 | bwd_inner_microstep: 250.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:56:33,768] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.35 | bwd_microstep: 241.51 | bwd_inner_microstep: 241.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:56:34,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.83 | optimizer_gradients: 0.64 | optimizer_step: 3.34 +[2024-12-31 18:56:34,229] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.31 | bwd_microstep: 256.22 | bwd_inner_microstep: 242.52 | bwd_allreduce_microstep: 13.57 | step_microstep: 10.96 +[2024-12-31 18:56:34,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2857.26 | bwd: 4197.12 | bwd_inner: 4182.67 | bwd_allreduce: 13.86 | step: 14.02 + 99%|█████████▉| 752/759 [1:46:29<00:52, 7.43s/it] {'loss': 1.2112, 'learning_rate': 4.463524165333466e-09, 'epoch': 0.99} + 99%|█████████▉| 752/759 [1:46:29<00:52, 7.43s/it][2024-12-31 18:56:34,809] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 218.77 | bwd_microstep: 344.47 | bwd_inner_microstep: 344.13 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:56:35,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 251.72 | bwd_microstep: 419.22 | bwd_inner_microstep: 419.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:56:35,963] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.70 | bwd_microstep: 266.98 | bwd_inner_microstep: 266.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:56:36,415] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.53 | bwd_microstep: 257.48 | bwd_inner_microstep: 257.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:36,859] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.29 | bwd_microstep: 248.00 | bwd_inner_microstep: 247.97 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:56:37,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.94 | bwd_microstep: 254.35 | bwd_inner_microstep: 254.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:37,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.52 | bwd_microstep: 248.63 | bwd_inner_microstep: 248.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:56:38,176] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.23 | bwd_microstep: 246.38 | bwd_inner_microstep: 246.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:56:38,613] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 174.53 | bwd_microstep: 244.70 | bwd_inner_microstep: 244.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:39,050] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.33 | bwd_microstep: 246.31 | bwd_inner_microstep: 246.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:56:39,484] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.87 | bwd_microstep: 244.10 | bwd_inner_microstep: 244.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:39,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.22 | bwd_microstep: 244.81 | bwd_inner_microstep: 244.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:56:40,342] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.22 | bwd_microstep: 240.98 | bwd_inner_microstep: 240.95 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:56:40,770] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.26 | bwd_microstep: 242.74 | bwd_inner_microstep: 242.71 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:41,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.15 | bwd_microstep: 243.05 | bwd_inner_microstep: 243.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:42,089] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.57 | optimizer_gradients: 0.63 | optimizer_step: 6.13 +[2024-12-31 18:56:42,090] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.00 | bwd_microstep: 687.02 | bwd_inner_microstep: 241.67 | bwd_allreduce_microstep: 445.31 | step_microstep: 14.63 +[2024-12-31 18:56:42,091] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2884.70 | bwd: 4679.25 | bwd_inner: 4233.17 | bwd_allreduce: 445.56 | step: 17.61 + 99%|█████████▉| 753/759 [1:46:37<00:45, 7.56s/it] {'loss': 1.2329, 'learning_rate': 3.2793886057991277e-09, 'epoch': 0.99} + 99%|█████████▉| 753/759 [1:46:37<00:45, 7.56s/it][2024-12-31 18:56:42,667] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 213.78 | bwd_microstep: 340.90 | bwd_inner_microstep: 340.56 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.19 +[2024-12-31 18:56:43,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.66 | bwd_microstep: 290.41 | bwd_inner_microstep: 290.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:56:43,641] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 191.36 | bwd_microstep: 267.14 | bwd_inner_microstep: 267.11 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:56:44,127] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.38 | bwd_microstep: 282.20 | bwd_inner_microstep: 282.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +local variable 'images' referenced before assignment nuscene_cap_194k +Traceback (most recent call last): + File "/fusion-algo-lidar-secret-nas/interns/zzc/project/InternVL25/internvl_chat/internvl/train/nuscene_dataset.py", line 212, in multi_modal_multi_image_get_item + image = Image.open(new_cam_path[key]).convert('RGB') + File "/fusion-algo-lidar-secret-nas/interns/zzc/myenv/internvl25_crh/lib/python3.9/site-packages/PIL/Image.py", line 3466, in open + filename = os.path.realpath(os.fspath(fp)) + File "/fusion-algo-lidar-secret-nas/interns/zzc/myenv/internvl25_crh/lib/python3.9/posixpath.py", line 393, in realpath + return abspath(path) + File "/fusion-algo-lidar-secret-nas/interns/zzc/myenv/internvl25_crh/lib/python3.9/posixpath.py", line 380, in abspath + cwd = os.getcwd() +FileNotFoundError: [Errno 2] No such file or directory + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/fusion-algo-lidar-secret-nas/interns/zzc/project/InternVL25/internvl_chat/internvl/train/nuscene_dataset.py", line 272, in __getitem__ + ret = self.multi_modal_multi_image_get_item(data_item) + File "/fusion-algo-lidar-secret-nas/interns/zzc/project/InternVL25/internvl_chat/internvl/train/nuscene_dataset.py", line 214, in multi_modal_multi_image_get_item + print(f'Failed to load image: {images}, the dataset is: {self.ds_name}') +UnboundLocalError: local variable 'images' referenced before assignment +[2024-12-31 18:56:44,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.73 | bwd_microstep: 255.27 | bwd_inner_microstep: 255.24 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:56:45,028] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.08 | bwd_microstep: 255.84 | bwd_inner_microstep: 255.81 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:45,465] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.13 | bwd_microstep: 248.49 | bwd_inner_microstep: 248.46 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.10 +[2024-12-31 18:56:45,904] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.93 | bwd_microstep: 249.27 | bwd_inner_microstep: 249.25 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:46,339] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.17 | bwd_microstep: 246.81 | bwd_inner_microstep: 246.78 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:46,772] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.00 | bwd_microstep: 245.02 | bwd_inner_microstep: 244.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:56:47,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.83 | bwd_microstep: 250.19 | bwd_inner_microstep: 250.16 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:47,643] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.62 | bwd_microstep: 245.01 | bwd_inner_microstep: 244.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:48,074] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.74 | bwd_microstep: 243.43 | bwd_inner_microstep: 243.40 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:48,558] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.42 | bwd_microstep: 297.47 | bwd_inner_microstep: 297.44 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:56:48,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.57 | bwd_microstep: 240.77 | bwd_inner_microstep: 240.74 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:56:49,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.52 | optimizer_gradients: 0.81 | optimizer_step: 3.17 +[2024-12-31 18:56:49,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.11 | bwd_microstep: 547.15 | bwd_inner_microstep: 243.28 | bwd_allreduce_microstep: 303.82 | step_microstep: 17.27 +[2024-12-31 18:56:49,734] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2838.93 | bwd: 4505.40 | bwd_inner: 4200.82 | bwd_allreduce: 304.06 | step: 20.20 + 99%|█████████▉| 754/759 [1:46:44<00:37, 7.58s/it] {'loss': 1.1981, 'learning_rate': 2.277391234363213e-09, 'epoch': 0.99} + 99%|█████████▉| 754/759 [1:46:44<00:37, 7.58s/it][2024-12-31 18:56:50,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 216.26 | bwd_microstep: 344.04 | bwd_inner_microstep: 343.68 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.19 +[2024-12-31 18:56:50,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 196.11 | bwd_microstep: 286.31 | bwd_inner_microstep: 286.28 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:56:51,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.92 | bwd_microstep: 260.95 | bwd_inner_microstep: 260.92 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:56:51,716] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.52 | bwd_microstep: 255.84 | bwd_inner_microstep: 255.82 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:56:52,164] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 175.88 | bwd_microstep: 254.89 | bwd_inner_microstep: 254.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:56:52,600] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.43 | bwd_microstep: 247.69 | bwd_inner_microstep: 247.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:53,035] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.17 | bwd_microstep: 247.17 | bwd_inner_microstep: 247.10 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.23 +[2024-12-31 18:56:53,468] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.43 | bwd_microstep: 243.50 | bwd_inner_microstep: 243.48 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:56:53,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.82 | bwd_microstep: 243.03 | bwd_inner_microstep: 243.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:56:54,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.03 | bwd_microstep: 244.55 | bwd_inner_microstep: 244.53 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.26 +[2024-12-31 18:56:54,767] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 245.07 | bwd_inner_microstep: 245.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:55,198] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.55 | bwd_microstep: 243.35 | bwd_inner_microstep: 243.32 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:55,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.26 | bwd_microstep: 243.55 | bwd_inner_microstep: 243.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:56:56,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.99 | bwd_microstep: 240.88 | bwd_inner_microstep: 240.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:56,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 179.26 | bwd_microstep: 243.20 | bwd_inner_microstep: 243.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.33 +[2024-12-31 18:56:57,071] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.29 | optimizer_gradients: 0.66 | optimizer_step: 3.09 +[2024-12-31 18:56:57,072] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.05 | bwd_microstep: 381.76 | bwd_inner_microstep: 244.08 | bwd_allreduce_microstep: 137.63 | step_microstep: 11.83 +[2024-12-31 18:56:57,073] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2825.44 | bwd: 4225.94 | bwd_inner: 4087.38 | bwd_allreduce: 137.89 | step: 15.12 + 99%|█████████▉| 755/759 [1:46:52<00:30, 7.51s/it] {'loss': 1.2015, 'learning_rate': 1.4575503072100649e-09, 'epoch': 0.99} + 99%|█████████▉| 755/759 [1:46:52<00:30, 7.51s/it][2024-12-31 18:56:57,606] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 208.24 | bwd_microstep: 311.33 | bwd_inner_microstep: 310.98 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.22 +[2024-12-31 18:56:58,118] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 197.43 | bwd_microstep: 296.88 | bwd_inner_microstep: 296.85 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:56:58,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.85 | bwd_microstep: 266.08 | bwd_inner_microstep: 266.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:56:59,029] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.47 | bwd_microstep: 256.24 | bwd_inner_microstep: 256.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:56:59,491] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.92 | bwd_microstep: 264.78 | bwd_inner_microstep: 264.76 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:56:59,953] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.75 | bwd_microstep: 264.21 | bwd_inner_microstep: 264.18 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:57:00,389] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.55 | bwd_microstep: 248.15 | bwd_inner_microstep: 248.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:57:00,835] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 180.65 | bwd_microstep: 247.05 | bwd_inner_microstep: 247.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:57:01,268] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.23 | bwd_microstep: 245.12 | bwd_inner_microstep: 245.09 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:57:01,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.29 | bwd_microstep: 244.75 | bwd_inner_microstep: 244.66 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:57:02,135] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.59 | bwd_microstep: 246.02 | bwd_inner_microstep: 245.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.06 +[2024-12-31 18:57:02,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 173.10 | bwd_microstep: 245.92 | bwd_inner_microstep: 245.73 | bwd_allreduce_microstep: 0.02 | step_microstep: 0.28 +[2024-12-31 18:57:03,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.13 | bwd_microstep: 242.72 | bwd_inner_microstep: 242.69 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:57:03,440] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.87 | bwd_microstep: 242.80 | bwd_inner_microstep: 242.77 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:57:03,867] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.94 | bwd_microstep: 240.90 | bwd_inner_microstep: 240.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:57:04,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.99 | optimizer_gradients: 0.56 | optimizer_step: 3.11 +[2024-12-31 18:57:04,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.63 | bwd_microstep: 585.91 | bwd_inner_microstep: 245.26 | bwd_allreduce_microstep: 340.60 | step_microstep: 11.33 +[2024-12-31 18:57:04,653] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2837.10 | bwd: 4449.02 | bwd_inner: 4107.32 | bwd_allreduce: 340.95 | step: 14.21 + 100%|█████████▉| 756/759 [1:46:59<00:22, 7.53s/it] {'loss': 1.2342, 'learning_rate': 8.198807616732752e-10, 'epoch': 1.0} + 100%|█████████▉| 756/759 [1:46:59<00:22, 7.53s/it][2024-12-31 18:57:05,254] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 224.84 | bwd_microstep: 357.71 | bwd_inner_microstep: 357.34 | bwd_allreduce_microstep: 0.13 | step_microstep: 0.20 +[2024-12-31 18:57:05,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 228.59 | bwd_microstep: 370.42 | bwd_inner_microstep: 370.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:57:06,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.93 | bwd_microstep: 267.40 | bwd_inner_microstep: 267.37 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:57:06,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.96 | bwd_microstep: 261.10 | bwd_inner_microstep: 261.07 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.11 +[2024-12-31 18:57:07,234] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.18 | bwd_microstep: 251.05 | bwd_inner_microstep: 251.02 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:57:07,674] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.84 | bwd_microstep: 249.08 | bwd_inner_microstep: 249.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:57:08,147] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.93 | bwd_microstep: 282.32 | bwd_inner_microstep: 282.29 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:57:08,580] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.25 | bwd_microstep: 245.62 | bwd_inner_microstep: 245.60 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.05 +[2024-12-31 18:57:09,076] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.30 | bwd_microstep: 312.08 | bwd_inner_microstep: 312.05 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:57:09,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.89 | bwd_microstep: 245.65 | bwd_inner_microstep: 245.62 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:57:09,948] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.01 | bwd_microstep: 244.36 | bwd_inner_microstep: 244.34 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:57:10,382] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.25 | bwd_microstep: 246.03 | bwd_inner_microstep: 246.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.15 +[2024-12-31 18:57:10,811] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.10 | bwd_microstep: 243.55 | bwd_inner_microstep: 243.52 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:57:11,243] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.38 | bwd_microstep: 244.58 | bwd_inner_microstep: 244.55 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:57:11,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.62 | bwd_microstep: 243.11 | bwd_inner_microstep: 243.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:57:12,127] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.56 | optimizer_gradients: 0.64 | optimizer_step: 3.36 +[2024-12-31 18:57:12,128] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.75 | bwd_microstep: 258.20 | bwd_inner_microstep: 244.57 | bwd_allreduce_microstep: 13.54 | step_microstep: 10.81 +[2024-12-31 18:57:12,130] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2861.24 | bwd: 4322.32 | bwd_inner: 4307.93 | bwd_allreduce: 13.81 | step: 13.57 + 100%|█████████▉| 757/759 [1:47:07<00:15, 7.52s/it] {'loss': 1.2115, 'learning_rate': 3.6439421595924065e-10, 'epoch': 1.0} + 100%|█████████▉| 757/759 [1:47:07<00:15, 7.52s/it][2024-12-31 18:57:12,728] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 225.43 | bwd_microstep: 357.86 | bwd_inner_microstep: 357.53 | bwd_allreduce_microstep: 0.12 | step_microstep: 0.19 +[2024-12-31 18:57:13,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 192.23 | bwd_microstep: 290.06 | bwd_inner_microstep: 290.03 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:57:13,727] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 195.73 | bwd_microstep: 287.02 | bwd_inner_microstep: 286.99 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.16 +[2024-12-31 18:57:14,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 181.54 | bwd_microstep: 270.39 | bwd_inner_microstep: 270.36 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:57:14,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 177.86 | bwd_microstep: 263.11 | bwd_inner_microstep: 263.08 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:57:15,109] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.27 | bwd_microstep: 250.04 | bwd_inner_microstep: 250.01 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:57:15,546] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.51 | bwd_microstep: 247.66 | bwd_inner_microstep: 247.63 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:57:15,990] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.46 | bwd_microstep: 254.71 | bwd_inner_microstep: 254.68 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:57:16,422] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.39 | bwd_microstep: 244.92 | bwd_inner_microstep: 244.90 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:57:16,855] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.17 | bwd_microstep: 244.44 | bwd_inner_microstep: 244.41 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:57:17,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.88 | bwd_microstep: 246.69 | bwd_inner_microstep: 246.67 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:57:17,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.15 | bwd_microstep: 243.24 | bwd_inner_microstep: 243.21 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:57:18,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.84 | bwd_microstep: 243.85 | bwd_inner_microstep: 243.65 | bwd_allreduce_microstep: 0.11 | step_microstep: 0.20 +[2024-12-31 18:57:18,581] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.97 | bwd_microstep: 241.23 | bwd_inner_microstep: 241.20 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:57:19,010] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.17 | bwd_microstep: 240.96 | bwd_inner_microstep: 240.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:57:19,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 5.49 | optimizer_gradients: 0.71 | optimizer_step: 3.17 +[2024-12-31 18:57:19,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.67 | bwd_microstep: 371.61 | bwd_inner_microstep: 242.02 | bwd_allreduce_microstep: 129.54 | step_microstep: 11.53 +[2024-12-31 18:57:19,580] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2857.63 | bwd: 4297.88 | bwd_inner: 4167.36 | bwd_allreduce: 129.92 | step: 14.59 + 100%|█████████▉| 758/759 [1:47:14<00:07, 7.49s/it] {'loss': 1.2319, 'learning_rate': 9.109896894066161e-11, 'epoch': 1.0} + 100%|█████████▉| 758/759 [1:47:14<00:07, 7.49s/it][2024-12-31 18:57:20,117] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.31 | bwd_microstep: 317.27 | bwd_inner_microstep: 316.89 | bwd_allreduce_microstep: 0.15 | step_microstep: 0.20 +[2024-12-31 18:57:20,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 203.37 | bwd_microstep: 310.96 | bwd_inner_microstep: 310.93 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.21 +[2024-12-31 18:57:21,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 187.94 | bwd_microstep: 281.02 | bwd_inner_microstep: 281.00 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:57:21,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 178.80 | bwd_microstep: 260.90 | bwd_inner_microstep: 260.87 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.22 +[2024-12-31 18:57:22,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 172.51 | bwd_microstep: 250.37 | bwd_inner_microstep: 250.35 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.19 +[2024-12-31 18:57:22,485] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 176.06 | bwd_microstep: 254.41 | bwd_inner_microstep: 254.38 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.20 +[2024-12-31 18:57:22,921] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 171.06 | bwd_microstep: 247.82 | bwd_inner_microstep: 247.79 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.17 +[2024-12-31 18:57:23,419] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 226.29 | bwd_microstep: 247.73 | bwd_inner_microstep: 247.70 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.28 +[2024-12-31 18:57:23,850] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 170.31 | bwd_microstep: 243.20 | bwd_inner_microstep: 243.17 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.18 +[2024-12-31 18:57:24,279] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.28 | bwd_microstep: 242.89 | bwd_inner_microstep: 242.86 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.11 +[2024-12-31 18:57:24,707] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.98 | bwd_microstep: 242.59 | bwd_inner_microstep: 242.56 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.14 +[2024-12-31 18:57:25,135] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.98 | bwd_microstep: 242.15 | bwd_inner_microstep: 242.13 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.12 +[2024-12-31 18:57:25,563] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 169.51 | bwd_microstep: 242.42 | bwd_inner_microstep: 242.39 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.11 +[2024-12-31 18:57:25,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.78 | bwd_microstep: 240.48 | bwd_inner_microstep: 240.45 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.09 +[2024-12-31 18:57:26,412] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 167.77 | bwd_microstep: 240.06 | bwd_inner_microstep: 240.04 | bwd_allreduce_microstep: 0.01 | step_microstep: 0.11 +[2024-12-31 18:57:27,240] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_allgather: 4.87 | optimizer_gradients: 0.65 | optimizer_step: 3.09 +[2024-12-31 18:57:27,241] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 168.76 | bwd_microstep: 634.37 | bwd_inner_microstep: 243.22 | bwd_allreduce_microstep: 391.10 | step_microstep: 10.48 +[2024-12-31 18:57:27,242] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2870.15 | bwd: 4498.68 | bwd_inner: 4106.79 | bwd_allreduce: 391.35 | step: 13.36 + 100%|██████████| 759/759 [1:47:22<00:00, 7.55s/it] {'loss': 1.2129, 'learning_rate': 0.0, 'epoch': 1.0} + 100%|██████████| 759/759 [1:47:22<00:00, 7.55s/it]Failed to load image: ['./nuscenes/samples/CAM_FRONT_LEFT/n015-2018-08-01-16-41-59+0800__CAM_FRONT_LEFT__1533112967904844.jpg', './nuscenes/samples/CAM_FRONT/n015-2018-08-01-16-41-59+0800__CAM_FRONT__1533112967912460.jpg', './nuscenes/samples/CAM_FRONT_RIGHT/n015-2018-08-01-16-41-59+0800__CAM_FRONT_RIGHT__1533112967920339.jpg', './nuscenes/samples/CAM_BACK_LEFT/n015-2018-08-01-16-41-59+0800__CAM_BACK_LEFT__1533112967947423.jpg', './nuscenes/samples/CAM_BACK/n015-2018-08-01-16-41-59+0800__CAM_BACK__1533112967937525.jpg', './nuscenes/samples/CAM_BACK_RIGHT/n015-2018-08-01-16-41-59+0800__CAM_BACK_RIGHT__1533112967927893.jpg'], the dataset is: nuscene_cap_194k +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +Failed to load image: ['./nuscenes/samples/CAM_FRONT_LEFT/n015-2018-08-01-16-32-59+0800__CAM_FRONT_LEFT__1533112829604844.jpg', './nuscenes/samples/CAM_FRONT/n015-2018-08-01-16-32-59+0800__CAM_FRONT__1533112829612460.jpg', './nuscenes/samples/CAM_FRONT_RIGHT/n015-2018-08-01-16-32-59+0800__CAM_FRONT_RIGHT__1533112829620339.jpg', './nuscenes/samples/CAM_BACK_LEFT/n015-2018-08-01-16-32-59+0800__CAM_BACK_LEFT__1533112829647423.jpg', './nuscenes/samples/CAM_BACK/n015-2018-08-01-16-32-59+0800__CAM_BACK__1533112829637525.jpg', './nuscenes/samples/CAM_BACK_RIGHT/n015-2018-08-01-16-32-59+0800__CAM_BACK_RIGHT__1533112829627893.jpg'], the dataset is: nuscene_cap_194k +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +Failed to load image: ['./nuscenes/samples/CAM_FRONT_LEFT/n008-2018-08-30-15-16-55-0400__CAM_FRONT_LEFT__1535656770404799.jpg', './nuscenes/samples/CAM_FRONT/n008-2018-08-30-15-16-55-0400__CAM_FRONT__1535656770412404.jpg', './nuscenes/samples/CAM_FRONT_RIGHT/n008-2018-08-30-15-16-55-0400__CAM_FRONT_RIGHT__1535656770420482.jpg', './nuscenes/samples/CAM_BACK_LEFT/n008-2018-08-30-15-16-55-0400__CAM_BACK_LEFT__1535656770447405.jpg', './nuscenes/samples/CAM_BACK/n008-2018-08-30-15-16-55-0400__CAM_BACK__1535656770437558.jpg', './nuscenes/samples/CAM_BACK_RIGHT/n008-2018-08-30-15-16-55-0400__CAM_BACK_RIGHT__1535656770428113.jpg'], the dataset is: nuscene_cap_194k +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +Failed to load image: ['./nuscenes/samples/CAM_FRONT_LEFT/n015-2018-08-01-16-41-59+0800__CAM_FRONT_LEFT__1533113238854844.jpg', './nuscenes/samples/CAM_FRONT/n015-2018-08-01-16-41-59+0800__CAM_FRONT__1533113238862460.jpg', './nuscenes/samples/CAM_FRONT_RIGHT/n015-2018-08-01-16-41-59+0800__CAM_FRONT_RIGHT__1533113238870339.jpg', './nuscenes/samples/CAM_BACK_LEFT/n015-2018-08-01-16-41-59+0800__CAM_BACK_LEFT__1533113238897423.jpg', './nuscenes/samples/CAM_BACK/n015-2018-08-01-16-41-59+0800__CAM_BACK__1533113238887525.jpg', './nuscenes/samples/CAM_BACK_RIGHT/n015-2018-08-01-16-41-59+0800__CAM_BACK_RIGHT__1533113238877893.jpg'], the dataset is: nuscene_cap_194k +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +petrel_client is not installed. If you read data locally instead of from ceph, ignore it. +petrel_client is not installed. Using PIL to load images. +Error in atexit._run_exitfuncs: +Traceback (most recent call last): + File "/fusion-algo-lidar-secret-nas/interns/zzc/myenv/internvl25_crh/lib/python3.9/site-packages/deepspeed/ops/transformer/inference/triton/matmul_ext.py", line 74, in load + loaded_dict = pickle.load(handle) +FileNotFoundError: [Errno 2] No such file or directory +Error in atexit._run_exitfuncs: +Traceback (most recent call last): + File "/fusion-algo-lidar-secret-nas/interns/zzc/myenv/internvl25_crh/lib/python3.9/site-packages/deepspeed/ops/transformer/inference/triton/matmul_ext.py", line 74, in load + loaded_dict = pickle.load(handle) +FileNotFoundError: [Errno 2] No such file or directory +[INFO|trainer.py:1962] 2024-12-31 18:57:30,936 >> + +Training completed. Do not forget to share your model on huggingface.co/models =) + + + {'train_runtime': 6445.9977, 'train_samples_per_second': 30.144, 'train_steps_per_second': 0.118, 'train_loss': 1.2714912057394097, 'epoch': 1.0} + 100%|██████████| 759/759 [1:47:25<00:00, 7.55s/it] 100%|██████████| 759/759 [1:47:25<00:00, 8.49s/it] +[INFO|trainer.py:2936] 2024-12-31 18:57:32,198 >> Saving model checkpoint to work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight +[INFO|configuration_utils.py:473] 2024-12-31 18:57:32,215 >> Configuration saved in work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/config.json +[INFO|configuration_utils.py:594] 2024-12-31 18:57:32,218 >> Configuration saved in work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/generation_config.json +[INFO|modeling_utils.py:2493] 2024-12-31 19:00:03,221 >> Model weights saved in work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/model.safetensors +[INFO|tokenization_utils_base.py:2433] 2024-12-31 19:00:03,805 >> tokenizer config file saved in work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/tokenizer_config.json +[INFO|tokenization_utils_base.py:2442] 2024-12-31 19:00:04,047 >> Special tokens file saved in work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/special_tokens_map.json +[INFO|tokenization_utils_base.py:2493] 2024-12-31 19:00:04,057 >> added tokens file saved in work_dirs/internvl2_2b_internlm2_1_8b_dynamic_res_nuscene_pre_loadofficialweight/added_tokens.json +***** train metrics ***** + epoch = 1.0 + train_loss = 1.2715 + train_runtime = 1:47:25.99 + train_samples = 194308 + train_samples_per_second = 30.144 + train_steps_per_second = 0.118