{
  "add_nsp": false,
  "async_worker": true,
  "attention_dropout_checkpoint": false,
  "current_run_id": "",
  "data_loader_type": "dist",
  "dataset_path": "/data2/sagawatatsuya/chemlm_pretraining2/chemlm_pretraining/dataset/data/samples",
  "deepspeed": true,
  "deepspeed_config": "/data2/sagawatatsuya/chemlm_pretraining2/output/3th_size/3th_size-/epoch2048_hf/deepspeed_config.json",
  "deepspeed_transformer_kernel": false,
  "do_validation": true,
  "ds_config": {
    "bf16": {
      "enabled": true
    },
    "gradient_clipping": 0.0,
    "steps_per_print": 100,
    "train_batch_size": 4096,
    "train_micro_batch_size_per_gpu": 1024,
    "wall_clock_breakdown": false,
    "zero_optimization": {
      "gather_16bit_weights_on_model_save": true,
      "stage": 0
    }
  },
  "early_exit_time_marker": 1000000.0,
  "early_stop_eval_loss": 2.1,
  "early_stop_time": 720,
  "finetune_checkpoint_at_end": true,
  "fp16": true,
  "fp16_backend": "ds",
  "fp16_opt": "O2",
  "gelu_checkpoint": false,
  "gradient_accumulation_steps": 4,
  "gradient_clipping": 0.0,
  "job_name": "3th_size",
  "learning_rate": 0.001,
  "load_checkpoint_id": "latest_checkpoint",
  "load_training_checkpoint": "/data2/sagawatatsuya/chemlm_pretraining2/output/3th_size/3th_size-/epoch2048",
  "local_rank": 0,
  "log_throughput_every": 20,
  "lr": 0.001,
  "max_predictions_per_seq": 77,
  "max_steps": 9223372036854775807,
  "max_steps_per_epoch": 9223372036854775807,
  "model_config": {
    "attention_probs_dropout_prob": 0.1,
    "encoder_ln_mode": "pre-ln",
    "fused_linear_layer": true,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 256,
    "initializer_range": 0.02,
    "intermediate_size": 1024,
    "layer_norm_type": "apex",
    "layernorm_embedding": false,
    "max_position_embeddings": 512,
    "num_attention_heads": 4,
    "num_hidden_layers": 6,
    "sparse_mask_prediction": true,
    "type_vocab_size": 2,
    "vocab_size": 2362
  },
  "model_type": "bert-mlm",
  "no_nsp": true,
  "normalize_invertible": false,
  "num_epochs": 2048,
  "num_epochs_between_checkpoints": 1,
  "num_workers": 12,
  "output_dir": "/data2/sagawatatsuya/chemlm_pretraining2/output/3th_size",
  "prescale_gradients": false,
  "print_steps": 100,
  "project_name": "chemlm_pretraining_3th_size",
  "scale_cnt_limit": 100,
  "seed": 42,
  "steps_per_print": 100,
  "stochastic_mode": false,
  "tokenizer_name": "ibm-research/MoLFormer-XL-both-10pct",
  "total_training_time": 1000000.0,
  "train_batch_size": 4096,
  "train_micro_batch_size_per_gpu": 1024,
  "use_early_stopping": false,
  "validation_begin_proportion": 0.05,
  "validation_end_proportion": 0.01,
  "validation_epochs": 1,
  "validation_epochs_begin": 1,
  "validation_epochs_end": 1,
  "validation_micro_batch": 1024,
  "vocab_size": 2368,
  "wall_clock_breakdown": false
}