chemlm-86.24m / args.json
{
  "add_nsp": false,
  "async_worker": true,
  "attention_dropout_checkpoint": false,
  "current_run_id": "",
  "data_loader_type": "dist",
  "dataset_path": "/data2/sagawatatsuya/chemlm_pretraining2/chemlm_pretraining/dataset/data/samples",
  "deepspeed": true,
  "deepspeed_config": "/data2/sagawatatsuya/chemlm_pretraining2/output/5th_size/5th_size-/epoch2048_hf/deepspeed_config.json",
  "deepspeed_transformer_kernel": false,
  "do_validation": true,
  "ds_config": {
    "bf16": {
      "enabled": true
    },
    "gradient_clipping": 0.0,
    "steps_per_print": 100,
    "train_batch_size": 4096,
    "train_micro_batch_size_per_gpu": 128,
    "wall_clock_breakdown": false,
    "zero_optimization": {
      "gather_16bit_weights_on_model_save": true,
      "stage": 0
    }
  },
  "early_exit_time_marker": 1000000.0,
  "early_stop_eval_loss": 2.1,
  "early_stop_time": 720,
  "finetune_checkpoint_at_end": true,
  "fp16": true,
  "fp16_backend": "ds",
  "fp16_opt": "O2",
  "gelu_checkpoint": false,
  "gradient_accumulation_steps": 32,
  "gradient_clipping": 0.0,
  "job_name": "5th_size",
  "learning_rate": 0.001,
  "load_checkpoint_id": "latest_checkpoint",
  "load_training_checkpoint": "/data2/sagawatatsuya/chemlm_pretraining2/output/5th_size/5th_size-/epoch2048",
  "local_rank": 0,
  "log_throughput_every": 20,
  "lr": 0.001,
  "max_predictions_per_seq": 77,
  "max_steps": 9223372036854775807,
  "max_steps_per_epoch": 9223372036854775807,
  "model_config": {
    "attention_probs_dropout_prob": 0.1,
    "encoder_ln_mode": "pre-ln",
    "fused_linear_layer": true,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "layer_norm_type": "apex",
    "layernorm_embedding": false,
    "max_position_embeddings": 512,
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "sparse_mask_prediction": true,
    "type_vocab_size": 2,
    "vocab_size": 2362
  },
  "model_type": "bert-mlm",
  "no_nsp": true,
  "normalize_invertible": false,
  "num_epochs": 2048,
  "num_epochs_between_checkpoints": 1,
  "num_workers": 12,
  "output_dir": "/data2/sagawatatsuya/chemlm_pretraining2/output/5th_size",
  "prescale_gradients": false,
  "print_steps": 100,
  "project_name": "chemlm_pretraining_5th_size",
  "scale_cnt_limit": 100,
  "seed": 42,
  "steps_per_print": 100,
  "stochastic_mode": false,
  "tokenizer_name": "ibm-research/MoLFormer-XL-both-10pct",
  "total_training_time": 1000000.0,
  "train_batch_size": 4096,
  "train_micro_batch_size_per_gpu": 128,
  "use_early_stopping": false,
  "validation_begin_proportion": 0.05,
  "validation_end_proportion": 0.01,
  "validation_epochs": 1,
  "validation_epochs_begin": 1,
  "validation_epochs_end": 1,
  "validation_micro_batch": 128,
  "vocab_size": 2368,
  "wall_clock_breakdown": false
}
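
For reference, a minimal sketch of how the DeepSpeed batch-size arithmetic in this config fits together: `train_batch_size` must equal `train_micro_batch_size_per_gpu` times `gradient_accumulation_steps` times the world size, and 128 * 32 * 1 = 4096 here, which implies a single-GPU run. The snippet below assumes the file is saved locally as `args.json` and that the world size is 1 (it is not recorded in the file).

```python
import json

# Load the arguments above (path is an assumption; point it at your local copy).
with open("args.json") as f:
    args = json.load(f)

# DeepSpeed's effective-batch relation:
#   train_batch_size == train_micro_batch_size_per_gpu
#                       * gradient_accumulation_steps
#                       * world_size
world_size = 1  # assumption: not stored in args.json

effective = (
    args["train_micro_batch_size_per_gpu"]
    * args["gradient_accumulation_steps"]
    * world_size
)
assert effective == args["train_batch_size"]  # 128 * 32 * 1 == 4096
print(f"effective batch size: {effective}")
```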