{
  "add_nsp": false,
  "async_worker": true,
  "attention_dropout_checkpoint": false,
  "current_run_id": "",
  "data_loader_type": "dist",
  "dataset_path": "/data2/sagawatatsuya/chemlm_pretraining2/chemlm_pretraining/dataset/data/samples",
  "deepspeed": true,
  "deepspeed_config": "/data2/sagawatatsuya/chemlm_pretraining2/output/4th_size/4th_size-/epoch2048_hf/deepspeed_config.json",
  "deepspeed_transformer_kernel": false,
  "do_validation": true,
  "ds_config": {
    "bf16": {
      "enabled": true
    },
    "gradient_clipping": 0.0,
    "steps_per_print": 100,
    "train_batch_size": 4096,
    "train_micro_batch_size_per_gpu": 512,
    "wall_clock_breakdown": false,
    "zero_optimization": {
      "gather_16bit_weights_on_model_save": true,
      "stage": 0
    }
  },
  "early_exit_time_marker": 1000000.0,
  "early_stop_eval_loss": 2.1,
  "early_stop_time": 720,
  "finetune_checkpoint_at_end": true,
  "fp16": true,
  "fp16_backend": "ds",
  "fp16_opt": "O2",
  "gelu_checkpoint": false,
  "gradient_accumulation_steps": 8,
  "gradient_clipping": 0.0,
  "job_name": "4th_size",
  "learning_rate": 0.001,
  "load_checkpoint_id": "latest_checkpoint",
  "load_training_checkpoint": "/data2/sagawatatsuya/chemlm_pretraining2/output/4th_size/4th_size-/epoch2048",
  "local_rank": 0,
  "log_throughput_every": 20,
  "lr": 0.001,
  "max_predictions_per_seq": 77,
  "max_steps": 9223372036854775807,
  "max_steps_per_epoch": 9223372036854775807,
  "model_config": {
    "attention_probs_dropout_prob": 0.1,
    "encoder_ln_mode": "pre-ln",
    "fused_linear_layer": true,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 512,
    "initializer_range": 0.02,
    "intermediate_size": 2048,
    "layer_norm_type": "apex",
    "layernorm_embedding": false,
    "max_position_embeddings": 512,
    "num_attention_heads": 8,
    "num_hidden_layers": 8,
    "sparse_mask_prediction": true,
    "type_vocab_size": 2,
    "vocab_size": 2362
  },
  "model_type": "bert-mlm",
  "no_nsp": true,
  "normalize_invertible": false,
  "num_epochs": 2048,
  "num_epochs_between_checkpoints": 1,
  "num_workers": 12,
  "output_dir": "/data2/sagawatatsuya/chemlm_pretraining2/output/4th_size",
  "prescale_gradients": false,
  "print_steps": 100,
  "project_name": "chemlm_pretraining_4th_size",
  "scale_cnt_limit": 100,
  "seed": 42,
  "steps_per_print": 100,
  "stochastic_mode": false,
  "tokenizer_name": "ibm-research/MoLFormer-XL-both-10pct",
  "total_training_time": 1000000.0,
  "train_batch_size": 4096,
  "train_micro_batch_size_per_gpu": 512,
  "use_early_stopping": false,
  "validation_begin_proportion": 0.05,
  "validation_end_proportion": 0.01,
  "validation_epochs": 1,
  "validation_epochs_begin": 1,
  "validation_epochs_end": 1,
  "validation_micro_batch": 512,
  "vocab_size": 2368,
  "wall_clock_breakdown": false
}
|
|