max_seq_len: 8192
global_seed: 17

# Run Name
run_name: mpt-30b-orca-1ep_flan3m  # If left blank, will be read from env var $RUN_NAME

model:
  name: hf_causal_lm
  pretrained: true
  pretrained_model_name_or_path: mosaicml/mpt-30b
  init_device: mixed
  config_overrides:
    max_seq_len: ${max_seq_len}
    attn_config:
      attn_impl: triton
      # Set this to `true` if using `train_loader.dataset.packing_ratio` below
      attn_uses_sequence_id: false
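
# Note: `packing_ratio` *is* enabled under `train_loader.dataset` below, so per the
# comment above you would normally flip `attn_uses_sequence_id` to `true`; leaving it
# `false` lets tokens attend across packed-example boundaries, which usually still
# trains fine but is not strictly isolated.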

# Tokenizer
tokenizer:
  name: mosaicml/mpt-30b
  kwargs:
    model_max_length: ${max_seq_len}

# Dataloaders
train_loader:
  name: finetuning
  dataset:
    hf_name: csv
    hf_kwargs:
      data_dir: ~/mpt/llm-foundry/data/orca_3m_gpt3.5
    preprocessing_fn:
    split: train
    max_seq_len: ${max_seq_len}
    allow_pad_trimming: false
    decoder_only_format: true
    # # Use `python llmfoundry/data/packing.py --yaml-path /path/to/this/yaml/ ...`
    # # to profile this run's optimal packing_ratio as it depends on GPU count,
    # # batch size, sequence length
    packing_ratio: 19.0
    shuffle: true
  drop_last: true
  num_workers: 8
  pin_memory: false
  prefetch_factor: 2
  persistent_workers: true
  timeout: 0
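
# Rough packing arithmetic: with packing_ratio: 19.0 and max_seq_len: 8192, about 19
# raw examples are concatenated into each 8192-token training sequence, so one global
# batch of 8 covers on the order of 8 * 19 ≈ 152 examples. The profiling helper
# referenced above can suggest a ratio for your GPU count, batch size, and sequence
# length; its exact CLI flags depend on the llm-foundry version you have checked out.
# Leaving `preprocessing_fn` empty assumes the CSV already provides the prompt/response
# fields the finetuning collator expects.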

# Optimization
scheduler:
  name: linear_decay_with_warmup  # linear with no warmup is the HF default, which Dolly used
  t_warmup: 100ba  # adding some warmup seems to help with MPT
  alpha_f: 0
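
# This schedule ramps the learning rate linearly from 0 up to `lr` over the first
# 100 batches, then decays it linearly to alpha_f * lr = 0 by max_duration.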

optimizer:
  # Based on Dolly
  name: decoupled_lionw
  lr: 2.0e-6
  betas:
  - 0.9
  - 0.999
  eps: 1.0e-8
  weight_decay: 0
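
# These betas/eps look like AdamW-style values carried over from the Dolly recipe;
# Lion-family optimizers typically use betas around (0.9, 0.99) and have no eps term,
# so depending on your llm-foundry version the `eps` entry may be ignored or rejected
# by the optimizer builder.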

algorithms:
  gradient_clipping:
    clipping_type: norm
    clipping_threshold: 1.0
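
# Clips the global L2 norm of the gradients to 1.0 before each optimizer step.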

max_duration: 1ep  # 2-3 epochs seems like the sweet spot
eval_interval: 1
# eval_subset_num_batches: -1
# eval_first: true
global_train_batch_size: 8  # somewhere in the 6-8 * numgpus range seems good

# System
seed: ${global_seed}
# device_eval_batch_size: 8
device_train_microbatch_size: 2
# device_train_microbatch_size: auto
precision: amp_bf16
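
# Batch-size bookkeeping: each device gets global_train_batch_size / num_gpus samples
# per step, and gradient accumulation is that value / device_train_microbatch_size.
# For example, on 4 GPUs: 8 / 4 = 2 per device, which with a microbatch of 2 means a
# single microbatch per step (no accumulation).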

# FSDP
fsdp_config:
  sharding_strategy: FULL_SHARD
  mixed_precision: PURE
  activation_checkpointing: true
  activation_checkpointing_reentrant: false
  activation_cpu_offload: false
  limit_all_gathers: true
  verbose: false
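
# FULL_SHARD shards parameters, gradients, and optimizer state across all ranks, and
# PURE mixed precision keeps parameters and gradient communication in bf16 (matching
# `precision: amp_bf16` above). Activation checkpointing trades extra recompute for
# memory, which a 30B model generally needs at this sequence length.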

# Logging
progress_bar: false
log_to_console: true
console_log_interval: 1ba

callbacks:
  speed_monitor:
    window_size: 10
  lr_monitor: {}
  memory_monitor: {}
  runtime_estimator: {}
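
# speed_monitor reports throughput averaged over the last 10 batches; lr_monitor,
# memory_monitor, and runtime_estimator log the learning rate, GPU memory usage, and
# an estimated time to completion, respectively.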

# loggers:
#   wandb: {}

# Checkpoint to local filesystem or remote object store
# save_interval: 5000ba
save_num_checkpoints_to_keep: 1  # Important, this cleans up checkpoints saved to DISK
save_folder: ./{run_name}/checkpoints
# save_folder: s3://my-bucket/my-folder/{run_name}/checkpoints
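
# Launch sketch (paths are illustrative and depend on your llm-foundry checkout):
#   cd llm-foundry/scripts
#   composer train/train.py /path/to/this/yaml
# The resulting Composer checkpoints can be converted to a HuggingFace-format folder
# with the conversion script under scripts/inference/ in llm-foundry.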