max_seq_len: 8192
global_seed: 17

# Run Name
run_name: mpt-30b-orca-1ep_flan3m  # If left blank, will be read from env var $RUN_NAME

model:
  name: hf_causal_lm
  pretrained: true
  pretrained_model_name_or_path: mosaicml/mpt-30b
  init_device: mixed
  config_overrides:
    max_seq_len: ${max_seq_len}
    attn_config:
      attn_impl: triton
      # Set this to `true` if using `train_loader.dataset.packing_ratio` below
      attn_uses_sequence_id: false
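
# Note: `packing_ratio` *is* enabled under `train_loader.dataset` below, so per the
# comment above you would normally flip `attn_uses_sequence_id` to `true`; leaving it
# `false` lets tokens attend across packed-example boundaries, which usually still
# trains fine but is not strictly isolated.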

# Tokenizer
tokenizer:
  name: mosaicml/mpt-30b
  kwargs:
    model_max_length: ${max_seq_len}

# Dataloaders
train_loader:
  name: finetuning
  dataset:
    hf_name: csv
    hf_kwargs:
      data_dir: ~/mpt/llm-foundry/data/orca_3m_gpt3.5
    preprocessing_fn:
    split: train
    max_seq_len: ${max_seq_len}
    allow_pad_trimming: false
    decoder_only_format: true
    # # Use `python llmfoundry/data/packing.py --yaml-path /path/to/this/yaml/ ...`
    # # to profile this run's optimal packing_ratio as it depends on GPU count,
    # # batch size, sequence length
    packing_ratio: 19.0
    shuffle: true
  drop_last: true
  num_workers: 8
  pin_memory: false
  prefetch_factor: 2
  persistent_workers: true
  timeout: 0
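
# Rough packing arithmetic: with packing_ratio: 19.0 and max_seq_len: 8192, about 19
# raw examples are concatenated into each 8192-token training sequence, so one global
# batch of 8 covers on the order of 8 * 19 ≈ 152 examples. The profiling helper
# referenced above can suggest a ratio for your GPU count, batch size, and sequence
# length; its exact CLI flags depend on the llm-foundry version you have checked out.
# Leaving `preprocessing_fn` empty assumes the CSV already provides the prompt/response
# fields the finetuning collator expects.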

# Optimization
scheduler:
  name: linear_decay_with_warmup  # linear with no warmup is the HF default, which Dolly used
  t_warmup: 100ba  # adding some warmup seems to help with MPT
  alpha_f: 0
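
# This schedule ramps the learning rate linearly from 0 up to `lr` over the first
# 100 batches, then decays it linearly to alpha_f * lr = 0 by max_duration.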

optimizer:
  # Based on Dolly
  name: decoupled_lionw
  lr: 2.0e-6
  betas:
  - 0.9
  - 0.999
  eps: 1.0e-8
  weight_decay: 0
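
# These betas/eps look like AdamW-style values carried over from the Dolly recipe;
# Lion-family optimizers typically use betas around (0.9, 0.99) and have no eps term,
# so depending on your llm-foundry version the `eps` entry may be ignored or rejected
# by the optimizer builder.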

algorithms:
  gradient_clipping:
    clipping_type: norm
    clipping_threshold: 1.0
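
# Clips the global L2 norm of the gradients to 1.0 before each optimizer step.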

max_duration: 1ep  # 2-3 epochs seems like the sweet spot
eval_interval: 1
# eval_subset_num_batches: -1
# eval_first: true
global_train_batch_size: 8  # somewhere in the 6-8 * numgpus range seems good

# System
seed: ${global_seed}
# device_eval_batch_size: 8
device_train_microbatch_size: 2
# device_train_microbatch_size: auto
precision: amp_bf16
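
# Batch-size bookkeeping: each device gets global_train_batch_size / num_gpus samples
# per step, and gradient accumulation is that value / device_train_microbatch_size.
# For example, on 4 GPUs: 8 / 4 = 2 per device, which with a microbatch of 2 means a
# single microbatch per step (no accumulation).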

# FSDP
fsdp_config:
  sharding_strategy: FULL_SHARD
  mixed_precision: PURE
  activation_checkpointing: true
  activation_checkpointing_reentrant: false
  activation_cpu_offload: false
  limit_all_gathers: true
  verbose: false
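
# FULL_SHARD shards parameters, gradients, and optimizer state across all ranks, and
# PURE mixed precision keeps parameters and gradient communication in bf16 (matching
# `precision: amp_bf16` above). Activation checkpointing trades extra recompute for
# memory, which a 30B model generally needs at this sequence length.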

# Logging
progress_bar: false
log_to_console: true
console_log_interval: 1ba

callbacks:
  speed_monitor:
    window_size: 10
  lr_monitor: {}
  memory_monitor: {}
  runtime_estimator: {}
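
# speed_monitor reports throughput averaged over the last 10 batches; lr_monitor,
# memory_monitor, and runtime_estimator log the learning rate, GPU memory usage, and
# an estimated time to completion, respectively.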

# loggers:
#   wandb: {}

# Checkpoint to local filesystem or remote object store
# save_interval: 5000ba
save_num_checkpoints_to_keep: 1  # Important, this cleans up checkpoints saved to DISK
save_folder: ./{run_name}/checkpoints
# save_folder: s3://my-bucket/my-folder/{run_name}/checkpoints
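
# Launch sketch (paths are illustrative and depend on your llm-foundry checkout):
#   cd llm-foundry/scripts
#   composer train/train.py /path/to/this/yaml
# The resulting Composer checkpoints can be converted to a HuggingFace-format folder
# with the conversion script under scripts/inference/ in llm-foundry.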