seed_everything: 123
trainer:
  accelerator: gpu
  strategy:
    class_path: lightning.pytorch.strategies.DeepSpeedStrategy
    init_args:
      accelerator: null
      zero_optimization: true
      stage: 2
      remote_device: null
      offload_optimizer: false
      offload_parameters: true
      offload_params_device: cpu
      nvme_path: /local_nvme
      params_buffer_count: 5
      params_buffer_size: 100000000
      max_in_cpu: 1000000000
      offload_optimizer_device: cpu
      optimizer_buffer_count: 4
      block_size: 1048576
      queue_depth: 8
      single_submit: false
      overlap_events: true
      thread_count: 1
      pin_memory: true
      sub_group_size: 1000000000000
      contiguous_gradients: true
      overlap_comm: true
      allgather_partitions: true
      reduce_scatter: true
      allgather_bucket_size: 200000000
      reduce_bucket_size: 200000000
      zero_allow_untested_optimizer: true
      logging_batch_size_per_gpu: auto
      config: null
      logging_level: 30
      parallel_devices: null
      cluster_environment: null
      loss_scale: 0.0
      initial_scale_power: 16
      loss_scale_window: 1000
      hysteresis: 2
      min_loss_scale: 1
      partition_activations: false
      cpu_checkpointing: false
      contiguous_memory_optimization: false
      synchronize_checkpoint_boundary: false
      load_full_weights: false
      precision_plugin: null
      process_group_backend: null
  devices: 8
  num_nodes: 1
  precision: bf16-true
  logger:
    class_path: lightning.pytorch.loggers.TensorBoardLogger
    init_args:
      save_dir: /media/logs
      name: main
      version: null
      log_graph: false
      default_hp_metric: true
      prefix: ''
      sub_dir: null
      comment: ''
      purge_step: null
      max_queue: 10
      flush_secs: 120
      filename_suffix: ''
  callbacks: null
  fast_dev_run: false
  max_epochs: 2
  min_epochs: null
  max_steps: -1
  min_steps: null
  max_time: null
  limit_train_batches: null
  limit_val_batches: null
  limit_test_batches: null
  limit_predict_batches: null
  overfit_batches: 0.0
  val_check_interval: null
  check_val_every_n_epoch: 1
  num_sanity_val_steps: 0
  log_every_n_steps: 1
  enable_checkpointing: null
  enable_progress_bar: null
  enable_model_summary: null
  accumulate_grad_batches: 8
  gradient_clip_val: null
  gradient_clip_algorithm: null
  deterministic: null
  benchmark: null
  inference_mode: true
  use_distributed_sampler: true
  profiler: null
  detect_anomaly: false
  barebones: false
  plugins: null
  sync_batchnorm: false
  reload_dataloaders_every_n_epochs: 0
  default_root_dir: null
model:
  config:
    model_name: Mistral-7B-v0.2
    dtype: bfloat16
    num_thoughts: 2
    thought_length: 8
    lookahead_tokens: 4
    embedding_grad_weights: 100.0
    temperature: 1.0
    do_sample: true
    train_max_length: 120
    offload_cache: false
    top_k: null
    top_p: null
    checkpoint_dir: /media/models/unsloth/Mistral-7B-v0.2
    weight_decay: 0.001
    warmup_steps: 20
    policy_weight: 1.0
    init_lr: 1.0e-06
  optimizer:
    class_path: torch.optim.AdamW
    init_args:
      lr: 1.0e-06
      betas:
      - 0.9
      - 0.999
      eps: 1.0e-08
      weight_decay: 0.001
      amsgrad: false
      maximize: false
      foreach: null
      capturable: false
      differentiable: false
      fused: null
  scheduler: null
ckpt_path: null
data:
  class_path: src.dataset.OpenWebMathDataModule
  init_args:
    data_path: /media/datasets/openwebmath
    tokenizer:
      class_path: src.dataset.SpecialTokenizer
      init_args:
        checkpoint_dir: /media/models/unsloth/Mistral-7B-v0.2
    batch_size: 1
    max_seq_length: 120
    num_samples: 2048
    ignore_index: -100
    val_split_fraction: 0.125
    seed: 42
    num_workers: 1