# Hugging Face repository: webis/tite-2-late
# File: tite-2-late/pl_config.yaml (4.51 kB)
# Last commit: b55bd0b (verified, 9 months ago), uploaded by fschlatt:
#   "Upload folder using huggingface_hub"

	# lightning.pytorch==2.5.2
	seed_everything: 42
	trainer:
	accelerator: auto
	strategy: auto
	devices: auto
	num_nodes: 1
	precision: bf16-mixed
	callbacks:
	- class_path: lightning.pytorch.callbacks.ModelCheckpoint
	init_args:
	dirpath: null
	filename: null
	monitor: null
	verbose: false
	save_last: null
	save_top_k: 1
	save_weights_only: false
	mode: min
	auto_insert_metric_name: true
	every_n_train_steps: null
	train_time_interval: null
	every_n_epochs: null
	save_on_train_epoch_end: null
	enable_version_counter: true
	fast_dev_run: false
	max_epochs: null
	min_epochs: null
	max_steps: 200000
	min_steps: null
	max_time: null
	limit_train_batches: null
	limit_val_batches: null
	limit_test_batches: null
	limit_predict_batches: null
	overfit_batches: 0.0
	val_check_interval: 50000
	check_val_every_n_epoch: 1
	num_sanity_val_steps: null
	log_every_n_steps: null
	enable_checkpointing: null
	enable_progress_bar: false
	enable_model_summary: null
	accumulate_grad_batches: 2
	gradient_clip_val: 1
	gradient_clip_algorithm: null
	deterministic: null
	benchmark: null
	inference_mode: true
	use_distributed_sampler: true
	profiler: null
	detect_anomaly: false
	barebones: false
	plugins: null
	sync_batchnorm: false
	reload_dataloaders_every_n_epochs: 0
	default_root_dir: null
	model_registry: null
	model:
	class_path: tite.module.TiteModule
	init_args:
	model:
	class_path: tite.model.TiteForPreTraining
	init_args:
	config:
	class_path: tite.model.TiteConfig
	init_args:
	vocab_size: 30522
	num_hidden_layers: 12
	hidden_sizes: 768
	num_attention_heads: 12
	intermediate_sizes: 3072
	kernel_sizes:
	- null
	- null
	- null
	- 2
	- 2
	- 2
	- 2
	- 2
	- 2
	- 2
	- 2
	- 2
	strides:
	- null
	- null
	- null
	- 2
	- 2
	- 2
	- 2
	- 2
	- 2
	- 2
	- 2
	- 2
	dropout_prob: 0.1
	max_position_embeddings: 512
	initializer_range: 0.02
	layer_norm_eps: 1.0e-12
	pad_token_id: 0
	hidden_act: gelu_pytorch_tanh
	absolute_positional_embedding_type: null
	relative_positional_embedding_type: rotary
	pooling_location: intra
	rotary_interleaved: true
	norm_location: post
	norm_type: layer
	pooling_implementation: triton
	rope_implementation: eager
	positional_embedding_type: null
	enhanced_masked_auto_encoding: true
	bow_auto_encoding: true
	tokenizer:
	class_path: tite.model.TiteTokenizer
	init_args:
	vocab_file: tokenizers/tite/vocab.txt
	tokenizer_file: tokenizers/tite/tokenizer.json
	do_lower_case: true
	unk_token: '[UNK]'
	sep_token: '[SEP]'
	pad_token: '[PAD]'
	cls_token: '[CLS]'
	mask_token: '[MASK]'
	tokenize_chinese_chars: true
	strip_accents: null
	dict_kwargs:
	model_max_length: 512
	validate_on_glue: true
	validate_on_trec_dl: true
	log_gradients: false
	compile: true
	data:
	class_path: tite.datasets.FineWebDataModule
	init_args:
	collator:
	class_path: tite.datasets.TransformationCollator
	init_args:
	text_keys:
	- text
	- null
	string_transformations: null
	token_transformations:
	- class_path: tite.transformation.TokenMask
	init_args:
	mask_id: 103
	mask_prob: 0.3
	transformation_prob: 1.0
	max_length: 512
	path: HuggingFaceFW/fineweb-edu
	batch_size: 128
	seed: null
	num_workers: 8
	streaming: true
	lr_scheduler:
	class_path: tite.utils.lr_schedulers.SigmoidLRSchedulerWithLinearWarmup
	init_args:
	num_warmup_steps: 3000
	final_value: 0.02
	num_delay_steps: 0
	optimizer:
	class_path: tite.utils.adamw.AdamWNoWeightDecayBiasNorm
	init_args:
	lr: 0.0001
	betas:
	- 0.9
	- 0.999
	eps: 1.0e-08
	weight_decay: 0.01
	amsgrad: false
	maximize: false
	foreach: null
	capturable: false
	differentiable: false
	fused: null
	ckpt_path: null