| |
| model_name_or_path: Qwen2.5-7B |
| flash_attn: fa2 |
|
|
|
|
| |
| stage: sft |
| do_train: true |
| finetuning_type: full |
| deepspeed: examples/deepspeed/ds_z3_config.json |
|
|
| |
| dataset: tower_zhen_cot_mt_gemini_distill.sft |
| template: chatml |
| cutoff_len: 12000 |
| |
| |
| overwrite_cache: true |
| preprocessing_num_workers: 16 |
| dataloader_num_workers: 4 |
| |
|
|
| |
|
|
| |
| output_dir: saves/qwen-7b/full/sft/v2 |
| logging_steps: 1 |
| save_steps: 2500 |
| plot_loss: true |
| overwrite_output_dir: true |
| report_to: wandb |
|
|
| |
| per_device_train_batch_size: 4 |
| gradient_accumulation_steps: 2 |
| learning_rate: 1.0e-5 |
| num_train_epochs: 1.0 |
| lr_scheduler_type: cosine |
| warmup_ratio: 0.1 |
| |
| bf16: true |
| ddp_timeout: 180000000 |
| resume_from_checkpoint: null |
| |
|
|
| |
| |
| |
| |
| |
|
|