model_id: Qwen/Qwen2.5-7B-Instruct
model_family: qwen2.5
LoRA:
  r: 0
  alpha: 128
  dropout: 0.05
used_policy: true
used_strategy: true
step_token: <step>,</step>
answer_token: <answer>,</answer>
loss_type: grad_ascent
tune_vision_tower: false
tune_mm_projector: true
tune_language_model: true
data_path: ./dataset/safety_reasoning_v2_4step.json
split: with_strategy_policy_4step
batch_size: 2
gradient_accumulation_steps: 8
max_grad_norm: 1.0
num_epochs: 3
max_length: 3072
save_dir: models/final_ft_${num_epochs}_epochs_lr${lr}_${model_family}_${split}
save_steps: 165
lr: 1.0e-05
weight_decay: 0.01
seed: 233
workers: 4
lr_scheduler_type: cosine
warmup_ratio: 0.0
max_train_steps: -1
report_to: wandb
resume_from_checkpoint: ''