tenacious-bench-adapter / hyperparams.json
rafiakedir's picture
feat: add model card, inference example, and training scripts
e1374e8 verified
{
"model_id": "unsloth/Qwen2.5-1.5B-Instruct",
"training_algorithm": "ORPO",
"lora": {
"r": 16,
"lora_alpha": 32,
"target_modules": ["q_proj", "v_proj"],
"lora_dropout": 0.05,
"bias": "none",
"task_type": "CAUSAL_LM"
},
"orpo_trainer": {
"learning_rate": 8e-6,
"per_device_train_batch_size": 2,
"gradient_accumulation_steps": 4,
"effective_batch_size": 8,
"num_train_epochs": 3,
"warmup_ratio": 0.1,
"lr_scheduler_type": "cosine",
"beta": 0.1,
"max_length": 1024,
"max_prompt_length": 512,
"logging_steps": 10,
"save_steps": 50,
"seed": 42
},
"precision": {
"bf16": false,
"fp16": true,
"note": "T4 GPU: fp16 only. Switch to bf16 on A100/4090."
},
"adapter_output_dir": "training/adapter",
"hub_model_id": "rafiakedir/tenacious-bench-adapter",
"fixed_seed": 42,
"rationale": {
"orpo_vs_dpo": "ORPO chosen over DPO because it requires no reference model, reducing GPU memory footprint by ~40% on T4. Reference-free approach is appropriate for a judge component where the reference policy is undefined.",
"backbone_choice": "Qwen2.5-1.5B-Instruct selected per Prometheus-2 paper (Kim et al., 2024) showing 7B-class judge viability at 1.5B with preference tuning.",
"lora_rank": "Rank 16 with alpha 32 (2:1 ratio) is standard for task-specific adaptation. Rank 8 was considered but judge rubric complexity warrants higher rank.",
"beta_orpo": "Beta=0.1 follows ORPO paper (Hong et al., 2024) recommendation for instruction-following tasks."
}
}