feat: add model card, inference example, and training scripts

e1374e8 verified 18 days ago

1.57 kB

	{
	"model_id": "unsloth/Qwen2.5-1.5B-Instruct",
	"training_algorithm": "ORPO",
	"lora": {
	"r": 16,
	"lora_alpha": 32,
	"target_modules": ["q_proj", "v_proj"],
	"lora_dropout": 0.05,
	"bias": "none",
	"task_type": "CAUSAL_LM"
	},
	"orpo_trainer": {
	"learning_rate": 8e-6,
	"per_device_train_batch_size": 2,
	"gradient_accumulation_steps": 4,
	"effective_batch_size": 8,
	"num_train_epochs": 3,
	"warmup_ratio": 0.1,
	"lr_scheduler_type": "cosine",
	"beta": 0.1,
	"max_length": 1024,
	"max_prompt_length": 512,
	"logging_steps": 10,
	"save_steps": 50,
	"seed": 42
	},
	"precision": {
	"bf16": false,
	"fp16": true,
	"note": "T4 GPU: fp16 only. Switch to bf16 on A100/4090."
	},
	"adapter_output_dir": "training/adapter",
	"hub_model_id": "rafiakedir/tenacious-bench-adapter",
	"fixed_seed": 42,
	"rationale": {
	"orpo_vs_dpo": "ORPO chosen over DPO because it requires no reference model, reducing GPU memory footprint by ~40% on T4. Reference-free approach is appropriate for a judge component where the reference policy is undefined.",
	"backbone_choice": "Qwen2.5-1.5B-Instruct selected per Prometheus-2 paper (Kim et al., 2024) showing 7B-class judge viability at 1.5B with preference tuning.",
	"lora_rank": "Rank 16 with alpha 32 (2:1 ratio) is standard for task-specific adaptation. Rank 8 was considered but judge rubric complexity warrants higher rank.",
	"beta_orpo": "Beta=0.1 follows ORPO paper (Hong et al., 2024) recommendation for instruction-following tasks."
	}
	}