{
  "model_id": "unsloth/Qwen2.5-1.5B-Instruct",
  "training_algorithm": "ORPO",
  "lora": {
    "r": 16,
    "lora_alpha": 32,
    "target_modules": ["q_proj", "v_proj"],
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM"
  },
  "orpo_trainer": {
    "learning_rate": 8e-6,
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 4,
    "effective_batch_size": 8,
    "num_train_epochs": 3,
    "warmup_ratio": 0.1,
    "lr_scheduler_type": "cosine",
    "beta": 0.1,
    "max_length": 1024,
    "max_prompt_length": 512,
    "logging_steps": 10,
    "save_steps": 50,
    "seed": 42
  },
  "precision": {
    "bf16": false,
    "fp16": true,
    "note": "The T4 GPU supports fp16 only (no bf16). On an A100/4090, switch to bf16 by setting bf16 to true and fp16 to false."
  },
  "adapter_output_dir": "training/adapter",
  "hub_model_id": "rafiakedir/tenacious-bench-adapter",
  "fixed_seed": 42,
  "rationale": {
    "orpo_vs_dpo": "ORPO was chosen over DPO because it requires no reference model, which reduces the GPU memory footprint by ~40% on a T4. A reference-free approach is also appropriate for a judge component, where the reference policy is undefined.",
    "backbone_choice": "Qwen2.5-1.5B-Instruct selected per Prometheus-2 paper (Kim et al., 2024) showing 7B-class judge viability at 1.5B with preference tuning.",
    "lora_rank": "Rank 16 with alpha 32 (alpha:rank = 2:1) is a standard setting for task-specific adaptation. Rank 8 was considered, but the complexity of the judge rubric warrants a higher rank.",
    "beta_orpo": "Beta=0.1 (the odds-ratio loss weight, denoted lambda in the ORPO paper) follows the recommendation of the ORPO paper (Hong et al., 2024) for instruction-following tasks."
  }
}