import torch
from datasets import load_dataset
from peft import LoraConfig
from trl import RewardConfig, RewardTrainer

HUB_ORG = 'narcolepticchicken'
MODEL = 'Qwen/Qwen3-4B'
DATASET = f'{HUB_ORG}/speculative-actions-verifier-pref'
OUTPUT = f'{HUB_ORG}/speculative-verifier-qwen3-4b'

print('Loading dataset...')
ds = load_dataset(DATASET)

print('Configuring LoRA...')
# LoRA adapters on the attention projections. task_type='SEQ_CLS' is the
# setting TRL recommends for reward models: it keeps the newly initialized
# classification (reward) head trainable and saved alongside the adapter.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=['q_proj', 'v_proj', 'k_proj', 'o_proj'],
    task_type='SEQ_CLS',
)

print('Configuring Reward Training...')
config = RewardConfig(
    output_dir='/tmp/verifier-out',
    hub_model_id=OUTPUT,
    push_to_hub=True,
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # effective batch size of 16 per device
    num_train_epochs=2,
    bf16=True,
    gradient_checkpointing=True,
    logging_strategy='steps',
    logging_steps=10,
    logging_first_step=True,
    disable_tqdm=True,
    report_to='trackio',
    run_name='verifier-reward-qwen3-4b',
)

print('Initializing Reward Trainer...')
# Passing a model id string lets TRL load the base model (and its tokenizer)
# as a sequence-classification reward model with a single scalar output.
trainer = RewardTrainer(
    model=MODEL,
    train_dataset=ds['train'],
    eval_dataset=ds['test'],
    args=config,
    peft_config=peft_config,
)

print('Training verifier...')
trainer.train()
trainer.push_to_hub()
print('Verifier training complete.')
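
# Post-training sanity check: a minimal sketch for scoring one candidate
# with the freshly trained verifier. AutoPeftModelForSequenceClassification
# loads the pushed LoRA adapter on top of its recorded base model; the
# sample text below is a hypothetical placeholder, not from the dataset.
from peft import AutoPeftModelForSequenceClassification
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL)
verifier = AutoPeftModelForSequenceClassification.from_pretrained(
    OUTPUT,  # the local '/tmp/verifier-out' checkpoint also works here
    num_labels=1,
)
verifier.eval()

sample = 'User: book a table for two\nAction: open_reservations_app()'
inputs = tokenizer(sample, return_tensors='pt')
with torch.no_grad():
    score = verifier(**inputs).logits[0].item()
print(f'Sanity-check verifier score: {score:.3f}')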