"""
Colab Training Script for AutoMathReasoner (Hugging Face Space + Free T4 GPU)
Instructions for Colab:
1. Create a new Google Colab notebook (Free Tier: T4 GPU is supported by Unsloth)
2. Run the following installation commands in your first cell:
!pip install unsloth "trl>=0.14.0"  # GRPOTrainer/GRPOConfig ship in TRL 0.14+
!pip install openenv-core pydantic httpx
!git clone <YOUR-GITHUB-REPO-URL>
!cd AutoMathReasoner && pip install -e .
3. Run the following Python script in the next cell.
"""
import collections
import random
from datasets import Dataset
import torch
# Unsloth & TRL
from unsloth import FastLanguageModel
from trl import GRPOConfig, GRPOTrainer
# AutoMathReasoner OpenEnv Client
import sys
sys.path.append("./AutoMathReasoner")
from AutoMathReasoner.client import AutomathreasonerEnv
from AutoMathReasoner.env.models import AutomathreasonerAction
# 1. Configuration
# Replace with your actual Hugging Face Space URL!
HF_SPACE_URL = "https://your-username-automathreasoner.hf.space"
env = AutomathreasonerEnv(url=HF_SPACE_URL)
max_seq_length = 1024 # Fits well within Colab T4 16GB VRAM limit
lora_rank = 16
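# Free-tier Spaces go to sleep when idle, so the very first request can time out
# while the container wakes up. A small retry loop makes the initial connection
# robust (a sketch; the retry count and delay are arbitrary):
import time
for _attempt in range(5):
    try:
        env.reset()  # any request wakes the Space
        break
    except Exception as e:
        print(f"Space not awake yet ({e}); retrying in 30s...")
        time.sleep(30)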
# 2. Load Model via Unsloth (optimized for Free Colab VRAM)
print("Loading model via Unsloth...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit",  # pre-quantized 4-bit for fast download
    max_seq_length = max_seq_length,
    dtype = None,  # auto-detect (float16 on a T4)
    load_in_4bit = True,
)
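# Optional sanity check: confirm the T4 is visible and report its total VRAM
if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(0)
    print(f"GPU: {props.name}, {props.total_memory / 1e9:.1f} GB VRAM")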
# Enable LoRA fine-tuning
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = lora_rank,
    use_gradient_checkpointing = "unsloth",  # crucial for fitting into a T4
)
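# Optional: confirm that only the LoRA adapters are trainable
# (print_trainable_parameters is a standard PEFT-model method)
model.print_trainable_parameters()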
# 3. Prepare Dummy Prompts from the Remote Environment
print("Gathering initial prompts from HF Space environment...")
initial_prompts = []
for _ in range(30):
    # Each reset fires an HTTP request to your Hugging Face Space
    obs = env.reset()
    initial_prompts.append({"prompt": obs.problem_text})
dataset = Dataset.from_list(initial_prompts)
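# Quick look at what the policy will actually be prompted with
print(f"Collected {len(dataset)} prompts; first one: {dataset[0]['prompt']!r}")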
# 4. Define Reward Function for TRL
def compute_rewards(prompts, completions, **kwargs):
    """
    Interfaces with the OpenEnv environment running on Hugging Face Spaces.
    Parses each generation, sends it over HTTP to the env, and returns the dense rewards.
    """
    rewards = []
    parsed_actions = []
    prompt_answers = collections.defaultdict(list)
    # Parse each completion into (reasoning, answer) and track answer variants per prompt
    for prompt, completion in zip(prompts, completions):
        try:
            parts = completion.split("Answer:")
            reasoning = parts[0].strip()
            answer = parts[1].strip() if len(parts) > 1 else ""
        except Exception:
            reasoning = completion
            answer = ""
        parsed_actions.append((prompt, completion, reasoning, answer))
        prompt_answers[prompt].append(answer)
    # Majority answer per prompt, used for the self-consistency bonus below
    majority_answers = {}
    for p, ans_list in prompt_answers.items():
        if ans_list:
            majority_answers[p] = collections.Counter(ans_list).most_common(1)[0][0]
    for p, c, r, a in parsed_actions:
        action = AutomathreasonerAction(reasoning=r, final_answer=a)
        # Ideally we would re-initialize the episode with the exact prompt being
        # scored, but this REST API only exposes reset(), so we reset to a fresh
        # episode and score the action against it.
        obs = env.reset()
        # Step through the HTTP API to obtain the environment's dense reward
        step_obs = env.step(action)
        r_total = step_obs.reward
        # Self-consistency bonus: reward completions that agree with the majority answer
        majority = majority_answers.get(p, "")
        if (a == majority) and len(a) > 0:
            r_total += 0.2
        rewards.append(r_total)
    return rewards
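# Optional local sanity check of the parsing + majority-vote logic above,
# using made-up completions (no HTTP calls are made here):
_demo = ["Add 3 and 4. Answer: 7", "Guess. Answer: 8", "Recount. Answer: 7"]
_ans = [c.split("Answer:")[1].strip() for c in _demo]
assert collections.Counter(_ans).most_common(1)[0][0] == "7"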
# 5. Execute Training
training_args = GRPOConfig(
    output_dir="colab_outputs",
    learning_rate=2e-5,
    per_device_train_batch_size=4,  # TRL's GRPO requires the batch size to be divisible by num_generations
    gradient_accumulation_steps=4,
    max_prompt_length=128,          # short sequences keep the T4 from OOMing
    max_completion_length=256,
    num_generations=4,              # K=4 (reduced from 8 for Colab T4 memory limits)
    max_steps=150,
    logging_steps=10,
    optim="adamw_8bit",             # 8-bit optimizer saves VRAM
)
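# Rough rollout arithmetic under these settings (hedged; exact batching
# semantics vary across TRL versions): each micro-batch holds 4 completions,
# and gradients accumulate over 4 micro-batches, so roughly
# 4 * 4 = 16 scored completions (4 prompts * K=4 samples) feed each update.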
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,  # tokenizer loaded above by Unsloth
    reward_funcs=[compute_rewards],
    args=training_args,
    train_dataset=dataset,
)
print("Starting GRPO Training in Colab using Remote HF Environment...")
# Reward/loss metrics are logged every `logging_steps` (and to wandb/tensorboard
# if configured), so you can verify the policy is actually learning
trainer.train()
# 6. Push to Hugging Face
# Optional: save locally or push to Hub after it learns
# model.push_to_hub("your-name/AutoMathReasoner-Trained")
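# Or save the LoRA adapter locally (standard PEFT save; the directory name is arbitrary):
# model.save_pretrained("automathreasoner_lora")
# tokenizer.save_pretrained("automathreasoner_lora")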