"""
Colab Training Script for AutoMathReasoner (Hugging Face Space + Free T4 GPU)

Instructions for Colab:
1. Create a new Google Colab notebook (Free Tier: T4 GPU is supported by Unsloth)
2. Run the following installation commands in your first cell:

!pip install unsloth "trl<0.9.0"
!pip install openenv-core pydantic httpx
!git clone <YOUR-GITHUB-REPO-URL>
!cd AutoMathReasoner && pip install -e .

3. Run the following Python script in the next cell.
"""

import collections
import random
from datasets import Dataset
import torch

# Unsloth & TRL
from unsloth import FastLanguageModel
from trl import GRPOConfig, GRPOTrainer

# AutoMathReasoner OpenEnv Client
import sys
sys.path.append("./AutoMathReasoner")
from AutoMathReasoner.client import AutomathreasonerEnv
from AutoMathReasoner.env.models import AutomathreasonerAction
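
# Fail fast if the runtime has no GPU: loading the 4-bit model without CUDA
# would otherwise error much later with a less obvious message.
assert torch.cuda.is_available(), (
    "No GPU detected - switch the runtime via Runtime > Change runtime type > T4 GPU."
)
print(f"Using GPU: {torch.cuda.get_device_name(0)}")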

# 1. Configuration
# Replace with your actual Hugging Face Space URL!
HF_SPACE_URL = "https://your-username-automathreasoner.hf.space"
env = AutomathreasonerEnv(url=HF_SPACE_URL)
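
# Optional connectivity probe: one reset round-trip confirms the Space is
# reachable before we spend minutes downloading the 8B model. This uses only
# the same reset() call the training loop relies on below.
probe_obs = env.reset()
print("Environment reachable. Sample problem:", probe_obs.problem_text[:100])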

max_seq_length = 1024 # Fits well within Colab T4 16GB VRAM limit
lora_rank = 16

# 2. Load Model via Unsloth (optimized for Free Colab VRAM)
print("Loading model via Unsloth...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit", # Pre-quantized 4bit for fast download 
    max_seq_length = max_seq_length,
    dtype = None,
    load_in_4bit = True,
)

# Enable LoRA fine-tuning 
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
    lora_alpha = lora_rank,
    use_gradient_checkpointing = "unsloth", # Crucial for fitting into T4
)
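
# Sanity check: with rank-16 adapters, only a tiny fraction (well under 1%)
# of the 8B parameters should be trainable. print_trainable_parameters() is
# provided by the underlying PEFT model that Unsloth returns.
model.print_trainable_parameters()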

# 3. Gather Training Prompts from the Remote Environment
print("Gathering initial prompts from HF Space environment...")
initial_prompts = []
for _ in range(30):
    # This fires an HTTP request to your Hugging Face Space
    obs = env.reset()
    initial_prompts.append({"prompt": obs.problem_text})

dataset = Dataset.from_list(initial_prompts)
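
# Eyeball one gathered prompt to confirm the text came through intact. Note
# the server may return duplicate problems across resets; with only 30 draws
# that is acceptable for a short demo run.
print(f"Gathered {len(dataset)} prompts. Example:", dataset[0]["prompt"][:120])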

# 4. Define Reward Function for TRL
def compute_rewards(prompts, completions, **kwargs):
    """
    Interfaces with the OpenEnv running on Hugging Face Spaces.
    Extracts the generation, passes it via HTTP to the env, and yields the dense reward.
    """
    rewards = []
    parsed_actions = []
    prompt_answers = collections.defaultdict(list)
    
    # Track completion variants
    for prompt, completion in zip(prompts, completions):
        # Split "reasoning ... Answer: X" on the "Answer:" marker; completions
        # that never emit the marker get an empty answer string.
        parts = completion.split("Answer:")
        reasoning = parts[0].strip()
        answer = parts[1].strip() if len(parts) > 1 else ""

        parsed_actions.append((prompt, completion, reasoning, answer))
        prompt_answers[prompt].append(answer)
        
    majority_answers = {}
    for p, ans_list in prompt_answers.items():
        if ans_list:
            majority_answers[p] = collections.Counter(ans_list).most_common(1)[0][0]

    for p, c, r, a in parsed_actions:
        action = AutomathreasonerAction(reasoning=r, final_answer=a)
        
        # The REST API offers no way to pin the episode to this specific prompt:
        # reset() draws a fresh problem server-side, so the reward grades the
        # action against whichever episode the server starts. A production setup
        # would pass an episode or problem identifier here instead.
        env.reset()
        
        # Step through HTTP API
        step_obs = env.step(action)
        r_total = step_obs.reward
        
        # Self-consistency bonus: reward answers that agree with the majority vote.
        majority = majority_answers.get(p, "")
        if a and a == majority:
            r_total += 0.2
            
        rewards.append(r_total)
            
    return rewards
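
# Optional smoke test of the reward pipeline before training (fires two real
# HTTP requests to the Space). The arithmetic prompt here is just a stand-in;
# the env grades against its own episode, so only the plumbing is being verified.
smoke_rewards = compute_rewards(
    prompts=["What is 2 + 2?"] * 2,
    completions=["2 + 2 = 4. Answer: 4", "I am not sure."],
)
print("Smoke-test rewards:", smoke_rewards)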

# 5. Execute Training
training_args = GRPOConfig(
    output_dir="colab_outputs",
    learning_rate=2e-5,
    per_device_train_batch_size=1,  # keep at 1 on a T4 to prevent OOM
    gradient_accumulation_steps=4,  # effective batch of 4; TRL requires this to be divisible by num_generations
    max_prompt_length=128,
    max_completion_length=256,
    num_generations=4,  # K=4, reduced from 8 for Colab T4 memory limits
    max_steps=150,
    logging_steps=10,
    optim="adamw_8bit",  # 8-bit optimizer states save VRAM
)

trainer = GRPOTrainer(
    model=model,
    reward_funcs=[compute_rewards],
    args=training_args,
    train_dataset=dataset,
)

print("Starting GRPO Training in Colab using Remote HF Environment...")
# Loss and reward are logged every `logging_steps`, so you can verify the
# policy is actually learning over the run.
trainer.train()
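
# Quick qualitative check: switch Unsloth into inference mode and let the
# tuned model attempt one fresh problem from the environment. for_inference()
# is Unsloth's fast-generation toggle; the decoding settings below are a
# minimal sketch, not tuned values.
FastLanguageModel.for_inference(model)
sample_obs = env.reset()
inputs = tokenizer(sample_obs.problem_text, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))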

# 6. Push to Hugging Face
# Optional: push the trained adapters to the Hub once you are happy with the run.
# model.push_to_hub("your-name/AutoMathReasoner-Trained")
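# Or save the LoRA adapters locally (they are small compared to the base model):
# model.save_pretrained("automathreasoner_lora")
# tokenizer.save_pretrained("automathreasoner_lora")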