""" Colab Training Script for AutoMathReasoner (Hugging Face Space + Free T4 GPU) Instructions for Colab: 1. Create a new Google Colab notebook (Free Tier: T4 GPU is supported by Unsloth) 2. Run the following installation commands in your first cell: !pip install unsloth "trl<0.9.0" !pip install openenv-core pydantic httpx !git clone !cd AutoMathReasoner && pip install -e . 3. Run the following Python script in the next cell. """ import collections import random from datasets import Dataset import torch # Unsloth & TRL from unsloth import FastLanguageModel from trl import GRPOConfig, GRPOTrainer # AutoMathReasoner OpenEnv Client import sys sys.path.append("./AutoMathReasoner") from AutoMathReasoner.client import AutomathreasonerEnv from AutoMathReasoner.env.models import AutomathreasonerAction # 1. Configuration # Replace with your actual Hugging Face Space URL! HF_SPACE_URL = "https://your-username-automathreasoner.hf.space" env = AutomathreasonerEnv(url=HF_SPACE_URL) max_seq_length = 1024 # Fits well within Colab T4 16GB VRAM limit lora_rank = 16 # 2. Load Model via Unsloth (optimized for Free Colab VRAM) print("Loading model via Unsloth...") model, tokenizer = FastLanguageModel.from_pretrained( model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit", # Pre-quantized 4bit for fast download max_seq_length = max_seq_length, dtype = None, load_in_4bit = True, ) # Enable LoRA fine-tuning model = FastLanguageModel.get_peft_model( model, r = lora_rank, target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], lora_alpha = lora_rank, use_gradient_checkpointing = "unsloth", # Crucial for fitting into T4 ) # 3. Prepare Dummy Prompts from the Remote Environment print("Gathering initial prompts from HF Space environment...") initial_prompts = [] for _ in range(30): # This fires an HTTP request to your Hugging Face Space obs = env.reset() initial_prompts.append({"prompt": obs.problem_text}) dataset = Dataset.from_list(initial_prompts) # 4. Define Reward Function for TRL def compute_rewards(prompts, completions, **kwargs): """ Interfaces with the OpenEnv running on Hugging Face Spaces. Extracts the generation, passes it via HTTP to the env, and yields the dense reward. """ rewards = [] parsed_actions = [] prompt_answers = collections.defaultdict(list) # Track completion variants for prompt, completion in zip(prompts, completions): try: parts = completion.split("Answer:") reasoning = parts[0].strip() answer = parts[1].strip() if len(parts) > 1 else "" except Exception: reasoning = completion answer = "" parsed_actions.append((prompt, completion, reasoning, answer)) prompt_answers[prompt].append(answer) majority_answers = {} for p, ans_list in prompt_answers.items(): if ans_list: majority_answers[p] = collections.Counter(ans_list).most_common(1)[0][0] for p, c, r, a in parsed_actions: action = AutomathreasonerAction(reasoning=r, final_answer=a) # In a real environment mapping, we would initialize the episode with the specific prompt. # But for REST API environments, we simply reset and forcefully simulate. obs = env.reset() # Step through HTTP API step_obs = env.step(action) r_total = step_obs.reward # Self-consistency matching bonus majority = majority_answers.get(p, "") if (a == majority) and len(a) > 0: r_total += 0.2 rewards.append(r_total) return rewards # 5. 
# 5. Execute training
training_args = GRPOConfig(
    output_dir="colab_outputs",
    learning_rate=2e-5,
    per_device_train_batch_size=1,  # Keep at 1 on Colab GPUs to prevent OOM
    gradient_accumulation_steps=4,
    max_prompt_length=128,
    max_completion_length=256,
    num_generations=4,  # K=4 (reduced from 8 for the Colab T4's memory limits)
    max_steps=150,
    logging_steps=10,
    optim="adamw_8bit",  # 8-bit optimizer saves VRAM
)

trainer = GRPOTrainer(
    model=model,
    reward_funcs=[compute_rewards],
    args=training_args,
    train_dataset=dataset,
)

print("Starting GRPO training in Colab against the remote HF environment...")
# Reward/loss curves are logged (wandb/tensorboard) so you can verify it is actually learning
trainer.train()

# 6. Push to Hugging Face
# Optional: save locally or push to the Hub after training
# model.push_to_hub("your-name/AutoMathReasoner-Trained")
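# 7. Quick inference check (optional sketch)
# A minimal post-training sanity check, assuming the `model`/`tokenizer` objects
# from above. The prompt string is hypothetical; adjust it to your problem format.
FastLanguageModel.for_inference(model)  # Enable Unsloth's fast generation path
messages = [{"role": "user",
             "content": "Compute 12 * 7. End with 'Answer:' followed by the result."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
outputs = model.generate(input_ids, max_new_tokens=256)
# Decode only the newly generated tokens, skipping the echoed prompt
print(tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True))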