| """ | |
| Colab Training Script for AutoMathReasoner (Hugging Face Space + Free T4 GPU) | |
| Instructions for Colab: | |
| 1. Create a new Google Colab notebook (Free Tier: T4 GPU is supported by Unsloth) | |
| 2. Run the following installation commands in your first cell: | |
| !pip install unsloth "trl<0.9.0" | |
| !pip install openenv-core pydantic httpx | |
| !git clone <YOUR-GITHUB-REPO-URL> | |
| !cd AutoMathReasoner && pip install -e . | |
| 3. Run the following Python script in the next cell. | |
| """ | |
import collections
import sys

import torch
from datasets import Dataset

# Unsloth & TRL
from unsloth import FastLanguageModel
from trl import GRPOConfig, GRPOTrainer

# AutoMathReasoner OpenEnv client
sys.path.append("./AutoMathReasoner")
from AutoMathReasoner.client import AutomathreasonerEnv
from AutoMathReasoner.env.models import AutomathreasonerAction
# 1. Configuration
# Replace with your actual Hugging Face Space URL!
HF_SPACE_URL = "https://your-username-automathreasoner.hf.space"
env = AutomathreasonerEnv(url=HF_SPACE_URL)

max_seq_length = 1024  # Fits well within the Colab T4's 16 GB VRAM limit
lora_rank = 16
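
# Optional smoke test (a minimal sketch using only the client calls already
# shown in this script): free Spaces go to sleep, so confirm the environment
# answers before loading the model. The first reset() may be slow while the
# container cold-starts.
obs = env.reset()
print("Environment reachable. Sample problem:", obs.problem_text[:80])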
# 2. Load Model via Unsloth (optimized for free Colab VRAM)
print("Loading model via Unsloth...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-Instruct-bnb-4bit",  # Pre-quantized 4-bit for fast download
    max_seq_length=max_seq_length,
    dtype=None,  # Auto-detect (float16 on a T4)
    load_in_4bit=True,
)

# Enable LoRA fine-tuning
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_rank,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=lora_rank,
    use_gradient_checkpointing="unsloth",  # Crucial for fitting into a T4
)
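
# Optional: report how small the LoRA update is. print_trainable_parameters()
# comes from the underlying PEFT model that Unsloth returns (assumes the
# returned object is a standard PeftModel); with r=16 on the seven projection
# modules above, only a fraction of a percent of the 8B weights is trained.
model.print_trainable_parameters()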
# 3. Prepare Prompts from the Remote Environment
print("Gathering initial prompts from the HF Space environment...")
initial_prompts = []
for _ in range(30):
    # Each reset fires an HTTP request to your Hugging Face Space
    obs = env.reset()
    initial_prompts.append({"prompt": obs.problem_text})
dataset = Dataset.from_list(initial_prompts)
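
# Sanity-check the dataset before training: GRPOTrainer expects a "prompt"
# column, and duplicates are harmless since each reset() may return a
# different problem.
print(f"Collected {len(dataset)} prompts; first example:")
print(dataset[0]["prompt"])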
# 4. Define Reward Function for TRL
def compute_rewards(prompts, completions, **kwargs):
    """
    Interfaces with the OpenEnv running on Hugging Face Spaces.
    Parses each generation, sends it via HTTP to the env, and returns the dense rewards.
    """
    rewards = []
    parsed_actions = []
    prompt_answers = collections.defaultdict(list)

    # Split each completion into reasoning and final answer on the "Answer:" marker
    for prompt, completion in zip(prompts, completions):
        parts = completion.split("Answer:")
        reasoning = parts[0].strip()
        answer = parts[1].strip() if len(parts) > 1 else ""
        parsed_actions.append((prompt, completion, reasoning, answer))
        prompt_answers[prompt].append(answer)

    # Majority vote per prompt, used below for the self-consistency bonus
    majority_answers = {}
    for p, ans_list in prompt_answers.items():
        if ans_list:
            majority_answers[p] = collections.Counter(ans_list).most_common(1)[0][0]

    for p, c, r, a in parsed_actions:
        action = AutomathreasonerAction(reasoning=r, final_answer=a)
        # Ideally the episode would be initialized with this specific prompt,
        # but the REST API only exposes reset/step, so we reset to a fresh
        # episode and score the completion against it.
        env.reset()
        # Step through the HTTP API to get the environment's reward
        step_obs = env.step(action)
        r_total = step_obs.reward

        # Self-consistency bonus: reward agreement with the majority answer
        majority = majority_answers.get(p, "")
        if a == majority and len(a) > 0:
            r_total += 0.2
        rewards.append(r_total)
    return rewards
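
# Quick offline check of the "Answer:" parsing convention the reward function
# relies on (illustrative string; no HTTP request is made here):
_demo = "Add 2 and 3 to get 5. Answer: 5"
_parts = _demo.split("Answer:")
assert _parts[0].strip() == "Add 2 and 3 to get 5."
assert _parts[1].strip() == "5"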
# 5. Execute Training
training_args = GRPOConfig(
    output_dir="colab_outputs",
    learning_rate=2e-5,
    per_device_train_batch_size=1,  # Keep at 1 on Colab GPUs to prevent OOM
    gradient_accumulation_steps=4,
    max_prompt_length=128,
    max_completion_length=256,
    num_generations=4,              # K=4 (reduced from 8 for Colab T4 memory limits)
    max_steps=150,
    logging_steps=10,
    optim="adamw_8bit",             # 8-bit optimizer saves VRAM
)
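
# Batch arithmetic for this config: 1 sample/device x 4 accumulation steps
# = 4 completions per optimizer step. Recent TRL versions require this
# generation batch to be divisible by num_generations, so each step scores
# one unique prompt with K=4 sampled completions.
assert (1 * 4) % training_args.num_generations == 0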
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,  # pass the Unsloth tokenizer explicitly
    reward_funcs=[compute_rewards],
    args=training_args,
    train_dataset=dataset,
)

print("Starting GRPO Training in Colab using the remote HF environment...")
# Reward and loss curves stream to wandb/tensorboard so you can verify it is actually learning
trainer.train()
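
# Save the LoRA adapters locally so they survive a Colab disconnect.
# "colab_outputs/lora_adapters" is an arbitrary example path, not one the
# project prescribes.
model.save_pretrained("colab_outputs/lora_adapters")
tokenizer.save_pretrained("colab_outputs/lora_adapters")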
# 6. Push to Hugging Face
# Optional: push to the Hub once training looks good
# model.push_to_hub("your-name/AutoMathReasoner-Trained")