""" Colab Training Script for AutoMathReasoner (Hugging Face Space + Free T4 GPU) Instructions for Colab: 1. Create a new Google Colab notebook (Free Tier: T4 GPU is supported by Unsloth) 2. Run the following installation commands in your first cell: !pip install unsloth "trl<0.9.0" !pip install openenv-core pydantic httpx !git clone !cd AutoMathReasoner && pip install -e . 3. Run the following Python script in the next cell. """ import collections import random from datasets import Dataset import torch # Unsloth & TRL from unsloth import FastLanguageModel from trl import GRPOConfig, GRPOTrainer # AutoMathReasoner OpenEnv Client import sys sys.path.append("./AutoMathReasoner") from AutoMathReasoner.client import AutomathreasonerEnv from AutoMathReasoner.env.models import AutomathreasonerAction # 1. Configuration # Replace with your actual Hugging Face Space URL! HF_SPACE_URL = "https://your-username-automathreasoner.hf.space" env = AutomathreasonerEnv(url=HF_SPACE_URL) max_seq_length = 1024 # Fits well within Colab T4 16GB VRAM limit lora_rank = 16 # 2. Load Model via Unsloth (optimized for Free Colab VRAM) print("Loading model via Unsloth...") model, tokenizer = FastLanguageModel.from_pretrained( model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit", # Pre-quantized 4bit for fast download max_seq_length = max_seq_length, dtype = None, load_in_4bit = True, ) # Enable LoRA fine-tuning model = FastLanguageModel.get_peft_model( model, r = lora_rank, target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], lora_alpha = lora_rank, use_gradient_checkpointing = "unsloth", # Crucial for fitting into T4 ) # 3. Prepare Dummy Prompts from the Remote Environment print("Gathering initial prompts from HF Space environment...") initial_prompts = [] for _ in range(30): # This fires an HTTP request to your Hugging Face Space obs = env.reset() initial_prompts.append({"prompt": obs.problem_text}) dataset = Dataset.from_list(initial_prompts) # 4. Define Reward Function for TRL def compute_rewards(prompts, completions, **kwargs): """ Interfaces with the OpenEnv running on Hugging Face Spaces. Extracts the generation, passes it via HTTP to the env, and yields the dense reward. """ rewards = [] parsed_actions = [] prompt_answers = collections.defaultdict(list) # Track completion variants for prompt, completion in zip(prompts, completions): try: parts = completion.split("Answer:") reasoning = parts[0].strip() answer = parts[1].strip() if len(parts) > 1 else "" except Exception: reasoning = completion answer = "" parsed_actions.append((prompt, completion, reasoning, answer)) prompt_answers[prompt].append(answer) majority_answers = {} for p, ans_list in prompt_answers.items(): if ans_list: majority_answers[p] = collections.Counter(ans_list).most_common(1)[0][0] for p, c, r, a in parsed_actions: action = AutomathreasonerAction(reasoning=r, final_answer=a) # In a real environment mapping, we would initialize the episode with the specific prompt. # But for REST API environments, we simply reset and forcefully simulate. obs = env.reset() # Step through HTTP API step_obs = env.step(action) r_total = step_obs.reward # Self-consistency matching bonus majority = majority_answers.get(p, "") if (a == majority) and len(a) > 0: r_total += 0.2 rewards.append(r_total) return rewards # 5. 
# 5. Execute training
training_args = GRPOConfig(
    output_dir="colab_outputs",
    learning_rate=2e-5,
    per_device_train_batch_size=1,  # Keep at 1 on Colab GPUs to prevent OOM
    gradient_accumulation_steps=4,
    max_prompt_length=128,
    max_completion_length=256,
    num_generations=4,  # K=4 (reduced from 8 for the Colab T4's memory limits)
    max_steps=150,
    logging_steps=10,
    optim="adamw_8bit",  # 8-bit optimizer saves VRAM
)

trainer = GRPOTrainer(
    model=model,
    reward_funcs=[compute_rewards],
    args=training_args,
    train_dataset=dataset,
)

print("Starting GRPO training in Colab against the remote HF environment...")
# Reward/loss curves are logged (wandb/tensorboard) so you can verify it is actually learning
trainer.train()

# 6. Push to Hugging Face
# Optional: save locally or push to the Hub after training
# model.push_to_hub("your-name/AutoMathReasoner-Trained")
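# 7. Quick inference check (optional sketch)
# A minimal post-training sanity check, assuming the `model`/`tokenizer` objects
# from above. The prompt string is hypothetical; adjust it to your problem format.
FastLanguageModel.for_inference(model)  # Enable Unsloth's fast generation path
messages = [{"role": "user",
             "content": "Compute 12 * 7. End with 'Answer:' followed by the result."}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
outputs = model.generate(input_ids, max_new_tokens=256)
# Decode only the newly generated tokens, skipping the echoed prompt
print(tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True))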