#!/usr/bin/env python3
"""
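Simple per-neuron activation tracker for LLaMA-style gated MLP layers
(gate_up_proj / down_proj with a SiLU gate) of models loaded through vLLM.
For each entry in RUN_CONFIGS it counts, per layer, how often each neuron's
gate activation is positive over a fixed file of input token IDs.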
"""

import gc
import os
from types import MethodType

import torch
from vllm import LLM, SamplingParams  # Keep your original import since hook logic depends on vLLM

# ---------------------- Config ----------------------
# One row per run, expanded into a dict with the keys below:
#   model    - handed unchanged to vLLM's LLM(model=...)
#   ids_path - file read with torch.load; must yield a tensor of token IDs
#   lang     - label that becomes part of the output file name
_CONFIG_FIELDS = ('model', 'ids_path', 'lang')
RUN_CONFIGS = [
    dict(zip(_CONFIG_FIELDS, row))
    for row in (
        ('qwen2.5-0.5', '../ids/qwen2.5-0.5/id.en.train.qwen2.5-0.5', 'en'),
        ('qwen2.5-0.5', '../ids/qwen2.5-0.5/id.de.train.qwen2.5-0.5', 'de'),
        # Add more rows here
    )
]

# Directory that receives the saved activation counts; it is created as soon
# as this module is executed so later torch.save calls cannot fail on it.
SAVE_FOLDER = "data"
os.makedirs(SAVE_FOLDER, exist_ok=True)

# ---------------------- Original Hook Function ----------------------
def make_llama_hook(idx):
    """Build a replacement ``forward`` for one gated MLP layer.

    The returned function is meant to be bound to an MLP module that exposes
    ``gate_up_proj`` and ``down_proj`` callables, each returning a
    ``(tensor, extra)`` pair. It reproduces the SiLU-gated MLP computation and,
    as a side effect, adds to row ``idx`` of the module-level ``over_zero``
    tensor the number of positions at which each gate activation is > 0.

    ``over_zero`` is looked up as a global on every call, so rebinding it
    between runs resets the counters without re-installing the hook.

    Args:
        idx: Row of ``over_zero`` to update (the layer index).

    Returns:
        A ``forward(self, x)`` function suitable for ``types.MethodType``.
    """
    def llama_forward(self, x):
        gate_up, _ = self.gate_up_proj(x)  # (..., 2 * i)
        half = gate_up.size(-1) // 2

        # First half is the gate, second half the "up" projection.
        # Ellipsis indexing keeps this valid for both (batch, seq, 2i) and
        # flattened (tokens, 2i) activations.
        gate = torch.nn.functional.silu(gate_up[..., :half])  # (..., i)

        # Collapse every leading dimension so the count is per neuron.
        fired = (gate.float() > 0).reshape(-1, half).sum(dim=0)
        over_zero[idx, :] += fired

        out, _ = self.down_proj(gate * gate_up[..., half:])
        return out
    return llama_forward

# ---------------------- Run All Configs ----------------------
# Upper bound on how many leading tokens of an IDs file are used.
token_cap = 99999744

# The vLLM engine is kept alive across consecutive configs that name the same
# model: building a second engine while the first is still referenced doubles
# the load time and can exhaust GPU memory.
model = None
loaded_model_name = None

for config in RUN_CONFIGS:
    model_name = config['model']
    lang = config['lang']
    ids_path = config['ids_path']

    print(f"\n=== Processing model: {model_name}, lang: {lang} ===")

    if model_name != loaded_model_name:
        if model is not None:
            # Best effort: drop the previous engine before allocating the next
            # one. Whether vLLM returns all of its GPU memory depends on the
            # vLLM version.
            model = None
            loaded_model_name = None
            gc.collect()
            torch.cuda.empty_cache()

        # Load model
        model = LLM(
            model=model_name,
            tensor_parallel_size=1,
            enforce_eager=True,
            trust_remote_code=True,
        )
        loaded_model_name = model_name

        model_config = model.llm_engine.model_config
        max_length = model_config.max_model_len
        num_layers = model_config.hf_config.num_hidden_layers
        intermediate_size = model_config.hf_config.intermediate_size

        # Hook MLP layers once per loaded engine. The hook reads the global
        # `over_zero` at call time, so it does not need re-installing when the
        # counter tensor is replaced below.
        layers = model.llm_engine.model_executor.driver_worker.model_runner.model.model.layers
        for layer_idx in range(num_layers):
            mlp = layers[layer_idx].mlp
            mlp.forward = MethodType(make_llama_hook(layer_idx), mlp)

    print(f"Layers: {num_layers}, Intermediate size: {intermediate_size}, Max length: {max_length}")

    # Fresh activation counters for this config (must stay a module-level
    # name: the hook updates it as a global).
    over_zero = torch.zeros(num_layers, intermediate_size, dtype=torch.int32, device='cuda')

    # Load input IDs
    # NOTE: torch.load unpickles the file; only point ids_path at trusted data.
    print("Loading IDs...")
    ids = torch.load(ids_path)
    print(f"ID shape: {ids.shape}")

    # Keep only whole sequences of max_length tokens, up to token_cap tokens.
    num_tokens = min(ids.size(0), token_cap) // max_length * max_length
    if num_tokens == 0:
        # reshape(-1, max_length) is ambiguous on an empty tensor and would
        # raise; there is nothing to run for this config.
        print(f"Skipping {ids_path}: fewer than {max_length} tokens available")
        continue

    input_ids = ids[:num_tokens].reshape(-1, max_length)
    print(f"Processing {input_ids.size(0)} sequences of length {max_length}")

    # Run inference; only the forward pass matters (the hooks do the
    # counting), so a single generated token is requested.
    print("Running inference...")
    _ = model.generate(
        prompt_token_ids=input_ids.tolist(),
        sampling_params=SamplingParams(max_tokens=1),
    )

    # Save results
    output_path = os.path.join(SAVE_FOLDER, f'activation.{lang}.train.{model_name.split("/")[-1]}')
    torch.save({
        'n': num_tokens,
        'over_zero': over_zero.cpu(),
        'num_layers': num_layers,
        'intermediate_size': intermediate_size,
    }, output_path)

    print(f"Saved activation counts to {output_path}")
    print(f"Processed {num_tokens} tokens total")
    print("Activation analysis complete!")