"""
Track per-neuron activations in Qwen2 MLP layers using Hugging Face Transformers
with explicit device management.
"""
|
|
| import argparse |
| import os |
| from types import MethodType |
|
|
| import torch |
| from torch import Tensor |
| from tqdm import tqdm |
| from transformers import AutoModelForCausalLM |
|
|
| |
class ActivationTracker:
    """Accumulate, per layer and per neuron, how many positions produced a
    positive gate activation (the neuron "fired") during forward passes."""

    def __init__(self, num_layers: int, intermediate_size: int):
        # One int32 counter per (layer, neuron); kept on CPU so the counters
        # never consume accelerator memory.
        self.over_zero = torch.zeros(
            num_layers, intermediate_size, dtype=torch.int32, device="cpu"
        )

    def make_qwen_hook(self, index: int):
        """Build a replacement ``forward`` for a Qwen2 MLP module.

        The returned function computes the usual SwiGLU output while also
        counting, into row ``index`` of ``over_zero``, how many positions had
        a strictly positive gate activation.
        """
        counters = self.over_zero

        def counting_forward(self, x: Tensor):
            # NOTE: ``self`` here is the MLP module this function gets bound to
            # (via MethodType), not the tracker.
            gated = self.act_fn(self.gate_proj(x))
            with torch.no_grad():
                # Sum the fired-mask over batch and sequence dims, then move
                # the small per-neuron vector to CPU before accumulating.
                fired = (gated > 0).sum(dim=(0, 1)).to("cpu")
                counters[index, :] += fired
            return self.down_proj(gated * self.up_proj(x))

        return counting_forward
|
|
| |
# ---- Command-line interface -------------------------------------------------
cli = argparse.ArgumentParser()
cli.add_argument("--model", type=str, required=True, help="HF model ID or local folder path")
cli.add_argument("--lang", type=str, required=True, help="Language code for dataset")
cli.add_argument("--data-path", type=str, required=True, help="Path to tokenized dataset (torch tensor)")
cli.add_argument("--output-dir", type=str, default="activations", help="Directory to save over_zero")
cli.add_argument("--batch-size", type=int, default=1, help="Batch size per device")
cli.add_argument("--chunk-size", type=int, default=4096, help="Max sequence length to process at once")
args = cli.parse_args()
|
|
| |
# ---- Runtime setup ----------------------------------------------------------
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")
os.makedirs(args.output_dir, exist_ok=True)

# Load the pre-tokenized corpus onto CPU.
# NOTE(review): assumed to be a flat 1-D tensor of token ids — the chunking
# step later reshapes it into (rows, chunk_size); confirm against the producer.
print("Loading data...")
ids = torch.load(args.data_path, map_location="cpu")
|
|
| |
# ---- Load the model in bf16, sharded automatically across visible devices ---
print(f"Loading model: {args.model}")
model = AutoModelForCausalLM.from_pretrained(
    args.model, device_map="auto", torch_dtype=torch.bfloat16
)
model.eval()

# Pull the dimensions the tracker and chunking need from the model config.
cfg = model.config
num_layers = cfg.num_hidden_layers
intermediate_size = cfg.intermediate_size
max_len = cfg.max_position_embeddings
|
|
| |
# ---- Install the counting hooks ---------------------------------------------
# One shared tracker; each decoder layer's MLP forward is replaced by a wrapper
# bound to that MLP module so ``self`` inside the hook is the module itself.
tracker = ActivationTracker(num_layers=num_layers, intermediate_size=intermediate_size)

for layer_idx, decoder_layer in enumerate(model.model.layers):
    hook = tracker.make_qwen_hook(layer_idx)
    decoder_layer.mlp.forward = MethodType(hook, decoder_layer.mlp)
|
|
| |
# ---- Chunk the flat token stream into fixed-length rows ----------------------
# Trailing tokens that do not fill a complete chunk are dropped.
chunk_size = min(args.chunk_size, max_len)
usable = (ids.size(0) // chunk_size) * chunk_size
input_ids = ids[:usable].reshape(-1, chunk_size)

print(f"Processing {input_ids.size(0)} sequences of length {chunk_size}")
|
|
| |
# ---- Forward passes (inference only; we only need the hook side effects) -----
with torch.no_grad():
    # Loop-invariant: under device_map="auto" inputs always enter on the device
    # holding the first parameters, so resolve it once instead of per batch.
    entry_device = next(model.parameters()).device
    for start in tqdm(range(0, input_ids.size(0), args.batch_size), desc="Processing", unit="batch"):
        batch = input_ids[start:start + args.batch_size].to(entry_device)

        if torch.cuda.is_available():
            # Return cached allocator blocks to CUDA so fragmentation cannot
            # OOM a later batch (costs a sync per batch; kept deliberately).
            torch.cuda.empty_cache()

        # Logits are discarded; use_cache=False skips building the KV cache,
        # which would only waste memory since no generation follows.
        model(input_ids=batch, use_cache=False)
|
|
| |
# ---- Persist the per-neuron counts ------------------------------------------
# rstrip("/") keeps basename() from returning "" for paths like ".../model/".
model_name = os.path.basename(args.model.rstrip("/"))
save_path = os.path.join(args.output_dir, f"activation_{model_name}_{args.lang}.pt")
torch.save(tracker.over_zero, save_path)
print(f"Saved activation counts to {save_path}")
print("Activation single job done")