"""
Track per-neuron activations in Qwen2 MLP layers using Hugging Face Transformers
with explicit device management.
"""
|
|
| import argparse |
| import os |
| from types import MethodType |
|
|
| import torch |
| from torch import Tensor |
| from tqdm import tqdm |
| from transformers import AutoModelForCausalLM |
|
|
| |
class ActivationTracker:
    """Accumulate, per layer and per neuron, how many positions produced a
    positive gate activation (the neuron "fired") during forward passes."""

    def __init__(self, num_layers: int, intermediate_size: int):
        # One int32 counter per (layer, neuron); kept on CPU so the counters
        # never consume accelerator memory.
        self.over_zero = torch.zeros(
            num_layers, intermediate_size, dtype=torch.int32, device="cpu"
        )

    def make_qwen_hook(self, index: int):
        """Build a replacement ``forward`` for a Qwen2 MLP module.

        The returned function computes the usual SwiGLU output while also
        counting, into row ``index`` of ``over_zero``, how many positions had
        a strictly positive gate activation.
        """
        counters = self.over_zero

        def counting_forward(self, x: Tensor):
            # NOTE: ``self`` here is the MLP module this function gets bound to
            # (via MethodType), not the tracker.
            gated = self.act_fn(self.gate_proj(x))
            with torch.no_grad():
                # Sum the fired-mask over batch and sequence dims, then move
                # the small per-neuron vector to CPU before accumulating.
                fired = (gated > 0).sum(dim=(0, 1)).to("cpu")
                counters[index, :] += fired
            return self.down_proj(gated * self.up_proj(x))

        return counting_forward
|
|
| |
# ---- Command-line interface -------------------------------------------------
cli = argparse.ArgumentParser()
cli.add_argument("--model", type=str, required=True, help="HF model ID or local folder path")
cli.add_argument("--lang", type=str, required=True, help="Language code for dataset")
cli.add_argument("--data-path", type=str, required=True, help="Path to tokenized dataset (torch tensor)")
cli.add_argument("--output-dir", type=str, default="activations", help="Directory to save over_zero")
cli.add_argument("--batch-size", type=int, default=1, help="Batch size per device")
cli.add_argument("--chunk-size", type=int, default=4096, help="Max sequence length to process at once")
args = cli.parse_args()
|
|
| |
# ---- Runtime setup ----------------------------------------------------------
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")
os.makedirs(args.output_dir, exist_ok=True)

# Load the pre-tokenized corpus onto CPU.
# NOTE(review): assumed to be a flat 1-D tensor of token ids — the chunking
# step later reshapes it into (rows, chunk_size); confirm against the producer.
print("Loading data...")
ids = torch.load(args.data_path, map_location="cpu")
|
|
| |
# ---- Load the model in bf16, sharded automatically across visible devices ---
print(f"Loading model: {args.model}")
model = AutoModelForCausalLM.from_pretrained(
    args.model, device_map="auto", torch_dtype=torch.bfloat16
)
model.eval()

# Pull the dimensions the tracker and chunking need from the model config.
cfg = model.config
num_layers = cfg.num_hidden_layers
intermediate_size = cfg.intermediate_size
max_len = cfg.max_position_embeddings
|
|
| |
# ---- Install the counting hooks ---------------------------------------------
# One shared tracker; each decoder layer's MLP forward is replaced by a wrapper
# bound to that MLP module so ``self`` inside the hook is the module itself.
tracker = ActivationTracker(num_layers=num_layers, intermediate_size=intermediate_size)

for layer_idx, decoder_layer in enumerate(model.model.layers):
    hook = tracker.make_qwen_hook(layer_idx)
    decoder_layer.mlp.forward = MethodType(hook, decoder_layer.mlp)
|
|
| |
# ---- Chunk the flat token stream into fixed-length rows ----------------------
# Trailing tokens that do not fill a complete chunk are dropped.
chunk_size = min(args.chunk_size, max_len)
usable = (ids.size(0) // chunk_size) * chunk_size
input_ids = ids[:usable].reshape(-1, chunk_size)

print(f"Processing {input_ids.size(0)} sequences of length {chunk_size}")
|
|
| |
# ---- Forward passes (inference only; we only need the hook side effects) -----
with torch.no_grad():
    # Loop-invariant: under device_map="auto" inputs always enter on the device
    # holding the first parameters, so resolve it once instead of per batch.
    entry_device = next(model.parameters()).device
    for start in tqdm(range(0, input_ids.size(0), args.batch_size), desc="Processing", unit="batch"):
        batch = input_ids[start:start + args.batch_size].to(entry_device)

        if torch.cuda.is_available():
            # Return cached allocator blocks to CUDA so fragmentation cannot
            # OOM a later batch (costs a sync per batch; kept deliberately).
            torch.cuda.empty_cache()

        # Logits are discarded; use_cache=False skips building the KV cache,
        # which would only waste memory since no generation follows.
        model(input_ids=batch, use_cache=False)
|
|
| |
# ---- Persist the per-neuron counts ------------------------------------------
# rstrip("/") keeps basename() from returning "" for paths like ".../model/".
model_name = os.path.basename(args.model.rstrip("/"))
save_path = os.path.join(args.output_dir, f"activation_{model_name}_{args.lang}.pt")
torch.save(tracker.over_zero, save_path)
print(f"Saved activation counts to {save_path}")
print("Activation single job done")