| |
| """ |
| Simple per-neuron activation tracker for Qwen2 MLP layers using vLLM. |
| """ |
|
|
| import argparse |
| from types import MethodType |
| import torch |
| import os |
| from vllm import LLM, SamplingParams |
|
|
| |
# CLI: which model to probe, which language's token-id dataset to run,
# and where to write the activation counts.
parser = argparse.ArgumentParser()
parser.add_argument("-m", "--model", type=str, required=True, help="HF model ID or local folder path")
parser.add_argument("-l", "--lang", type=str, required=True, help="Language code for dataset")
parser.add_argument("--save-folder", type=str, default="data", help="Folder to save activation results")
args = parser.parse_args()

# Ensure the output directory exists before any long-running work starts.
os.makedirs(args.save_folder, exist_ok=True)
|
|
| |
# Build the vLLM engine. enforce_eager=True disables CUDA-graph capture so
# the per-layer forward replacements installed below are actually executed
# on every call instead of a captured graph.
model = LLM(
    model=args.model,
    tensor_parallel_size=1,
    enforce_eager=True,
    trust_remote_code=True
)

# Read the dimensions we need from the engine's resolved configuration.
model_config = model.llm_engine.model_config
hf_config = model_config.hf_config
max_length = model_config.max_model_len
num_layers = hf_config.num_hidden_layers
intermediate_size = hf_config.intermediate_size

print(f"Model: {args.model}")
print(f"Layers: {num_layers}, Intermediate size: {intermediate_size}, Max length: {max_length}")

# Running count, per (layer, neuron), of tokens whose gate activation was
# positive. Lives on the GPU so the hooks can accumulate without transfers.
over_zero = torch.zeros(num_layers, intermediate_size, dtype=torch.int32, device='cuda')
|
|
def make_qwen_hook(idx, counter=None):
    """Build a replacement ``forward`` for a Qwen2 MLP layer.

    The returned function reproduces the standard Qwen2 SwiGLU MLP
    computation, ``down_proj(silu(gate) * up)``, while counting — per
    neuron — how many tokens produced a positive gate activation.

    Args:
        idx: Row of the counter tensor to accumulate into (the layer index).
        counter: Optional ``(num_layers, intermediate_size)`` integer tensor
            to accumulate counts into. Defaults to the module-level
            ``over_zero`` (the original behavior); injectable for testing.

    Returns:
        A function ``(self, x) -> Tensor`` suitable for binding as
        ``mlp.forward`` via ``types.MethodType``.
    """
    def qwen_forward(self, x):
        # x: (num_tokens, hidden_size)
        target = over_zero if counter is None else counter

        # Fused gate/up projection: the last dim is [gate | up].
        gate_up, _ = self.gate_up_proj(x)
        half = gate_up.size(-1) // 2
        gate = gate_up[..., :half]
        up = gate_up[..., half:]

        gate_activation = torch.nn.functional.silu(gate)

        # Count positive activations per neuron across all tokens.
        # A bool tensor's .sum() yields int64; cast explicitly so the
        # in-place add into the int32 counter cannot fail with a
        # "result type Long can't be cast to Int" promotion error.
        target[idx, :] += (gate_activation > 0).sum(dim=0).to(target.dtype)

        # Finish the MLP: elementwise gating, then the down projection.
        out, _ = self.down_proj(gate_activation * up)
        return out

    return qwen_forward
|
|
| |
print("Setting up activation hooks...")
# Rebind every decoder layer's MLP forward to the counting wrapper.
# (The driver_worker access path assumes tensor_parallel_size == 1.)
for layer_idx in range(num_layers):
    mlp = model.llm_engine.model_executor.driver_worker.model_runner.model.model.layers[layer_idx].mlp
    mlp.forward = MethodType(make_qwen_hook(layer_idx), mlp)

print("Loading ids...")
# NOTE(review): path hard-codes "qwen2.5-0.5" regardless of --model — confirm intended.
# SECURITY: torch.load unpickles arbitrary objects; only load trusted files.
ids = torch.load(f'../ids/qwen2.5-0.5/id.{args.lang}.train.qwen2.5-0.5')
print(f"ID shape {ids.shape}")

# Cap the flat token stream (~100M tokens), round down to a multiple of
# max_length, then fold it into full-length sequences.
l = min(ids.size(0), 99999744) // max_length * max_length
input_ids = ids[:l].reshape(-1, max_length)
print(f"Processing {input_ids.size(0)} sequences of length {max_length}")

print("Running inference...")
# max_tokens=1: only the prefill pass over the prompts is needed to fire
# the hooks; generation output itself is discarded.
output = model.generate(
    prompt_token_ids=input_ids.tolist(),
    sampling_params=SamplingParams(max_tokens=1)
)
|
|
| |
# Bundle the counts (moved back to CPU) with the run's dimensions so the
# saved file is self-describing for downstream analysis.
result = {
    'n': l,
    'over_zero': over_zero.cpu(),
    'num_layers': num_layers,
    'intermediate_size': intermediate_size,
}

output_path = os.path.join(args.save_folder, f'activation.{args.lang}.train.qwen-{args.model.split("/")[-1]}')
torch.save(result, output_path)

print(f"Saved activation counts to {output_path}")
print(f"Processed {l} tokens total")
print("Activation analysis complete!")
|
|