"""
Simple per-neuron activation tracker for gated-MLP (LLaMA-2 / Qwen2.5 style) layers.
Runs on a fixed set of models and input IDs.
"""
|
|
| import torch |
| import os |
| from types import MethodType |
| from vllm import LLM, SamplingParams |
|
|
| |
# One entry per (model, token-id file, language) run to process.
# 'ids_path' is loaded with torch.load and treated as a 1-D tensor of
# token ids (it is sliced by ids.size(0) and reshaped to sequences below).
RUN_CONFIGS = [
    {
        'model': 'qwen2.5-0.5',
        'ids_path': '../ids/qwen2.5-0.5/id.en.train.qwen2.5-0.5',
        'lang': 'en'
    },
    {
        'model': 'qwen2.5-0.5',
        'ids_path': '../ids/qwen2.5-0.5/id.de.train.qwen2.5-0.5',
        'lang': 'de'
    },
]

# Output directory for the per-language activation-count files.
SAVE_FOLDER = "data"
os.makedirs(SAVE_FOLDER, exist_ok=True)
|
|
| |
def make_llama_hook(idx):
    """Build a replacement ``forward`` for a gated (SwiGLU-style) MLP block.

    The returned function performs the normal MLP forward pass while
    accumulating, into the module-level ``over_zero`` tensor, how many
    positions produced a positive gate activation per neuron for layer
    ``idx``.

    NOTE(review): the dim=(0, 1) reduction assumes ``gate_up`` is 3-D
    (batch, seq, 2*intermediate); confirm against the installed vLLM
    version, which may hand the MLP a flattened 2-D token tensor.
    """
    def llama_forward(self, x):
        # Fused projection: first half of the last dim is the gate,
        # second half is the "up" branch.
        gate_up, _ = self.gate_up_proj(x)
        half = gate_up.size(-1) // 2
        # Functional SiLU avoids building a fresh nn.SiLU module per call.
        gate = torch.nn.functional.silu(gate_up[..., :half])
        # SiLU(x) > 0  <=>  x > 0, so counting after the nonlinearity
        # matches counting positive pre-activations.
        over_zero[idx, :] += (gate.float() > 0).sum(dim=(0, 1))
        out = gate * gate_up[..., half:]
        out, _ = self.down_proj(out)
        return out
    return llama_forward
|
|
| |
# Cap on the number of input tokens consumed per (model, lang) run.
MAX_TOKENS_PER_RUN = 99999744

for config in RUN_CONFIGS:
    model_name = config['model']
    lang = config['lang']
    ids_path = config['ids_path']

    print(f"\n=== Processing model: {model_name}, lang: {lang} ===")

    # enforce_eager so the monkey-patched Python MLP forward actually runs
    # (CUDA-graph execution could bypass the hook).
    model = LLM(
        model=model_name,
        tensor_parallel_size=1,
        enforce_eager=True,
        trust_remote_code=True
    )

    # Hoist the repeated model_config attribute chain.
    model_config = model.llm_engine.model_config
    max_length = model_config.max_model_len
    num_layers = model_config.hf_config.num_hidden_layers
    intermediate_size = model_config.hf_config.intermediate_size

    print(f"Layers: {num_layers}, Intermediate size: {intermediate_size}, Max length: {max_length}")

    # Global accumulator read by the patched forward (see make_llama_hook):
    # over_zero[layer, neuron] = count of positive activations.
    over_zero = torch.zeros(num_layers, intermediate_size, dtype=torch.int32).to('cuda')

    # Monkey-patch every layer's MLP forward with the counting hook.
    # NOTE(review): this attribute path is vLLM-version-specific — confirm
    # against the installed vLLM release.
    for i in range(num_layers):
        mlp = model.llm_engine.model_executor.driver_worker.model_runner.model.model.layers[i].mlp
        mlp.forward = MethodType(make_llama_hook(i), mlp)

    print("Loading IDs...")
    ids = torch.load(ids_path)
    print(f"ID shape: {ids.shape}")

    # Truncate to a whole number of max_length-sized sequences.
    l = ids.size(0)
    l = min(l, MAX_TOKENS_PER_RUN) // max_length * max_length
    if l == 0:
        # Fewer tokens than one full sequence: nothing to process.
        print(f"Skipping {ids_path}: only {ids.size(0)} tokens (< {max_length})")
        continue
    input_ids = ids[:l].reshape(-1, max_length)
    print(f"Processing {input_ids.size(0)} sequences of length {max_length}")

    print("Running inference...")
    # Only the forward passes matter; max_tokens=1 minimizes generation work.
    _ = model.generate(
        prompt_token_ids=input_ids.tolist(),
        sampling_params=SamplingParams(max_tokens=1)
    )

    output_path = os.path.join(SAVE_FOLDER, f'activation.{lang}.train.{model_name.split("/")[-1]}')
    torch.save({
        'n': l,
        'over_zero': over_zero.cpu(),
        'num_layers': num_layers,
        'intermediate_size': intermediate_size
    }, output_path)

    print(f"Saved activation counts to {output_path}")
    print(f"Processed {l} tokens total")

    # Release this engine's GPU memory before loading the next model;
    # instantiating a new LLM() while the old one is alive can OOM.
    del model
    torch.cuda.empty_cache()

print("Activation analysis complete!")
|
|