Spaces:

NousResearch
/

cna-refusal-ablation

Running on L40S

App Files Files Community

GPU memory safety

by sk16er - opened about 9 hours ago

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+85

-84

Files changed (1) hide show

neuron_steer/core.py +85 -84

neuron_steer/core.py CHANGED Viewed

@@ -423,94 +423,95 @@ def compute_attribution(
         if hasattr(layer.mlp, "neuron_act"):
             layer.mlp.neuron_act = None
-    with torch.enable_grad():
-        outputs = model(input_ids)
-        logits = outputs.logits[0, position]  # [vocab_size]
-        target_logit = logits[target_token_id]
-        if target_only:
-            metric = target_logit
-        elif counterfactual_token_id is None:
-            sorted_logits, sorted_ids = logits.sort(descending=True)
-            if sorted_ids[0].item() == target_token_id:
-                counterfactual_logit = sorted_logits[1]
             else:
-                counterfactual_logit = sorted_logits[0]
-            metric = target_logit - counterfactual_logit
-        else:
-            counterfactual_logit = logits[counterfactual_token_id]
-            metric = target_logit - counterfactual_logit
-        # Backward through linearized model
-        metric.backward()
-    # Collect attributions from saved neuron activations
-    attributions = {}
-    layer_stats = {}  # diagnostic info
-    for i, layer in enumerate(_get_model_layers(model)):
-        if i in blacklist_layers:
-            continue
-        mlp = layer.mlp
-        if not hasattr(mlp, "neuron_act") or mlp.neuron_act is None:
-            continue
-        if mlp.neuron_act.grad is None:
-            continue
-        act = mlp.neuron_act.detach()   # [1, T, intermediate_size]
-        grad = mlp.neuron_act.grad      # [1, T, intermediate_size]
-        # Attribution = gradient * activation (element-wise)
-        attr = (grad * act)[0]  # [T, intermediate_size]
-        T = attr.shape[0]
-        # NaN-safe statistics (exclude NaN from sums)
-        valid_mask = ~torch.isnan(attr)
-        valid_attr = attr[valid_mask]
-        if valid_attr.numel() > 0:
-            layer_total = valid_attr.abs().sum().item()
-            layer_max = valid_attr.abs().max().item()
-            nan_frac = 1.0 - valid_mask.float().mean().item()
-        else:
-            layer_total = 0.0
-            layer_max = 0.0
-            nan_frac = 1.0
-        layer_stats[i] = {"total": layer_total, "max": layer_max, "nan_frac": nan_frac}
-        if last_n_positions is not None:
-            start_pos = max(0, T - last_n_positions)
-        elif filter_bos:
-            start_pos = 1
-        else:
-            start_pos = 0
-        for p in range(start_pos, T):
-            pos_attr = attr[p]
-            abs_attr = pos_attr.abs()
-            # NaN-safe topk: replace NaN with 0 so they don't crowd out valid values
-            nan_mask = torch.isnan(abs_attr)
-            if nan_mask.any():
-                abs_attr = abs_attr.clone()
-                abs_attr[nan_mask] = 0.0
-            # Keep top-k neurons at this position
-            k = min(top_k_per_layer, abs_attr.shape[0])
-            top_vals, top_idxs = abs_attr.topk(k)
-            for val, idx in zip(top_vals, top_idxs):
-                if val.item() > 1e-8:
-                    n = idx.item()
-                    if (i, n) in blacklist_neurons:
-                        continue
-                    nidx = NeuronIdx(layer=i, position=p, neuron=n)
-                    attributions[nidx] = pos_attr[idx].item()
-    # Free GPU memory - clear saved activations after collection
-    for layer in _get_model_layers(model):
-        if hasattr(layer.mlp, "neuron_act"):
-            layer.mlp.neuron_act = None
     if verbose:
         print(f"  Attribution distribution by layer:")

         if hasattr(layer.mlp, "neuron_act"):
             layer.mlp.neuron_act = None
+    try:
+        with torch.enable_grad():
+            outputs = model(input_ids)
+            logits = outputs.logits[0, position]  # [vocab_size]
+            target_logit = logits[target_token_id]
+            if target_only:
+                metric = target_logit
+            elif counterfactual_token_id is None:
+                sorted_logits, sorted_ids = logits.sort(descending=True)
+                if sorted_ids[0].item() == target_token_id:
+                    counterfactual_logit = sorted_logits[1]
+                else:
+                    counterfactual_logit = sorted_logits[0]
+                metric = target_logit - counterfactual_logit
             else:
+                counterfactual_logit = logits[counterfactual_token_id]
+                metric = target_logit - counterfactual_logit
+            # Backward through linearized model
+            metric.backward()
+        # Collect attributions from saved neuron activations
+        attributions = {}
+        layer_stats = {}  # diagnostic info
+        for i, layer in enumerate(_get_model_layers(model)):
+            if i in blacklist_layers:
+                continue
+            mlp = layer.mlp
+            if not hasattr(mlp, "neuron_act") or mlp.neuron_act is None:
+                continue
+            if mlp.neuron_act.grad is None:
+                continue
+            act = mlp.neuron_act.detach()   # [1, T, intermediate_size]
+            grad = mlp.neuron_act.grad      # [1, T, intermediate_size]
+            # Attribution = gradient * activation (element-wise)
+            attr = (grad * act)[0]  # [T, intermediate_size]
+            T = attr.shape[0]
+            # NaN-safe statistics (exclude NaN from sums)
+            valid_mask = ~torch.isnan(attr)
+            valid_attr = attr[valid_mask]
+            if valid_attr.numel() > 0:
+                layer_total = valid_attr.abs().sum().item()
+                layer_max = valid_attr.abs().max().item()
+                nan_frac = 1.0 - valid_mask.float().mean().item()
+            else:
+                layer_total = 0.0
+                layer_max = 0.0
+                nan_frac = 1.0
+            layer_stats[i] = {"total": layer_total, "max": layer_max, "nan_frac": nan_frac}
+            if last_n_positions is not None:
+                start_pos = max(0, T - last_n_positions)
+            elif filter_bos:
+                start_pos = 1
+            else:
+                start_pos = 0
+            for p in range(start_pos, T):
+                pos_attr = attr[p]
+                abs_attr = pos_attr.abs()
+                # NaN-safe topk: replace NaN with 0 so they don't crowd out valid values
+                nan_mask = torch.isnan(abs_attr)
+                if nan_mask.any():
+                    abs_attr = abs_attr.clone()
+                    abs_attr[nan_mask] = 0.0
+                # Keep top-k neurons at this position
+                k = min(top_k_per_layer, abs_attr.shape[0])
+                top_vals, top_idxs = abs_attr.topk(k)
+                for val, idx in zip(top_vals, top_idxs):
+                    if val.item() > 1e-8:
+                        n = idx.item()
+                        if (i, n) in blacklist_neurons:
+                            continue
+                        nidx = NeuronIdx(layer=i, position=p, neuron=n)
+                        attributions[nidx] = pos_attr[idx].item()
+    finally:
+        # Free GPU memory - clear saved activations after collection
+        for layer in _get_model_layers(model):
+            if hasattr(layer.mlp, "neuron_act"):
+                layer.mlp.neuron_act = None
     if verbose:
         print(f"  Attribution distribution by layer:")