Spaces:
Running on Zero
Running on Zero
Upload 135 files
Browse files- app.py +15 -4
- obliteratus/abliterate.py +100 -15
- obliteratus/analysis/leace.py +239 -0
- obliteratus/bayesian_optimizer.py +147 -90
- obliteratus/cli.py +7 -1
- obliteratus/informed_pipeline.py +380 -56
- scripts/run_benchmark_remote.sh +10 -4
- tests/test_informed_pipeline.py +38 -12
- tests/test_leace.py +230 -0
app.py
CHANGED
|
@@ -316,6 +316,7 @@ def _get_preset_defaults(method_display: str):
|
|
| 316 |
cfg = _PRESET_CONFIGS.get(method_key, _PRESET_CONFIGS["advanced"])
|
| 317 |
return {
|
| 318 |
"n_directions": cfg.get("n_directions", 4),
|
|
|
|
| 319 |
"regularization": cfg.get("regularization", 0.3),
|
| 320 |
"refinement_passes": cfg.get("refinement_passes", 2),
|
| 321 |
"norm_preserve": cfg.get("norm_preserve", True),
|
|
@@ -348,6 +349,7 @@ def _on_method_change(method_display: str):
|
|
| 348 |
d = _get_preset_defaults(method_display)
|
| 349 |
return (
|
| 350 |
d["n_directions"],
|
|
|
|
| 351 |
d["regularization"],
|
| 352 |
d["refinement_passes"],
|
| 353 |
d["reflection_strength"],
|
|
@@ -1731,8 +1733,9 @@ def _format_multi_model_results(results: list[dict], context: dict | None = None
|
|
| 1731 |
def obliterate(model_choice: str, method_choice: str,
|
| 1732 |
prompt_volume_choice: str, dataset_source_choice: str,
|
| 1733 |
custom_harmful: str, custom_harmless: str,
|
| 1734 |
-
# Advanced params (sliders)
|
| 1735 |
-
adv_n_directions: int,
|
|
|
|
| 1736 |
adv_refinement_passes: int, adv_reflection_strength: float,
|
| 1737 |
adv_embed_regularization: float, adv_steering_strength: float,
|
| 1738 |
adv_transplant_blend: float,
|
|
@@ -1906,6 +1909,7 @@ def obliterate(model_choice: str, method_choice: str,
|
|
| 1906 |
on_log=on_log,
|
| 1907 |
# Advanced overrides from UI
|
| 1908 |
n_directions=int(adv_n_directions),
|
|
|
|
| 1909 |
regularization=float(adv_regularization),
|
| 1910 |
refinement_passes=int(adv_refinement_passes),
|
| 1911 |
norm_preserve=adv_norm_preserve,
|
|
@@ -3930,7 +3934,13 @@ with gr.Blocks(theme=THEME, css=CSS, js=_JS, title="OBLITERATUS", fill_height=Tr
|
|
| 3930 |
with gr.Row():
|
| 3931 |
adv_n_directions = gr.Slider(
|
| 3932 |
1, 8, value=_defaults["n_directions"], step=1,
|
| 3933 |
-
label="Directions", info="Number of refusal directions to extract
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3934 |
)
|
| 3935 |
adv_regularization = gr.Slider(
|
| 3936 |
0.0, 1.0, value=_defaults["regularization"], step=0.05,
|
|
@@ -3999,7 +4009,8 @@ with gr.Blocks(theme=THEME, css=CSS, js=_JS, title="OBLITERATUS", fill_height=Tr
|
|
| 3999 |
|
| 4000 |
# List of all advanced controls (order must match _on_method_change return)
|
| 4001 |
_adv_controls = [
|
| 4002 |
-
adv_n_directions,
|
|
|
|
| 4003 |
adv_reflection_strength, adv_embed_regularization,
|
| 4004 |
adv_steering_strength, adv_transplant_blend,
|
| 4005 |
adv_spectral_bands, adv_spectral_threshold,
|
|
|
|
| 316 |
cfg = _PRESET_CONFIGS.get(method_key, _PRESET_CONFIGS["advanced"])
|
| 317 |
return {
|
| 318 |
"n_directions": cfg.get("n_directions", 4),
|
| 319 |
+
"direction_method": cfg.get("direction_method", "svd"),
|
| 320 |
"regularization": cfg.get("regularization", 0.3),
|
| 321 |
"refinement_passes": cfg.get("refinement_passes", 2),
|
| 322 |
"norm_preserve": cfg.get("norm_preserve", True),
|
|
|
|
| 349 |
d = _get_preset_defaults(method_display)
|
| 350 |
return (
|
| 351 |
d["n_directions"],
|
| 352 |
+
d["direction_method"],
|
| 353 |
d["regularization"],
|
| 354 |
d["refinement_passes"],
|
| 355 |
d["reflection_strength"],
|
|
|
|
| 1733 |
def obliterate(model_choice: str, method_choice: str,
|
| 1734 |
prompt_volume_choice: str, dataset_source_choice: str,
|
| 1735 |
custom_harmful: str, custom_harmless: str,
|
| 1736 |
+
# Advanced params (sliders + radio)
|
| 1737 |
+
adv_n_directions: int, adv_direction_method: str,
|
| 1738 |
+
adv_regularization: float,
|
| 1739 |
adv_refinement_passes: int, adv_reflection_strength: float,
|
| 1740 |
adv_embed_regularization: float, adv_steering_strength: float,
|
| 1741 |
adv_transplant_blend: float,
|
|
|
|
| 1909 |
on_log=on_log,
|
| 1910 |
# Advanced overrides from UI
|
| 1911 |
n_directions=int(adv_n_directions),
|
| 1912 |
+
direction_method=adv_direction_method,
|
| 1913 |
regularization=float(adv_regularization),
|
| 1914 |
refinement_passes=int(adv_refinement_passes),
|
| 1915 |
norm_preserve=adv_norm_preserve,
|
|
|
|
| 3934 |
with gr.Row():
|
| 3935 |
adv_n_directions = gr.Slider(
|
| 3936 |
1, 8, value=_defaults["n_directions"], step=1,
|
| 3937 |
+
label="Directions", info="Number of refusal directions to extract",
|
| 3938 |
+
)
|
| 3939 |
+
adv_direction_method = gr.Radio(
|
| 3940 |
+
choices=["diff_means", "svd", "leace"],
|
| 3941 |
+
value=_defaults["direction_method"],
|
| 3942 |
+
label="Direction Method",
|
| 3943 |
+
info="diff_means: simple & robust, svd: multi-direction, leace: optimal erasure",
|
| 3944 |
)
|
| 3945 |
adv_regularization = gr.Slider(
|
| 3946 |
0.0, 1.0, value=_defaults["regularization"], step=0.05,
|
|
|
|
| 4009 |
|
| 4010 |
# List of all advanced controls (order must match _on_method_change return)
|
| 4011 |
_adv_controls = [
|
| 4012 |
+
adv_n_directions, adv_direction_method,
|
| 4013 |
+
adv_regularization, adv_refinement_passes,
|
| 4014 |
adv_reflection_strength, adv_embed_regularization,
|
| 4015 |
adv_steering_strength, adv_transplant_blend,
|
| 4016 |
adv_spectral_bands, adv_spectral_threshold,
|
obliteratus/abliterate.py
CHANGED
|
@@ -63,6 +63,7 @@ METHODS = {
|
|
| 63 |
"label": "Basic (Arditi et al.)",
|
| 64 |
"description": "Single refusal direction via difference-in-means",
|
| 65 |
"n_directions": 1,
|
|
|
|
| 66 |
"norm_preserve": False,
|
| 67 |
"regularization": 0.0,
|
| 68 |
"refinement_passes": 1,
|
|
@@ -75,6 +76,7 @@ METHODS = {
|
|
| 75 |
"label": "Advanced (Multi-direction + Norm-preserving)",
|
| 76 |
"description": "SVD-based multi-direction extraction with norm preservation",
|
| 77 |
"n_directions": 4,
|
|
|
|
| 78 |
"norm_preserve": True,
|
| 79 |
"regularization": 0.3,
|
| 80 |
"embed_regularization": 0.5,
|
|
@@ -97,6 +99,7 @@ METHODS = {
|
|
| 97 |
"Zero regularization for maximum refusal removal."
|
| 98 |
),
|
| 99 |
"n_directions": 8,
|
|
|
|
| 100 |
"norm_preserve": True,
|
| 101 |
"regularization": 0.0,
|
| 102 |
"refinement_passes": 3,
|
|
@@ -124,6 +127,7 @@ METHODS = {
|
|
| 124 |
"separating trained-in refusal patterns from per-layer artifacts."
|
| 125 |
),
|
| 126 |
"n_directions": 6,
|
|
|
|
| 127 |
"norm_preserve": True,
|
| 128 |
"regularization": 0.0,
|
| 129 |
"refinement_passes": 2,
|
|
@@ -146,25 +150,31 @@ METHODS = {
|
|
| 146 |
"Uses InformedAbliterationPipeline for the full feedback loop. "
|
| 147 |
"Auto-detects alignment method (DPO/RLHF/CAI/SFT), maps concept "
|
| 148 |
"cone geometry, performs cluster-aware layer selection, and gates "
|
| 149 |
-
"projection by safety-capability entanglement.
|
| 150 |
-
"
|
| 151 |
-
"
|
| 152 |
),
|
| 153 |
-
"n_directions":
|
|
|
|
| 154 |
"norm_preserve": True,
|
| 155 |
"regularization": 0.0,
|
| 156 |
"refinement_passes": 2,
|
| 157 |
"project_biases": True,
|
| 158 |
"use_chat_template": True,
|
| 159 |
-
"use_whitened_svd":
|
| 160 |
"true_iterative_refinement": True,
|
| 161 |
"use_jailbreak_contrast": False,
|
| 162 |
-
"layer_adaptive_strength":
|
| 163 |
"safety_neuron_masking": False,
|
| 164 |
"per_expert_directions": False,
|
| 165 |
"attention_head_surgery": False,
|
| 166 |
"use_sae_features": False,
|
| 167 |
-
"use_wasserstein_optimal":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
},
|
| 169 |
"surgical": {
|
| 170 |
"label": "Surgical (Full SOTA MoE-Aware)",
|
|
@@ -176,6 +186,7 @@ METHODS = {
|
|
| 176 |
"minimizing capability damage via precision targeting."
|
| 177 |
),
|
| 178 |
"n_directions": 8,
|
|
|
|
| 179 |
"norm_preserve": True,
|
| 180 |
"regularization": 0.0,
|
| 181 |
"refinement_passes": 2,
|
|
@@ -204,6 +215,7 @@ METHODS = {
|
|
| 204 |
"techniques plus the inversion layer."
|
| 205 |
),
|
| 206 |
"n_directions": 8,
|
|
|
|
| 207 |
"norm_preserve": True,
|
| 208 |
"regularization": 0.0,
|
| 209 |
"refinement_passes": 2,
|
|
@@ -234,6 +246,7 @@ METHODS = {
|
|
| 234 |
"Best for maximizing quality when compute budget allows ~50 trials."
|
| 235 |
),
|
| 236 |
"n_directions": 4,
|
|
|
|
| 237 |
"norm_preserve": True,
|
| 238 |
"regularization": 0.0,
|
| 239 |
"refinement_passes": 1,
|
|
@@ -275,6 +288,7 @@ METHODS = {
|
|
| 275 |
"runtime overhead except lightweight steering hooks."
|
| 276 |
),
|
| 277 |
"n_directions": 4,
|
|
|
|
| 278 |
"norm_preserve": True,
|
| 279 |
"regularization": 0.0,
|
| 280 |
"refinement_passes": 2,
|
|
@@ -320,6 +334,7 @@ METHODS = {
|
|
| 320 |
"abliterated models were created with."
|
| 321 |
),
|
| 322 |
"n_directions": 1,
|
|
|
|
| 323 |
"norm_preserve": False,
|
| 324 |
"regularization": 0.0,
|
| 325 |
"refinement_passes": 1,
|
|
@@ -347,6 +362,7 @@ METHODS = {
|
|
| 347 |
"whitened SVD, no iterative refinement."
|
| 348 |
),
|
| 349 |
"n_directions": 4,
|
|
|
|
| 350 |
"norm_preserve": False,
|
| 351 |
# Ridge alpha=0.3 → effective reg = alpha/(1+alpha) = 0.3/1.3 ≈ 0.231
|
| 352 |
# For orthonormal V: P_V^alpha = 1/(1+alpha) * VV^T = 0.769 * VV^T
|
|
@@ -379,6 +395,7 @@ METHODS = {
|
|
| 379 |
"over the (refusal_rate, KL_divergence) frontier."
|
| 380 |
),
|
| 381 |
"n_directions": 2,
|
|
|
|
| 382 |
"norm_preserve": True,
|
| 383 |
"regularization": 0.0,
|
| 384 |
"refinement_passes": 1,
|
|
@@ -414,6 +431,7 @@ METHODS = {
|
|
| 414 |
"boundary rather than the statistical activation difference."
|
| 415 |
),
|
| 416 |
"n_directions": 4,
|
|
|
|
| 417 |
"norm_preserve": True,
|
| 418 |
"regularization": 0.0,
|
| 419 |
"refinement_passes": 1,
|
|
@@ -566,6 +584,7 @@ class AbliterationPipeline:
|
|
| 566 |
hub_token: str | None = None,
|
| 567 |
hub_community_org: str | None = None,
|
| 568 |
n_directions: int | None = None,
|
|
|
|
| 569 |
norm_preserve: bool | None = None,
|
| 570 |
regularization: float | None = None,
|
| 571 |
refinement_passes: int | None = None,
|
|
@@ -659,6 +678,7 @@ class AbliterationPipeline:
|
|
| 659 |
method_cfg = METHODS[method]
|
| 660 |
self.method = method
|
| 661 |
self.n_directions = n_directions if n_directions is not None else method_cfg["n_directions"]
|
|
|
|
| 662 |
self.norm_preserve = norm_preserve if norm_preserve is not None else method_cfg["norm_preserve"]
|
| 663 |
self.regularization = regularization if regularization is not None else method_cfg["regularization"]
|
| 664 |
self.refinement_passes = refinement_passes if refinement_passes is not None else method_cfg["refinement_passes"]
|
|
@@ -936,7 +956,7 @@ class AbliterationPipeline:
|
|
| 936 |
self.log(f"Loading model: {self.model_name}")
|
| 937 |
self.log(f"Device: {self.device} | Dtype: {self.dtype}")
|
| 938 |
self.log(f"Method: {method_label}")
|
| 939 |
-
self.log(f" Directions: {self.n_directions} | Norm-preserve: {self.norm_preserve}")
|
| 940 |
self.log(f" Regularization: {self.regularization} | Refinement passes: {self.refinement_passes}")
|
| 941 |
|
| 942 |
self.handle = load_model(
|
|
@@ -1400,18 +1420,26 @@ class AbliterationPipeline:
|
|
| 1400 |
else:
|
| 1401 |
max_length = 384 if collect_multi_pos else 256
|
| 1402 |
free_gb = dev.get_total_free_gb()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1403 |
if dev.is_gpu_available():
|
| 1404 |
-
if self.max_seq_length is None and free_gb <
|
| 1405 |
max_length = 64
|
| 1406 |
-
self.log(f" Low GPU memory ({free_gb:.1f} GB free), using max_length={max_length}")
|
| 1407 |
-
elif self.max_seq_length is None and free_gb <
|
| 1408 |
max_length = 128
|
| 1409 |
-
self.log(f" Tight GPU memory ({free_gb:.1f} GB free), using max_length={max_length}")
|
| 1410 |
|
| 1411 |
device = self._get_model_device(model)
|
| 1412 |
|
| 1413 |
# Batch prompts for throughput — hooks unbatch per-prompt activations
|
| 1414 |
-
batch_size = 16 if free_gb >
|
| 1415 |
# Left-pad so position -1 is always the last real token in every batch element
|
| 1416 |
orig_padding_side = getattr(tokenizer, "padding_side", "right")
|
| 1417 |
if batch_size > 1:
|
|
@@ -1498,9 +1526,16 @@ class AbliterationPipeline:
|
|
| 1498 |
wasserstein_extractor = WassersteinOptimalExtractor()
|
| 1499 |
self.log("Using Wasserstein-optimal direction extraction (cost-minimizing GEP)")
|
| 1500 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1501 |
# Optionally use whitened SVD for cleaner direction extraction
|
| 1502 |
whitened_extractor = None
|
| 1503 |
-
if self.use_whitened_svd and n_dirs > 1 and not self.use_wasserstein_optimal:
|
| 1504 |
from obliteratus.analysis.whitened_svd import WhitenedSVDExtractor
|
| 1505 |
whitened_extractor = WhitenedSVDExtractor()
|
| 1506 |
self.log("Using whitened SVD (covariance-normalized) for direction extraction")
|
|
@@ -1547,6 +1582,30 @@ class AbliterationPipeline:
|
|
| 1547 |
if idx < 5:
|
| 1548 |
self.log(f" layer {idx}: Wasserstein extraction failed ({e}), falling back to SVD")
|
| 1549 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1550 |
if n_dirs == 1:
|
| 1551 |
# Classic single-direction: difference-in-means
|
| 1552 |
diff = (self._harmful_means[idx] - self._harmless_means[idx]).squeeze(0)
|
|
@@ -3589,9 +3648,18 @@ class AbliterationPipeline:
|
|
| 3589 |
except Exception:
|
| 3590 |
pass
|
| 3591 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3592 |
# Use whitened SVD when enabled (matching main _distill)
|
| 3593 |
whitened_extractor = None
|
| 3594 |
-
if self.use_whitened_svd and n_dirs > 1 and wasserstein_extractor is None:
|
| 3595 |
from obliteratus.analysis.whitened_svd import WhitenedSVDExtractor
|
| 3596 |
whitened_extractor = WhitenedSVDExtractor()
|
| 3597 |
|
|
@@ -3624,6 +3692,22 @@ class AbliterationPipeline:
|
|
| 3624 |
except Exception:
|
| 3625 |
pass # Fall through to SVD
|
| 3626 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3627 |
if n_dirs == 1:
|
| 3628 |
diff = (self._harmful_means[idx] - self._harmless_means[idx]).squeeze(0)
|
| 3629 |
norm = diff.norm()
|
|
@@ -5741,6 +5825,7 @@ class AbliterationPipeline:
|
|
| 5741 |
"method": self.method,
|
| 5742 |
"method_config": {
|
| 5743 |
"n_directions": self.n_directions,
|
|
|
|
| 5744 |
"norm_preserve": self.norm_preserve,
|
| 5745 |
"regularization": self.regularization,
|
| 5746 |
"refinement_passes": self.refinement_passes,
|
|
|
|
| 63 |
"label": "Basic (Arditi et al.)",
|
| 64 |
"description": "Single refusal direction via difference-in-means",
|
| 65 |
"n_directions": 1,
|
| 66 |
+
"direction_method": "diff_means",
|
| 67 |
"norm_preserve": False,
|
| 68 |
"regularization": 0.0,
|
| 69 |
"refinement_passes": 1,
|
|
|
|
| 76 |
"label": "Advanced (Multi-direction + Norm-preserving)",
|
| 77 |
"description": "SVD-based multi-direction extraction with norm preservation",
|
| 78 |
"n_directions": 4,
|
| 79 |
+
"direction_method": "svd",
|
| 80 |
"norm_preserve": True,
|
| 81 |
"regularization": 0.3,
|
| 82 |
"embed_regularization": 0.5,
|
|
|
|
| 99 |
"Zero regularization for maximum refusal removal."
|
| 100 |
),
|
| 101 |
"n_directions": 8,
|
| 102 |
+
"direction_method": "svd",
|
| 103 |
"norm_preserve": True,
|
| 104 |
"regularization": 0.0,
|
| 105 |
"refinement_passes": 3,
|
|
|
|
| 127 |
"separating trained-in refusal patterns from per-layer artifacts."
|
| 128 |
),
|
| 129 |
"n_directions": 6,
|
| 130 |
+
"direction_method": "svd",
|
| 131 |
"norm_preserve": True,
|
| 132 |
"regularization": 0.0,
|
| 133 |
"refinement_passes": 2,
|
|
|
|
| 150 |
"Uses InformedAbliterationPipeline for the full feedback loop. "
|
| 151 |
"Auto-detects alignment method (DPO/RLHF/CAI/SFT), maps concept "
|
| 152 |
"cone geometry, performs cluster-aware layer selection, and gates "
|
| 153 |
+
"projection by safety-capability entanglement. Defaults to single "
|
| 154 |
+
"diff-of-means direction + Bayesian optimization (Heretic-style). "
|
| 155 |
+
"LEACE available via direction_method='leace'."
|
| 156 |
),
|
| 157 |
+
"n_directions": 1,
|
| 158 |
+
"direction_method": "diff_means",
|
| 159 |
"norm_preserve": True,
|
| 160 |
"regularization": 0.0,
|
| 161 |
"refinement_passes": 2,
|
| 162 |
"project_biases": True,
|
| 163 |
"use_chat_template": True,
|
| 164 |
+
"use_whitened_svd": False,
|
| 165 |
"true_iterative_refinement": True,
|
| 166 |
"use_jailbreak_contrast": False,
|
| 167 |
+
"layer_adaptive_strength": True,
|
| 168 |
"safety_neuron_masking": False,
|
| 169 |
"per_expert_directions": False,
|
| 170 |
"attention_head_surgery": False,
|
| 171 |
"use_sae_features": False,
|
| 172 |
+
"use_wasserstein_optimal": False,
|
| 173 |
+
"use_kl_optimization": True,
|
| 174 |
+
"kl_budget": 0.5,
|
| 175 |
+
"float_layer_interpolation": True,
|
| 176 |
+
"winsorize_activations": True,
|
| 177 |
+
"winsorize_percentile": 0.01,
|
| 178 |
},
|
| 179 |
"surgical": {
|
| 180 |
"label": "Surgical (Full SOTA MoE-Aware)",
|
|
|
|
| 186 |
"minimizing capability damage via precision targeting."
|
| 187 |
),
|
| 188 |
"n_directions": 8,
|
| 189 |
+
"direction_method": "svd",
|
| 190 |
"norm_preserve": True,
|
| 191 |
"regularization": 0.0,
|
| 192 |
"refinement_passes": 2,
|
|
|
|
| 215 |
"techniques plus the inversion layer."
|
| 216 |
),
|
| 217 |
"n_directions": 8,
|
| 218 |
+
"direction_method": "svd",
|
| 219 |
"norm_preserve": True,
|
| 220 |
"regularization": 0.0,
|
| 221 |
"refinement_passes": 2,
|
|
|
|
| 246 |
"Best for maximizing quality when compute budget allows ~50 trials."
|
| 247 |
),
|
| 248 |
"n_directions": 4,
|
| 249 |
+
"direction_method": "svd",
|
| 250 |
"norm_preserve": True,
|
| 251 |
"regularization": 0.0,
|
| 252 |
"refinement_passes": 1,
|
|
|
|
| 288 |
"runtime overhead except lightweight steering hooks."
|
| 289 |
),
|
| 290 |
"n_directions": 4,
|
| 291 |
+
"direction_method": "svd",
|
| 292 |
"norm_preserve": True,
|
| 293 |
"regularization": 0.0,
|
| 294 |
"refinement_passes": 2,
|
|
|
|
| 334 |
"abliterated models were created with."
|
| 335 |
),
|
| 336 |
"n_directions": 1,
|
| 337 |
+
"direction_method": "diff_means",
|
| 338 |
"norm_preserve": False,
|
| 339 |
"regularization": 0.0,
|
| 340 |
"refinement_passes": 1,
|
|
|
|
| 362 |
"whitened SVD, no iterative refinement."
|
| 363 |
),
|
| 364 |
"n_directions": 4,
|
| 365 |
+
"direction_method": "svd",
|
| 366 |
"norm_preserve": False,
|
| 367 |
# Ridge alpha=0.3 → effective reg = alpha/(1+alpha) = 0.3/1.3 ≈ 0.231
|
| 368 |
# For orthonormal V: P_V^alpha = 1/(1+alpha) * VV^T = 0.769 * VV^T
|
|
|
|
| 395 |
"over the (refusal_rate, KL_divergence) frontier."
|
| 396 |
),
|
| 397 |
"n_directions": 2,
|
| 398 |
+
"direction_method": "diff_means",
|
| 399 |
"norm_preserve": True,
|
| 400 |
"regularization": 0.0,
|
| 401 |
"refinement_passes": 1,
|
|
|
|
| 431 |
"boundary rather than the statistical activation difference."
|
| 432 |
),
|
| 433 |
"n_directions": 4,
|
| 434 |
+
"direction_method": "svd",
|
| 435 |
"norm_preserve": True,
|
| 436 |
"regularization": 0.0,
|
| 437 |
"refinement_passes": 1,
|
|
|
|
| 584 |
hub_token: str | None = None,
|
| 585 |
hub_community_org: str | None = None,
|
| 586 |
n_directions: int | None = None,
|
| 587 |
+
direction_method: str | None = None,
|
| 588 |
norm_preserve: bool | None = None,
|
| 589 |
regularization: float | None = None,
|
| 590 |
refinement_passes: int | None = None,
|
|
|
|
| 678 |
method_cfg = METHODS[method]
|
| 679 |
self.method = method
|
| 680 |
self.n_directions = n_directions if n_directions is not None else method_cfg["n_directions"]
|
| 681 |
+
self.direction_method = direction_method if direction_method is not None else method_cfg.get("direction_method", "svd")
|
| 682 |
self.norm_preserve = norm_preserve if norm_preserve is not None else method_cfg["norm_preserve"]
|
| 683 |
self.regularization = regularization if regularization is not None else method_cfg["regularization"]
|
| 684 |
self.refinement_passes = refinement_passes if refinement_passes is not None else method_cfg["refinement_passes"]
|
|
|
|
| 956 |
self.log(f"Loading model: {self.model_name}")
|
| 957 |
self.log(f"Device: {self.device} | Dtype: {self.dtype}")
|
| 958 |
self.log(f"Method: {method_label}")
|
| 959 |
+
self.log(f" Directions: {self.n_directions} ({self.direction_method}) | Norm-preserve: {self.norm_preserve}")
|
| 960 |
self.log(f" Regularization: {self.regularization} | Refinement passes: {self.refinement_passes}")
|
| 961 |
|
| 962 |
self.handle = load_model(
|
|
|
|
| 1420 |
else:
|
| 1421 |
max_length = 384 if collect_multi_pos else 256
|
| 1422 |
free_gb = dev.get_total_free_gb()
|
| 1423 |
+
# Scale memory thresholds by model size — a 1.2B model needs far
|
| 1424 |
+
# less KV-cache memory per token than a 7B model. Baseline
|
| 1425 |
+
# thresholds (4 / 2 GB) were tuned for 7B (hidden=4096, layers=32).
|
| 1426 |
+
_h = self.handle.hidden_size if self.handle else 4096
|
| 1427 |
+
_l = n_layers if n_layers else 32
|
| 1428 |
+
_mem_scale = (_h / 4096) * (_l / 32)
|
| 1429 |
+
_tight_gb = max(4.0 * _mem_scale, 0.5)
|
| 1430 |
+
_low_gb = max(2.0 * _mem_scale, 0.25)
|
| 1431 |
if dev.is_gpu_available():
|
| 1432 |
+
if self.max_seq_length is None and free_gb < _low_gb:
|
| 1433 |
max_length = 64
|
| 1434 |
+
self.log(f" Low GPU memory ({free_gb:.1f} GB free, threshold {_low_gb:.1f} GB), using max_length={max_length}")
|
| 1435 |
+
elif self.max_seq_length is None and free_gb < _tight_gb:
|
| 1436 |
max_length = 128
|
| 1437 |
+
self.log(f" Tight GPU memory ({free_gb:.1f} GB free, threshold {_tight_gb:.1f} GB), using max_length={max_length}")
|
| 1438 |
|
| 1439 |
device = self._get_model_device(model)
|
| 1440 |
|
| 1441 |
# Batch prompts for throughput — hooks unbatch per-prompt activations
|
| 1442 |
+
batch_size = 16 if free_gb > _tight_gb else 8 if free_gb > _low_gb else 1
|
| 1443 |
# Left-pad so position -1 is always the last real token in every batch element
|
| 1444 |
orig_padding_side = getattr(tokenizer, "padding_side", "right")
|
| 1445 |
if batch_size > 1:
|
|
|
|
| 1526 |
wasserstein_extractor = WassersteinOptimalExtractor()
|
| 1527 |
self.log("Using Wasserstein-optimal direction extraction (cost-minimizing GEP)")
|
| 1528 |
|
| 1529 |
+
# Optionally use LEACE for theoretically optimal concept erasure
|
| 1530 |
+
leace_extractor = None
|
| 1531 |
+
if self.direction_method == "leace":
|
| 1532 |
+
from obliteratus.analysis.leace import LEACEExtractor
|
| 1533 |
+
leace_extractor = LEACEExtractor()
|
| 1534 |
+
self.log("Using LEACE (closed-form optimal concept erasure) for direction extraction")
|
| 1535 |
+
|
| 1536 |
# Optionally use whitened SVD for cleaner direction extraction
|
| 1537 |
whitened_extractor = None
|
| 1538 |
+
if self.use_whitened_svd and n_dirs > 1 and not self.use_wasserstein_optimal and leace_extractor is None:
|
| 1539 |
from obliteratus.analysis.whitened_svd import WhitenedSVDExtractor
|
| 1540 |
whitened_extractor = WhitenedSVDExtractor()
|
| 1541 |
self.log("Using whitened SVD (covariance-normalized) for direction extraction")
|
|
|
|
| 1582 |
if idx < 5:
|
| 1583 |
self.log(f" layer {idx}: Wasserstein extraction failed ({e}), falling back to SVD")
|
| 1584 |
|
| 1585 |
+
if leace_extractor is not None:
|
| 1586 |
+
# LEACE: closed-form optimal concept erasure direction
|
| 1587 |
+
if idx in self._harmful_acts and idx in self._harmless_acts:
|
| 1588 |
+
try:
|
| 1589 |
+
l_result = leace_extractor.extract(
|
| 1590 |
+
self._harmful_acts[idx],
|
| 1591 |
+
self._harmless_acts[idx],
|
| 1592 |
+
layer_idx=idx,
|
| 1593 |
+
)
|
| 1594 |
+
self.refusal_directions[idx] = l_result.direction
|
| 1595 |
+
self.refusal_subspaces[idx] = l_result.direction.unsqueeze(0)
|
| 1596 |
+
norms[idx] = l_result.generalized_eigenvalue
|
| 1597 |
+
|
| 1598 |
+
if idx < 5 or idx == n_layers - 1:
|
| 1599 |
+
self.log(
|
| 1600 |
+
f" layer {idx}: LEACE eigenvalue={l_result.generalized_eigenvalue:.4f}, "
|
| 1601 |
+
f"erasure_loss={l_result.erasure_loss:.4f}, "
|
| 1602 |
+
f"cond={l_result.within_class_condition:.0f}"
|
| 1603 |
+
)
|
| 1604 |
+
continue
|
| 1605 |
+
except Exception as e:
|
| 1606 |
+
if idx < 5:
|
| 1607 |
+
self.log(f" layer {idx}: LEACE failed ({e}), falling back to diff-of-means")
|
| 1608 |
+
|
| 1609 |
if n_dirs == 1:
|
| 1610 |
# Classic single-direction: difference-in-means
|
| 1611 |
diff = (self._harmful_means[idx] - self._harmless_means[idx]).squeeze(0)
|
|
|
|
| 3648 |
except Exception:
|
| 3649 |
pass
|
| 3650 |
|
| 3651 |
+
# Use LEACE when enabled (matching main _distill)
|
| 3652 |
+
leace_extractor = None
|
| 3653 |
+
if self.direction_method == "leace":
|
| 3654 |
+
try:
|
| 3655 |
+
from obliteratus.analysis.leace import LEACEExtractor
|
| 3656 |
+
leace_extractor = LEACEExtractor()
|
| 3657 |
+
except Exception:
|
| 3658 |
+
pass
|
| 3659 |
+
|
| 3660 |
# Use whitened SVD when enabled (matching main _distill)
|
| 3661 |
whitened_extractor = None
|
| 3662 |
+
if self.use_whitened_svd and n_dirs > 1 and wasserstein_extractor is None and leace_extractor is None:
|
| 3663 |
from obliteratus.analysis.whitened_svd import WhitenedSVDExtractor
|
| 3664 |
whitened_extractor = WhitenedSVDExtractor()
|
| 3665 |
|
|
|
|
| 3692 |
except Exception:
|
| 3693 |
pass # Fall through to SVD
|
| 3694 |
|
| 3695 |
+
# LEACE path (matching main _distill)
|
| 3696 |
+
if leace_extractor is not None:
|
| 3697 |
+
if idx in self._harmful_acts and idx in self._harmless_acts:
|
| 3698 |
+
try:
|
| 3699 |
+
l_result = leace_extractor.extract(
|
| 3700 |
+
self._harmful_acts[idx],
|
| 3701 |
+
self._harmless_acts[idx],
|
| 3702 |
+
layer_idx=idx,
|
| 3703 |
+
)
|
| 3704 |
+
self.refusal_directions[idx] = l_result.direction
|
| 3705 |
+
self.refusal_subspaces[idx] = l_result.direction.unsqueeze(0)
|
| 3706 |
+
norms[idx] = l_result.generalized_eigenvalue
|
| 3707 |
+
continue
|
| 3708 |
+
except Exception:
|
| 3709 |
+
pass # Fall through to diff-of-means
|
| 3710 |
+
|
| 3711 |
if n_dirs == 1:
|
| 3712 |
diff = (self._harmful_means[idx] - self._harmless_means[idx]).squeeze(0)
|
| 3713 |
norm = diff.norm()
|
|
|
|
| 5825 |
"method": self.method,
|
| 5826 |
"method_config": {
|
| 5827 |
"n_directions": self.n_directions,
|
| 5828 |
+
"direction_method": self.direction_method,
|
| 5829 |
"norm_preserve": self.norm_preserve,
|
| 5830 |
"regularization": self.regularization,
|
| 5831 |
"refinement_passes": self.refinement_passes,
|
obliteratus/analysis/leace.py
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""LEACE (LEAst-squares Concept Erasure) for refusal direction extraction.
|
| 2 |
+
|
| 3 |
+
Closed-form optimal concept erasure that finds the minimum-rank edit to
|
| 4 |
+
representations preventing any linear classifier from recovering the
|
| 5 |
+
concept (harmful vs harmless). Unlike SVD, LEACE produces directions
|
| 6 |
+
that are provably optimal for erasure with minimal distortion.
|
| 7 |
+
|
| 8 |
+
The key insight: instead of finding directions of maximum variance (SVD)
|
| 9 |
+
or maximum mean difference (diff-of-means), LEACE solves a constrained
|
| 10 |
+
optimization problem: find the smallest perturbation to representations
|
| 11 |
+
such that no linear probe can distinguish harmful from harmless.
|
| 12 |
+
|
| 13 |
+
Mathematical formulation (rank-1 erasure):
|
| 14 |
+
Given class-conditional means mu_0, mu_1 and within-class
|
| 15 |
+
covariance S_w:
|
| 16 |
+
1. Compute mean difference: delta = mu_1 - mu_0
|
| 17 |
+
2. Compute within-class covariance: S_w = (S_0 + S_1) / 2
|
| 18 |
+
3. Solve generalized eigenvalue problem: S_b v = lambda S_w v
|
| 19 |
+
where S_b = delta @ delta^T (between-class scatter)
|
| 20 |
+
4. The top generalized eigenvector is the LEACE direction
|
| 21 |
+
5. Erase by projecting out: x' = x - (x @ v) * v^T
|
| 22 |
+
|
| 23 |
+
This is mathematically equivalent to Fisher's Linear Discriminant but
|
| 24 |
+
applied as an erasure operation. The direction maximizes class
|
| 25 |
+
separability relative to within-class spread, making it the optimal
|
| 26 |
+
single direction to remove for concept erasure.
|
| 27 |
+
|
| 28 |
+
Advantages over SVD:
|
| 29 |
+
- Theoretically optimal: minimizes representation distortion for
|
| 30 |
+
guaranteed erasure of linear concept information
|
| 31 |
+
- Handles rogue dimensions naturally: within-class normalization
|
| 32 |
+
prevents high-variance but non-discriminative dimensions from
|
| 33 |
+
dominating
|
| 34 |
+
- No hyperparameters beyond regularization epsilon
|
| 35 |
+
- Closed-form solution (no iterative optimization)
|
| 36 |
+
|
| 37 |
+
References:
|
| 38 |
+
- Belrose et al. (2023): LEACE: Perfect linear concept erasure in
|
| 39 |
+
closed form. NeurIPS 2023.
|
| 40 |
+
- Ravfogel et al. (2022): RLACE: Adversarial concept erasure
|
| 41 |
+
(iterative precursor to LEACE).
|
| 42 |
+
"""
|
| 43 |
+
|
| 44 |
+
from __future__ import annotations
|
| 45 |
+
|
| 46 |
+
from dataclasses import dataclass
|
| 47 |
+
|
| 48 |
+
import torch
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
@dataclass
class LEACEResult:
    """Result of LEACE direction extraction for a single layer."""

    layer_idx: int
    direction: torch.Tensor  # (hidden_dim,) unit vector
    generalized_eigenvalue: float  # lambda from GEP (discriminability)
    within_class_condition: float  # condition number of S_w
    mean_diff_norm: float  # ||mu_1 - mu_0||
    erasure_loss: float  # expected squared distortion from erasure


class LEACEExtractor:
    """Extract refusal directions via LEACE (closed-form concept erasure).

    Finds the direction that maximally separates harmful from harmless
    activations relative to within-class variance, then erases it.
    This is the provably optimal rank-1 concept erasure.
    """

    def __init__(
        self,
        regularization_eps: float = 1e-4,
        shrinkage: float = 0.0,
    ):
        """
        Args:
            regularization_eps: Tikhonov regularization for S_w inversion.
                Larger values produce more conservative (but stable) results.
            shrinkage: Ledoit-Wolf shrinkage toward identity (0..1).
                0 = no shrinkage, 1 = full shrinkage to scaled identity.
                Useful when n_samples < hidden_dim.
        """
        self.regularization_eps = regularization_eps
        self.shrinkage = shrinkage

    def extract(
        self,
        harmful_activations: list[torch.Tensor],
        harmless_activations: list[torch.Tensor],
        layer_idx: int = 0,
    ) -> LEACEResult:
        """Extract the LEACE direction for a single layer.

        Args:
            harmful_activations: List of (hidden_dim,) tensors from harmful prompts.
            harmless_activations: List of (hidden_dim,) tensors from harmless prompts.
            layer_idx: Layer index (for metadata).

        Returns:
            LEACEResult with the optimal erasure direction.

        Raises:
            ValueError: If either activation list is empty (a bare
                ``torch.stack`` would otherwise raise an opaque RuntimeError).
        """
        if not harmful_activations or not harmless_activations:
            raise ValueError(
                "LEACE extraction requires at least one activation per class "
                f"(got {len(harmful_activations)} harmful, "
                f"{len(harmless_activations)} harmless)"
            )

        H = torch.stack(harmful_activations).float()  # (n_h, d)
        B = torch.stack(harmless_activations).float()  # (n_b, d)

        # Collapse a leading singleton (e.g. inputs saved as (1, d) rows)
        if H.dim() == 3:
            H = H.squeeze(1)
        if B.dim() == 3:
            B = B.squeeze(1)

        n_h, d = H.shape
        n_b = B.shape[0]

        # Class-conditional means
        mu_h = H.mean(dim=0)  # (d,)
        mu_b = B.mean(dim=0)  # (d,)

        # Mean difference (between-class direction)
        delta = mu_h - mu_b  # (d,)
        delta_norm = delta.norm().item()

        # Within-class covariance: S_w = (S_h + S_b) / 2
        # where S_h = (H - mu_h)^T (H - mu_h) / (n_h - 1) etc.
        H_centered = H - mu_h.unsqueeze(0)
        B_centered = B - mu_b.unsqueeze(0)

        # max(n - 1, 1) guards the single-sample case (zero covariance).
        S_h = (H_centered.T @ H_centered) / max(n_h - 1, 1)
        S_b = (B_centered.T @ B_centered) / max(n_b - 1, 1)
        S_w = (S_h + S_b) / 2.0  # (d, d)

        # Apply Ledoit-Wolf shrinkage if requested
        if self.shrinkage > 0:
            trace_S_w = S_w.trace().item()
            S_w = (1 - self.shrinkage) * S_w + self.shrinkage * (trace_S_w / d) * torch.eye(d, device=S_w.device)

        # Regularize S_w for numerical stability
        S_w_reg = S_w + self.regularization_eps * torch.eye(d, device=S_w.device)

        # Condition number of S_w (for diagnostics)
        try:
            eigs_w = torch.linalg.eigvalsh(S_w_reg)
            eigs_w = eigs_w.clamp(min=0)
            # Drop numerically-zero eigenvalues so the ratio is meaningful
            pos_eigs = eigs_w[eigs_w > eigs_w.max() * 1e-10]
            condition = (pos_eigs.max() / pos_eigs.min()).item() if pos_eigs.numel() > 0 else float('inf')
        except Exception:
            condition = float('inf')

        # LEACE direction via S_w^{-1} @ delta
        # The generalized eigenvector for rank-1 S_between = delta @ delta^T
        # reduces to: v = S_w^{-1} @ delta (up to normalization)
        try:
            # Use solve for numerical stability (avoids explicit inverse)
            v = torch.linalg.solve(S_w_reg, delta)  # (d,)
        except torch.linalg.LinAlgError:
            # Fallback: least-squares (pseudoinverse-like) solution
            v = torch.linalg.lstsq(S_w_reg, delta.unsqueeze(1)).solution.squeeze(1)

        # Normalize to unit length
        v_norm = v.norm()
        if v_norm > 1e-8:
            direction = v / v_norm
        else:
            # Degenerate case: fall back to normalized mean difference
            direction = delta / max(delta_norm, 1e-8)

        # Generalized eigenvalue: lambda = delta^T @ S_w^{-1} @ delta
        # This measures how discriminable the classes are after whitening
        gen_eigenvalue = (delta @ v).item()

        # Erasure loss: expected squared distortion E[||x - x'||^2]
        # For rank-1 projection: loss = v^T @ S_total @ v where S_total
        # is the total (pooled) covariance
        all_acts = torch.cat([H, B], dim=0)
        mu_total = all_acts.mean(dim=0)
        centered_total = all_acts - mu_total.unsqueeze(0)
        S_total = (centered_total.T @ centered_total) / max(all_acts.shape[0] - 1, 1)
        erasure_loss = (direction @ S_total @ direction).item()

        return LEACEResult(
            layer_idx=layer_idx,
            direction=direction,
            generalized_eigenvalue=gen_eigenvalue,
            within_class_condition=condition,
            mean_diff_norm=delta_norm,
            erasure_loss=erasure_loss,
        )

    def extract_all_layers(
        self,
        harmful_acts: dict[int, list[torch.Tensor]],
        harmless_acts: dict[int, list[torch.Tensor]],
    ) -> dict[int, LEACEResult]:
        """Extract LEACE directions for all layers.

        Layers present in only one of the two dicts are skipped (both
        classes are required to form the within-class covariance).

        Args:
            harmful_acts: {layer_idx: [activations]} from activation collection.
            harmless_acts: {layer_idx: [activations]} from activation collection.

        Returns:
            {layer_idx: LEACEResult} for each layer.
        """
        results = {}
        for idx in sorted(harmful_acts.keys()):
            if idx not in harmless_acts:
                continue
            results[idx] = self.extract(
                harmful_acts[idx],
                harmless_acts[idx],
                layer_idx=idx,
            )
        return results

    @staticmethod
    def compare_with_diff_of_means(
        leace_result: LEACEResult,
        harmful_mean: torch.Tensor,
        harmless_mean: torch.Tensor,
    ) -> dict[str, float]:
        """Compare LEACE direction with simple diff-of-means.

        Returns cosine similarity and diagnostic metrics showing how much
        the within-class normalization rotates the direction.
        """
        # Cast onto the direction's dtype/device: `direction` is float32
        # (extract() upcasts), while callers may pass fp16/bf16 or GPU
        # means — a mixed-dtype matmul would raise.
        diff = (harmful_mean.squeeze() - harmless_mean.squeeze()).to(leace_result.direction)
        diff_norm = diff.norm()
        if diff_norm > 1e-8:
            diff_normalized = diff / diff_norm
        else:
            diff_normalized = diff

        cosine_sim = (leace_result.direction @ diff_normalized).abs().item()

        return {
            "cosine_similarity": cosine_sim,
            "leace_eigenvalue": leace_result.generalized_eigenvalue,
            "leace_erasure_loss": leace_result.erasure_loss,
            "within_class_condition": leace_result.within_class_condition,
            "mean_diff_norm": leace_result.mean_diff_norm,
        }
|
obliteratus/bayesian_optimizer.py
CHANGED
|
@@ -142,28 +142,35 @@ def _parametric_layer_weight(
|
|
| 142 |
min_weight: float,
|
| 143 |
spread: float,
|
| 144 |
) -> float:
|
| 145 |
-
"""Compute ablation weight for a layer using a
|
| 146 |
|
| 147 |
-
|
| 148 |
-
- max_weight: peak ablation strength
|
| 149 |
-
- peak_position: normalized position of peak (0..1
|
| 150 |
-
- min_weight:
|
| 151 |
-
- spread:
|
| 152 |
|
| 153 |
-
|
| 154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
"""
|
| 156 |
if n_layers <= 1:
|
| 157 |
return max_weight
|
| 158 |
|
| 159 |
normalized_pos = layer_idx / (n_layers - 1)
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
|
|
|
| 165 |
|
| 166 |
-
|
|
|
|
| 167 |
|
| 168 |
|
| 169 |
def _interpolate_direction(
|
|
@@ -171,37 +178,56 @@ def _interpolate_direction(
|
|
| 171 |
layer_idx: int,
|
| 172 |
float_dir_idx: float,
|
| 173 |
) -> torch.Tensor:
|
| 174 |
-
"""Get an interpolated refusal direction from a float-valued index.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
|
|
|
| 179 |
|
| 180 |
Args:
|
| 181 |
-
pipeline: Pipeline with extracted refusal
|
| 182 |
-
layer_idx:
|
| 183 |
-
float_dir_idx: Continuous direction index
|
| 184 |
-
|
| 185 |
|
| 186 |
Returns:
|
| 187 |
Normalized direction tensor.
|
| 188 |
"""
|
| 189 |
-
|
| 190 |
-
|
|
|
|
| 191 |
return pipeline.refusal_directions.get(layer_idx, torch.zeros(1))
|
| 192 |
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
|
|
|
|
|
|
| 196 |
|
| 197 |
lo = int(float_dir_idx)
|
| 198 |
-
hi = min(lo + 1,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
|
| 200 |
if lo == hi:
|
| 201 |
-
d =
|
| 202 |
else:
|
|
|
|
| 203 |
alpha = float_dir_idx - lo
|
| 204 |
-
d = (1.0 - alpha) *
|
| 205 |
|
| 206 |
norm = d.norm()
|
| 207 |
if norm > 1e-8:
|
|
@@ -342,9 +368,14 @@ def run_bayesian_optimization(
|
|
| 342 |
for live_data, saved_clone in original_params: # noqa: F821
|
| 343 |
live_data.copy_(saved_clone.to(live_data.device))
|
| 344 |
|
| 345 |
-
# Warm-start values for the parametric kernel
|
| 346 |
-
#
|
| 347 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 348 |
peak_layer = pipeline._strong_layers[0]
|
| 349 |
warm_peak = peak_layer / max(n_total_layers - 1, 1)
|
| 350 |
else:
|
|
@@ -356,56 +387,56 @@ def run_bayesian_optimization(
|
|
| 356 |
# Suppress Optuna's verbose logging
|
| 357 |
optuna.logging.set_verbosity(optuna.logging.WARNING)
|
| 358 |
|
| 359 |
-
# Max
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
default=1,
|
| 365 |
-
)
|
| 366 |
|
| 367 |
# ── Phase 1: Parametric kernel optimization (compact search space) ──
|
|
|
|
|
|
|
| 368 |
|
| 369 |
def objective(trial: optuna.Trial) -> tuple[float, float]:
|
| 370 |
"""Multi-objective: minimize (refusal_rate, kl_divergence)."""
|
| 371 |
_restore_all()
|
| 372 |
|
| 373 |
-
#
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
|
| 379 |
-
#
|
| 380 |
-
|
| 381 |
-
|
|
|
|
|
|
|
| 382 |
|
| 383 |
-
# Float direction index (
|
| 384 |
-
dir_idx = trial.suggest_float("dir_idx", 0.0, max(
|
| 385 |
|
| 386 |
-
# Compute per-layer regularization from
|
| 387 |
-
|
|
|
|
| 388 |
for idx in pipeline._strong_layers:
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
layer_regs[idx] = 1.0 - weight
|
| 394 |
|
| 395 |
# Apply projection with trial's parameters
|
| 396 |
for idx in pipeline._strong_layers:
|
| 397 |
-
if idx not in pipeline.
|
| 398 |
continue
|
| 399 |
|
| 400 |
-
# Use interpolated direction
|
| 401 |
direction = _interpolate_direction(pipeline, idx, dir_idx)
|
| 402 |
d_col = direction.to(device=next(layer_modules[idx].parameters()).device)
|
| 403 |
d_col = d_col.unsqueeze(-1) if d_col.dim() == 1 else d_col
|
| 404 |
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
# Attention projection (with attn_scale)
|
| 408 |
-
attn_reg = 1.0 - (1.0 - reg) * attn_scale
|
| 409 |
try:
|
| 410 |
attn = get_attention_module(layer_modules[idx], arch)
|
| 411 |
pipeline._project_out_advanced(
|
|
@@ -416,8 +447,8 @@ def run_bayesian_optimization(
|
|
| 416 |
except (AttributeError, RuntimeError):
|
| 417 |
pass
|
| 418 |
|
| 419 |
-
# MLP/FFN projection (with
|
| 420 |
-
mlp_reg =
|
| 421 |
try:
|
| 422 |
ffn = get_ffn_module(layer_modules[idx], arch)
|
| 423 |
count = pipeline._project_out_advanced(
|
|
@@ -439,18 +470,20 @@ def run_bayesian_optimization(
|
|
| 439 |
refusal = _measure_refusal_rate(pipeline, n_prompts=n_refusal_prompts)
|
| 440 |
kl = _measure_kl_divergence(pipeline, reference_logits, kl_prompts)
|
| 441 |
|
| 442 |
-
# Track best combined score
|
| 443 |
nonlocal best_score, best_result
|
| 444 |
combined = refusal + 0.5 * kl
|
| 445 |
if combined < best_score:
|
| 446 |
best_score = combined
|
| 447 |
-
best_result =
|
|
|
|
|
|
|
|
|
|
| 448 |
|
| 449 |
pipeline.log(
|
| 450 |
f" Trial {trial.number + 1}/{n_trials}: "
|
| 451 |
f"refusal={refusal:.0%}, KL={kl:.4f} "
|
| 452 |
-
f"(
|
| 453 |
-
f"attn={attn_scale:.2f}, mlp={mlp_scale:.2f}, dir={dir_idx:.2f})"
|
| 454 |
)
|
| 455 |
|
| 456 |
return refusal, kl
|
|
@@ -462,16 +495,33 @@ def run_bayesian_optimization(
|
|
| 462 |
study_name="obliteratus_parametric_optimization",
|
| 463 |
)
|
| 464 |
|
| 465 |
-
# Enqueue warm-start trial with analysis-derived estimates
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 475 |
study.enqueue_trial(warm_params)
|
| 476 |
|
| 477 |
pipeline.log(f"Bayesian optimization: running {n_trials} trials (parametric kernel)...")
|
|
@@ -490,25 +540,32 @@ def run_bayesian_optimization(
|
|
| 490 |
p = best_trial.params
|
| 491 |
best_result = {}
|
| 492 |
for idx in pipeline._strong_layers:
|
| 493 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 494 |
idx, n_total_layers,
|
| 495 |
-
p["
|
| 496 |
-
p["
|
| 497 |
)
|
| 498 |
-
best_result[idx] =
|
|
|
|
| 499 |
|
| 500 |
pipeline.log(
|
| 501 |
f" Best trial: refusal={best_trial.values[0]:.0%}, "
|
| 502 |
f"KL={best_trial.values[1]:.4f}"
|
| 503 |
)
|
| 504 |
pipeline.log(
|
| 505 |
-
f"
|
| 506 |
-
f"
|
| 507 |
)
|
| 508 |
pipeline.log(
|
| 509 |
-
f"
|
| 510 |
-
f"
|
| 511 |
)
|
|
|
|
| 512 |
|
| 513 |
# Store the best direction index for use during EXCISE
|
| 514 |
best_dir_idx = p.get("dir_idx", 0.0)
|
|
@@ -518,9 +575,9 @@ def run_bayesian_optimization(
|
|
| 518 |
new_dir = _interpolate_direction(pipeline, idx, best_dir_idx)
|
| 519 |
pipeline.refusal_directions[idx] = new_dir
|
| 520 |
|
| 521 |
-
# Store component scales for use in EXCISE
|
| 522 |
-
pipeline._bayesian_attn_scale = p.get("
|
| 523 |
-
pipeline._bayesian_mlp_scale = p.get("
|
| 524 |
|
| 525 |
elif best_result:
|
| 526 |
pipeline.log(f" Using best combined score: {best_score:.4f}")
|
|
|
|
| 142 |
min_weight: float,
|
| 143 |
spread: float,
|
| 144 |
) -> float:
|
| 145 |
+
"""Compute ablation weight for a layer using a piecewise-linear tent kernel.
|
| 146 |
|
| 147 |
+
Faithful reproduction of Heretic's parametric kernel (p-e-w/heretic):
|
| 148 |
+
- max_weight: peak ablation strength at peak_position
|
| 149 |
+
- peak_position: normalized position of peak (0..1)
|
| 150 |
+
- min_weight: weight at the edges of the tent
|
| 151 |
+
- spread: normalized distance from peak to tent edge (min_weight_distance)
|
| 152 |
|
| 153 |
+
Layers beyond ``spread`` from the peak get weight 0 (skipped entirely).
|
| 154 |
+
Within the tent, weight drops linearly from max_weight to min_weight.
|
| 155 |
+
This matches Heretic's actual formula::
|
| 156 |
+
|
| 157 |
+
distance = abs(layer_index - max_weight_position)
|
| 158 |
+
if distance > min_weight_distance: skip
|
| 159 |
+
weight = max_weight + (distance / min_weight_distance) * (min_weight - max_weight)
|
| 160 |
"""
|
| 161 |
if n_layers <= 1:
|
| 162 |
return max_weight
|
| 163 |
|
| 164 |
normalized_pos = layer_idx / (n_layers - 1)
|
| 165 |
+
dist = abs(normalized_pos - peak_position)
|
| 166 |
+
min_weight_distance = max(spread, 0.01)
|
| 167 |
+
|
| 168 |
+
# Hard cutoff: layers outside the tent get 0 weight (Heretic skips them)
|
| 169 |
+
if dist > min_weight_distance:
|
| 170 |
+
return 0.0
|
| 171 |
|
| 172 |
+
# Linear interpolation: max_weight at peak → min_weight at edges
|
| 173 |
+
return max_weight + (dist / min_weight_distance) * (min_weight - max_weight)
|
| 174 |
|
| 175 |
|
| 176 |
def _interpolate_direction(
|
|
|
|
| 178 |
layer_idx: int,
|
| 179 |
float_dir_idx: float,
|
| 180 |
) -> torch.Tensor:
|
| 181 |
+
"""Get an interpolated refusal direction from a float-valued layer index.
|
| 182 |
+
|
| 183 |
+
Faithful reproduction of Heretic's direction interpolation: the index
|
| 184 |
+
selects which *layer's* diff-of-means direction to use, with float
|
| 185 |
+
values interpolating between adjacent layers' directions. This is
|
| 186 |
+
fundamentally different from interpolating between SVD components
|
| 187 |
+
within a single layer — it searches across the layer axis.
|
| 188 |
+
|
| 189 |
+
From Heretic source (model.py)::
|
| 190 |
|
| 191 |
+
weight, index = math.modf(direction_index + 1)
|
| 192 |
+
refusal_direction = F.normalize(
|
| 193 |
+
refusal_directions[int(index)].lerp(
|
| 194 |
+
refusal_directions[int(index) + 1], weight), p=2, dim=0)
|
| 195 |
|
| 196 |
Args:
|
| 197 |
+
pipeline: Pipeline with extracted refusal directions per layer.
|
| 198 |
+
layer_idx: The layer being projected (used as fallback).
|
| 199 |
+
float_dir_idx: Continuous direction index — selects which layer's
|
| 200 |
+
direction to use (e.g., 5.3 interpolates 70% layer-5 + 30% layer-6).
|
| 201 |
|
| 202 |
Returns:
|
| 203 |
Normalized direction tensor.
|
| 204 |
"""
|
| 205 |
+
# Build sorted list of layer indices that have refusal directions
|
| 206 |
+
sorted_layers = sorted(pipeline.refusal_directions.keys())
|
| 207 |
+
if not sorted_layers:
|
| 208 |
return pipeline.refusal_directions.get(layer_idx, torch.zeros(1))
|
| 209 |
|
| 210 |
+
n_layers_with_dirs = len(sorted_layers)
|
| 211 |
+
|
| 212 |
+
# Heretic uses direction_index + 1 offset; we map float_dir_idx into
|
| 213 |
+
# the sorted layer list, clamped to valid range.
|
| 214 |
+
float_dir_idx = max(0.0, min(float_dir_idx, n_layers_with_dirs - 1))
|
| 215 |
|
| 216 |
lo = int(float_dir_idx)
|
| 217 |
+
hi = min(lo + 1, n_layers_with_dirs - 1)
|
| 218 |
+
|
| 219 |
+
lo_layer = sorted_layers[lo]
|
| 220 |
+
hi_layer = sorted_layers[hi]
|
| 221 |
+
|
| 222 |
+
d_lo = pipeline.refusal_directions[lo_layer]
|
| 223 |
+
d_hi = pipeline.refusal_directions[hi_layer]
|
| 224 |
|
| 225 |
if lo == hi:
|
| 226 |
+
d = d_lo
|
| 227 |
else:
|
| 228 |
+
# Linear interpolation between adjacent layers' directions
|
| 229 |
alpha = float_dir_idx - lo
|
| 230 |
+
d = (1.0 - alpha) * d_lo + alpha * d_hi
|
| 231 |
|
| 232 |
norm = d.norm()
|
| 233 |
if norm > 1e-8:
|
|
|
|
| 368 |
for live_data, saved_clone in original_params: # noqa: F821
|
| 369 |
live_data.copy_(saved_clone.to(live_data.device))
|
| 370 |
|
| 371 |
+
# Warm-start values for the parametric kernel.
|
| 372 |
+
# If the informed pipeline provided analysis-derived warm-start params,
|
| 373 |
+
# use those (they're much better than the default heuristic).
|
| 374 |
+
informed_warm = getattr(pipeline, "_informed_warm_start", None)
|
| 375 |
+
if informed_warm:
|
| 376 |
+
warm_peak = informed_warm.get("peak_position", 0.5)
|
| 377 |
+
pipeline.log(f" Using analysis-informed warm-start (peak={warm_peak:.2f})")
|
| 378 |
+
elif pipeline._strong_layers:
|
| 379 |
peak_layer = pipeline._strong_layers[0]
|
| 380 |
warm_peak = peak_layer / max(n_total_layers - 1, 1)
|
| 381 |
else:
|
|
|
|
| 387 |
# Suppress Optuna's verbose logging
|
| 388 |
optuna.logging.set_verbosity(optuna.logging.WARNING)
|
| 389 |
|
| 390 |
+
# Max layers with directions (for float direction interpolation)
|
| 391 |
+
n_layers_with_dirs = len([
|
| 392 |
+
idx for idx in pipeline._strong_layers
|
| 393 |
+
if idx in pipeline.refusal_directions
|
| 394 |
+
])
|
|
|
|
|
|
|
| 395 |
|
| 396 |
# ── Phase 1: Parametric kernel optimization (compact search space) ──
|
| 397 |
+
# Heretic uses SEPARATE kernel parameters for attention and MLP,
|
| 398 |
+
# allowing them to peak at different layers (8 params + 1 dir_idx = 9).
|
| 399 |
|
| 400 |
def objective(trial: optuna.Trial) -> tuple[float, float]:
|
| 401 |
"""Multi-objective: minimize (refusal_rate, kl_divergence)."""
|
| 402 |
_restore_all()
|
| 403 |
|
| 404 |
+
# Attention kernel: 4 params
|
| 405 |
+
attn_max = trial.suggest_float("attn_max_weight", 0.5, 1.0)
|
| 406 |
+
attn_peak = trial.suggest_float("attn_peak_position", 0.1, 0.9)
|
| 407 |
+
attn_min = trial.suggest_float("attn_min_weight", 0.0, 0.3)
|
| 408 |
+
attn_spread = trial.suggest_float("attn_spread", 0.1, 0.6)
|
| 409 |
|
| 410 |
+
# MLP kernel: 4 params (separate — can peak at a different layer)
|
| 411 |
+
mlp_max = trial.suggest_float("mlp_max_weight", 0.3, 1.0)
|
| 412 |
+
mlp_peak = trial.suggest_float("mlp_peak_position", 0.1, 0.9)
|
| 413 |
+
mlp_min = trial.suggest_float("mlp_min_weight", 0.0, 0.3)
|
| 414 |
+
mlp_spread = trial.suggest_float("mlp_spread", 0.1, 0.6)
|
| 415 |
|
| 416 |
+
# Float direction index (cross-layer interpolation, Heretic-style)
|
| 417 |
+
dir_idx = trial.suggest_float("dir_idx", 0.0, max(n_layers_with_dirs - 1, 0.0))
|
| 418 |
|
| 419 |
+
# Compute per-layer, per-component regularization from kernels
|
| 420 |
+
attn_regs: dict[int, float] = {}
|
| 421 |
+
mlp_regs: dict[int, float] = {}
|
| 422 |
for idx in pipeline._strong_layers:
|
| 423 |
+
attn_w = _parametric_layer_weight(idx, n_total_layers, attn_max, attn_peak, attn_min, attn_spread)
|
| 424 |
+
mlp_w = _parametric_layer_weight(idx, n_total_layers, mlp_max, mlp_peak, mlp_min, mlp_spread)
|
| 425 |
+
attn_regs[idx] = 1.0 - attn_w
|
| 426 |
+
mlp_regs[idx] = 1.0 - mlp_w
|
|
|
|
| 427 |
|
| 428 |
# Apply projection with trial's parameters
|
| 429 |
for idx in pipeline._strong_layers:
|
| 430 |
+
if idx not in pipeline.refusal_directions:
|
| 431 |
continue
|
| 432 |
|
| 433 |
+
# Use cross-layer interpolated direction
|
| 434 |
direction = _interpolate_direction(pipeline, idx, dir_idx)
|
| 435 |
d_col = direction.to(device=next(layer_modules[idx].parameters()).device)
|
| 436 |
d_col = d_col.unsqueeze(-1) if d_col.dim() == 1 else d_col
|
| 437 |
|
| 438 |
+
# Attention projection (with per-component kernel)
|
| 439 |
+
attn_reg = attn_regs[idx]
|
|
|
|
|
|
|
| 440 |
try:
|
| 441 |
attn = get_attention_module(layer_modules[idx], arch)
|
| 442 |
pipeline._project_out_advanced(
|
|
|
|
| 447 |
except (AttributeError, RuntimeError):
|
| 448 |
pass
|
| 449 |
|
| 450 |
+
# MLP/FFN projection (with per-component kernel)
|
| 451 |
+
mlp_reg = mlp_regs[idx]
|
| 452 |
try:
|
| 453 |
ffn = get_ffn_module(layer_modules[idx], arch)
|
| 454 |
count = pipeline._project_out_advanced(
|
|
|
|
| 470 |
refusal = _measure_refusal_rate(pipeline, n_prompts=n_refusal_prompts)
|
| 471 |
kl = _measure_kl_divergence(pipeline, reference_logits, kl_prompts)
|
| 472 |
|
| 473 |
+
# Track best combined score (use average of attn/mlp regs for layer_regs)
|
| 474 |
nonlocal best_score, best_result
|
| 475 |
combined = refusal + 0.5 * kl
|
| 476 |
if combined < best_score:
|
| 477 |
best_score = combined
|
| 478 |
+
best_result = {
|
| 479 |
+
idx: (attn_regs[idx] + mlp_regs[idx]) / 2.0
|
| 480 |
+
for idx in pipeline._strong_layers
|
| 481 |
+
}
|
| 482 |
|
| 483 |
pipeline.log(
|
| 484 |
f" Trial {trial.number + 1}/{n_trials}: "
|
| 485 |
f"refusal={refusal:.0%}, KL={kl:.4f} "
|
| 486 |
+
f"(attn_peak={attn_peak:.2f}, mlp_peak={mlp_peak:.2f}, dir={dir_idx:.2f})"
|
|
|
|
| 487 |
)
|
| 488 |
|
| 489 |
return refusal, kl
|
|
|
|
| 495 |
study_name="obliteratus_parametric_optimization",
|
| 496 |
)
|
| 497 |
|
| 498 |
+
# Enqueue warm-start trial with analysis-derived estimates.
|
| 499 |
+
# Translate informed pipeline params to the new per-component format.
|
| 500 |
+
if informed_warm:
|
| 501 |
+
iw = informed_warm
|
| 502 |
+
warm_params = {
|
| 503 |
+
"attn_max_weight": iw.get("max_weight", 0.9),
|
| 504 |
+
"attn_peak_position": iw.get("peak_position", warm_peak),
|
| 505 |
+
"attn_min_weight": iw.get("min_weight", 0.05),
|
| 506 |
+
"attn_spread": iw.get("spread", 0.3),
|
| 507 |
+
"mlp_max_weight": iw.get("max_weight", 0.9) * iw.get("mlp_scale", 0.6),
|
| 508 |
+
"mlp_peak_position": iw.get("peak_position", warm_peak),
|
| 509 |
+
"mlp_min_weight": iw.get("min_weight", 0.05),
|
| 510 |
+
"mlp_spread": iw.get("spread", 0.3),
|
| 511 |
+
"dir_idx": iw.get("dir_idx", 0.0),
|
| 512 |
+
}
|
| 513 |
+
else:
|
| 514 |
+
warm_params = {
|
| 515 |
+
"attn_max_weight": 0.9,
|
| 516 |
+
"attn_peak_position": warm_peak,
|
| 517 |
+
"attn_min_weight": 0.05,
|
| 518 |
+
"attn_spread": 0.3,
|
| 519 |
+
"mlp_max_weight": 0.6,
|
| 520 |
+
"mlp_peak_position": warm_peak,
|
| 521 |
+
"mlp_min_weight": 0.05,
|
| 522 |
+
"mlp_spread": 0.3,
|
| 523 |
+
"dir_idx": 0.0,
|
| 524 |
+
}
|
| 525 |
study.enqueue_trial(warm_params)
|
| 526 |
|
| 527 |
pipeline.log(f"Bayesian optimization: running {n_trials} trials (parametric kernel)...")
|
|
|
|
| 540 |
p = best_trial.params
|
| 541 |
best_result = {}
|
| 542 |
for idx in pipeline._strong_layers:
|
| 543 |
+
attn_w = _parametric_layer_weight(
|
| 544 |
+
idx, n_total_layers,
|
| 545 |
+
p["attn_max_weight"], p["attn_peak_position"],
|
| 546 |
+
p["attn_min_weight"], p["attn_spread"],
|
| 547 |
+
)
|
| 548 |
+
mlp_w = _parametric_layer_weight(
|
| 549 |
idx, n_total_layers,
|
| 550 |
+
p["mlp_max_weight"], p["mlp_peak_position"],
|
| 551 |
+
p["mlp_min_weight"], p["mlp_spread"],
|
| 552 |
)
|
| 553 |
+
best_result[idx] = (attn_w + mlp_w) / 2.0 # average for layer-level reg
|
| 554 |
+
best_result[idx] = 1.0 - best_result[idx]
|
| 555 |
|
| 556 |
pipeline.log(
|
| 557 |
f" Best trial: refusal={best_trial.values[0]:.0%}, "
|
| 558 |
f"KL={best_trial.values[1]:.4f}"
|
| 559 |
)
|
| 560 |
pipeline.log(
|
| 561 |
+
f" Attn kernel: peak={p['attn_peak_position']:.2f}, "
|
| 562 |
+
f"spread={p['attn_spread']:.2f}, max={p['attn_max_weight']:.2f}"
|
| 563 |
)
|
| 564 |
pipeline.log(
|
| 565 |
+
f" MLP kernel: peak={p['mlp_peak_position']:.2f}, "
|
| 566 |
+
f"spread={p['mlp_spread']:.2f}, max={p['mlp_max_weight']:.2f}"
|
| 567 |
)
|
| 568 |
+
pipeline.log(f" dir_idx={p['dir_idx']:.2f}")
|
| 569 |
|
| 570 |
# Store the best direction index for use during EXCISE
|
| 571 |
best_dir_idx = p.get("dir_idx", 0.0)
|
|
|
|
| 575 |
new_dir = _interpolate_direction(pipeline, idx, best_dir_idx)
|
| 576 |
pipeline.refusal_directions[idx] = new_dir
|
| 577 |
|
| 578 |
+
# Store component scales for use in EXCISE (backward compat)
|
| 579 |
+
pipeline._bayesian_attn_scale = p.get("attn_max_weight", 1.0)
|
| 580 |
+
pipeline._bayesian_mlp_scale = p.get("mlp_max_weight", 1.0)
|
| 581 |
|
| 582 |
elif best_result:
|
| 583 |
pipeline.log(f" Using best combined score: {best_score:.4f}")
|
obliteratus/cli.py
CHANGED
|
@@ -109,7 +109,12 @@ def main(argv: list[str] | None = None):
|
|
| 109 |
],
|
| 110 |
help="Liberation method (default: advanced)",
|
| 111 |
)
|
| 112 |
-
p.add_argument("--n-directions", type=int, default=None, help="Override: number of
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
p.add_argument("--regularization", type=float, default=None, help="Override: fraction to preserve (0.0-1.0)")
|
| 114 |
p.add_argument("--refinement-passes", type=int, default=None, help="Override: number of iterative passes")
|
| 115 |
p.add_argument(
|
|
@@ -591,6 +596,7 @@ def _cmd_abliterate(args):
|
|
| 591 |
dtype=args.dtype,
|
| 592 |
method=method,
|
| 593 |
n_directions=args.n_directions,
|
|
|
|
| 594 |
regularization=args.regularization,
|
| 595 |
refinement_passes=args.refinement_passes,
|
| 596 |
quantization=args.quantization,
|
|
|
|
| 109 |
],
|
| 110 |
help="Liberation method (default: advanced)",
|
| 111 |
)
|
| 112 |
+
p.add_argument("--n-directions", type=int, default=None, help="Override: number of refusal directions to extract")
|
| 113 |
+
p.add_argument(
|
| 114 |
+
"--direction-method", type=str, default=None,
|
| 115 |
+
choices=["diff_means", "svd", "leace"],
|
| 116 |
+
help="Direction extraction method: diff_means (simple, robust), svd (multi-direction), leace (optimal erasure)",
|
| 117 |
+
)
|
| 118 |
p.add_argument("--regularization", type=float, default=None, help="Override: fraction to preserve (0.0-1.0)")
|
| 119 |
p.add_argument("--refinement-passes", type=int, default=None, help="Override: number of iterative passes")
|
| 120 |
p.add_argument(
|
|
|
|
| 596 |
dtype=args.dtype,
|
| 597 |
method=method,
|
| 598 |
n_directions=args.n_directions,
|
| 599 |
+
direction_method=getattr(args, "direction_method", None),
|
| 600 |
regularization=args.regularization,
|
| 601 |
refinement_passes=args.refinement_passes,
|
| 602 |
quantization=args.quantization,
|
obliteratus/informed_pipeline.py
CHANGED
|
@@ -73,15 +73,17 @@ INFORMED_METHOD = {
|
|
| 73 |
"description": (
|
| 74 |
"Runs analysis modules between PROBE and DISTILL to auto-configure "
|
| 75 |
"direction extraction, layer selection, and projection strategy based "
|
| 76 |
-
"on the model's actual refusal geometry."
|
|
|
|
| 77 |
),
|
| 78 |
-
"n_directions":
|
|
|
|
| 79 |
"norm_preserve": True,
|
| 80 |
"regularization": 0.0, # overridden by analysis
|
| 81 |
"refinement_passes": 2, # overridden by analysis
|
| 82 |
"project_biases": True,
|
| 83 |
"use_chat_template": True,
|
| 84 |
-
"use_whitened_svd":
|
| 85 |
"true_iterative_refinement": True,
|
| 86 |
}
|
| 87 |
|
|
@@ -126,7 +128,8 @@ class AnalysisInsights:
|
|
| 126 |
clean_layers: list[int] = field(default_factory=list)
|
| 127 |
|
| 128 |
# Derived configuration
|
| 129 |
-
recommended_n_directions: int =
|
|
|
|
| 130 |
recommended_regularization: float = 0.0
|
| 131 |
recommended_refinement_passes: int = 2
|
| 132 |
recommended_layers: list[int] = field(default_factory=list)
|
|
@@ -217,12 +220,19 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
|
| 217 |
hub_token=hub_token,
|
| 218 |
hub_community_org=hub_community_org,
|
| 219 |
quantization=quantization,
|
| 220 |
-
# Set informed defaults
|
|
|
|
|
|
|
| 221 |
norm_preserve=True,
|
| 222 |
project_biases=True,
|
| 223 |
use_chat_template=True,
|
| 224 |
-
use_whitened_svd=
|
| 225 |
true_iterative_refinement=True,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
)
|
| 227 |
self.method = "informed"
|
| 228 |
|
|
@@ -311,7 +321,11 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
|
| 311 |
if self._run_defense:
|
| 312 |
self._analyze_defense_robustness()
|
| 313 |
|
| 314 |
-
# 5.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
self._derive_configuration()
|
| 316 |
|
| 317 |
elapsed = time.time() - t0
|
|
@@ -392,6 +406,7 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
|
| 392 |
sample_layers = candidate_layers[::step]
|
| 393 |
|
| 394 |
polyhedral_count = 0
|
|
|
|
| 395 |
best_cone_result = None
|
| 396 |
best_strength = 0.0
|
| 397 |
|
|
@@ -405,34 +420,43 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
|
| 405 |
layer_idx=layer_idx,
|
| 406 |
)
|
| 407 |
|
|
|
|
| 408 |
if result.is_polyhedral:
|
| 409 |
polyhedral_count += 1
|
| 410 |
|
| 411 |
-
# Track the strongest layer's cone analysis
|
| 412 |
general_strength = result.general_direction.norm().item() if result.general_direction.numel() > 1 else 0
|
| 413 |
if general_strength > best_strength:
|
| 414 |
best_strength = general_strength
|
| 415 |
best_cone_result = result
|
| 416 |
|
| 417 |
-
if
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
for
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 436 |
else:
|
| 437 |
self.log(" No cone results — using default linear assumption")
|
| 438 |
|
|
@@ -517,6 +541,71 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
|
| 517 |
self.log(f" Most entangled layers: {emap.most_entangled_layers}")
|
| 518 |
self.log(f" Cleanest layers: {emap.least_entangled_layers}")
|
| 519 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 520 |
# ── Configuration Derivation ─────────────────────────────────────
|
| 521 |
|
| 522 |
def _derive_configuration(self):
|
|
@@ -528,18 +617,32 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
|
| 528 |
self.log("-" * 50)
|
| 529 |
insights = self._insights
|
| 530 |
|
| 531 |
-
# 1. n_directions: based on cone geometry
|
| 532 |
-
|
| 533 |
-
|
|
|
|
|
|
|
| 534 |
n_dirs = max(4, min(8, int(insights.cone_dimensionality * 2)))
|
|
|
|
|
|
|
| 535 |
self.log(f" Polyhedral cone (dim={insights.cone_dimensionality:.1f}) "
|
| 536 |
-
f"→ n_directions={n_dirs}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 537 |
else:
|
| 538 |
-
# Linear cone →
|
| 539 |
-
n_dirs =
|
|
|
|
|
|
|
| 540 |
self.log(f" Linear cone (dim={insights.cone_dimensionality:.1f}) "
|
| 541 |
-
f"→ n_directions=
|
| 542 |
insights.recommended_n_directions = n_dirs
|
|
|
|
| 543 |
self.n_directions = n_dirs
|
| 544 |
|
| 545 |
# 2. regularization: based on alignment method + entanglement
|
|
@@ -586,15 +689,22 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
|
| 586 |
|
| 587 |
# 4. Layer selection: cluster-aware + entanglement-gated
|
| 588 |
if insights.cluster_representative_layers:
|
| 589 |
-
# Start from cluster representatives
|
| 590 |
base_layers = list(insights.cluster_representative_layers)
|
| 591 |
|
| 592 |
-
#
|
| 593 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 594 |
for cluster in insights.direction_clusters:
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
|
|
|
|
|
|
| 598 |
|
| 599 |
# Gate: remove highly entangled layers
|
| 600 |
skip = set()
|
|
@@ -621,13 +731,9 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
|
| 621 |
self.log(f" RSI={insights.mean_refusal_sparsity_index:.2f} "
|
| 622 |
f"→ standard dense projection")
|
| 623 |
|
| 624 |
-
# 6.
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
self.log(f" Multi-direction ({n_dirs}) → whitened SVD enabled")
|
| 628 |
-
else:
|
| 629 |
-
self.use_whitened_svd = False
|
| 630 |
-
self.log(" Single direction → standard diff-in-means")
|
| 631 |
|
| 632 |
# ── Informed DISTILL ─────────────────────────────────────────────
|
| 633 |
|
|
@@ -650,7 +756,38 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
|
| 650 |
n_layers = len(self._harmful_means)
|
| 651 |
norms: dict[int, float] = {}
|
| 652 |
|
| 653 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 654 |
from obliteratus.analysis.whitened_svd import WhitenedSVDExtractor
|
| 655 |
whitened_extractor = WhitenedSVDExtractor()
|
| 656 |
self.log(f"Using whitened SVD with {self.n_directions} directions")
|
|
@@ -658,6 +795,29 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
|
| 658 |
whitened_extractor = None
|
| 659 |
|
| 660 |
for idx in range(n_layers):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 661 |
if self.n_directions == 1:
|
| 662 |
diff = (self._harmful_means[idx] - self._harmless_means[idx]).squeeze(0)
|
| 663 |
norm = diff.norm().item()
|
|
@@ -691,6 +851,41 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
|
| 691 |
self.refusal_directions[idx] = primary / primary.norm()
|
| 692 |
norms[idx] = S[:k].sum().item()
|
| 693 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 694 |
# Layer selection: use analysis-recommended layers if available,
|
| 695 |
# otherwise fall back to knee detection
|
| 696 |
if self._insights.recommended_layers:
|
|
@@ -728,15 +923,117 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
|
| 728 |
def _excise_informed(self):
|
| 729 |
"""Excise refusal directions with analysis-informed strategy.
|
| 730 |
|
| 731 |
-
Uses
|
| 732 |
-
|
|
|
|
|
|
|
| 733 |
"""
|
| 734 |
if self._insights.use_sparse_surgery:
|
| 735 |
self._excise_sparse()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 736 |
else:
|
| 737 |
-
|
| 738 |
-
|
| 739 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 740 |
|
| 741 |
def _excise_sparse(self):
|
| 742 |
"""Sparse direction surgery — only modifies high-projection rows."""
|
|
@@ -825,14 +1122,22 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
|
| 825 |
1. Residual refusal signal (via activation probing)
|
| 826 |
2. Self-repair / Ouroboros effect (via defense robustness)
|
| 827 |
3. Triggers additional targeted passes at compensating layers
|
|
|
|
|
|
|
|
|
|
|
|
|
| 828 |
"""
|
| 829 |
# Run standard verification first
|
| 830 |
self._verify()
|
| 831 |
|
| 832 |
# Check if Ouroboros compensation is needed
|
| 833 |
refusal_rate = self._quality_metrics.get("refusal_rate", 0.0)
|
|
|
|
| 834 |
ouroboros_pass = 0
|
| 835 |
|
|
|
|
|
|
|
|
|
|
| 836 |
while (refusal_rate > self._ouroboros_threshold
|
| 837 |
and ouroboros_pass < self._max_ouroboros_passes):
|
| 838 |
ouroboros_pass += 1
|
|
@@ -849,9 +1154,9 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
|
| 849 |
self._distill_inner()
|
| 850 |
self.log(f"Found {len(self._strong_layers)} layers with residual refusal")
|
| 851 |
|
| 852 |
-
# Re-excise at the new strong layers
|
| 853 |
if self._strong_layers:
|
| 854 |
-
self.
|
| 855 |
else:
|
| 856 |
self.log("No strong layers found — stopping Ouroboros compensation")
|
| 857 |
break
|
|
@@ -859,7 +1164,24 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
|
| 859 |
# Re-verify
|
| 860 |
self._verify()
|
| 861 |
refusal_rate = self._quality_metrics.get("refusal_rate", 0.0)
|
| 862 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 863 |
|
| 864 |
self._report.ouroboros_passes = ouroboros_pass
|
| 865 |
self._report.final_refusal_rate = refusal_rate
|
|
@@ -903,6 +1225,7 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
|
| 903 |
},
|
| 904 |
"derived_config": {
|
| 905 |
"n_directions": insights.recommended_n_directions,
|
|
|
|
| 906 |
"regularization": insights.recommended_regularization,
|
| 907 |
"refinement_passes": insights.recommended_refinement_passes,
|
| 908 |
"layers_used": insights.recommended_layers,
|
|
@@ -981,6 +1304,7 @@ class InformedAbliterationPipeline(AbliterationPipeline):
|
|
| 981 |
|
| 982 |
lines.append("Derived Configuration:")
|
| 983 |
lines.append(f" n_directions: {insights.recommended_n_directions}")
|
|
|
|
| 984 |
lines.append(f" regularization: {insights.recommended_regularization}")
|
| 985 |
lines.append(f" refinement_passes: {insights.recommended_refinement_passes}")
|
| 986 |
lines.append(f" sparse surgery: {insights.use_sparse_surgery}")
|
|
|
|
| 73 |
"description": (
|
| 74 |
"Runs analysis modules between PROBE and DISTILL to auto-configure "
|
| 75 |
"direction extraction, layer selection, and projection strategy based "
|
| 76 |
+
"on the model's actual refusal geometry. Defaults to single diff-of-means "
|
| 77 |
+
"direction + Bayesian optimization (Heretic-style)."
|
| 78 |
),
|
| 79 |
+
"n_directions": 1, # overridden by analysis
|
| 80 |
+
"direction_method": "diff_means", # overridden by analysis; "leace" also available
|
| 81 |
"norm_preserve": True,
|
| 82 |
"regularization": 0.0, # overridden by analysis
|
| 83 |
"refinement_passes": 2, # overridden by analysis
|
| 84 |
"project_biases": True,
|
| 85 |
"use_chat_template": True,
|
| 86 |
+
"use_whitened_svd": False, # overridden by analysis
|
| 87 |
"true_iterative_refinement": True,
|
| 88 |
}
|
| 89 |
|
|
|
|
| 128 |
clean_layers: list[int] = field(default_factory=list)
|
| 129 |
|
| 130 |
# Derived configuration
|
| 131 |
+
recommended_n_directions: int = 1
|
| 132 |
+
recommended_direction_method: str = "diff_means"
|
| 133 |
recommended_regularization: float = 0.0
|
| 134 |
recommended_refinement_passes: int = 2
|
| 135 |
recommended_layers: list[int] = field(default_factory=list)
|
|
|
|
| 220 |
hub_token=hub_token,
|
| 221 |
hub_community_org=hub_community_org,
|
| 222 |
quantization=quantization,
|
| 223 |
+
# Set informed defaults: single direction + Bayesian opt
|
| 224 |
+
n_directions=1,
|
| 225 |
+
direction_method="diff_means",
|
| 226 |
norm_preserve=True,
|
| 227 |
project_biases=True,
|
| 228 |
use_chat_template=True,
|
| 229 |
+
use_whitened_svd=False,
|
| 230 |
true_iterative_refinement=True,
|
| 231 |
+
use_kl_optimization=True,
|
| 232 |
+
float_layer_interpolation=True,
|
| 233 |
+
layer_adaptive_strength=True,
|
| 234 |
+
winsorize_activations=True,
|
| 235 |
+
winsorize_percentile=0.01,
|
| 236 |
)
|
| 237 |
self.method = "informed"
|
| 238 |
|
|
|
|
| 321 |
if self._run_defense:
|
| 322 |
self._analyze_defense_robustness()
|
| 323 |
|
| 324 |
+
# 5. Sparse Surgery Analysis (RSI computation)
|
| 325 |
+
if self._run_sparse:
|
| 326 |
+
self._analyze_sparsity()
|
| 327 |
+
|
| 328 |
+
# 6. Derive configuration from insights
|
| 329 |
self._derive_configuration()
|
| 330 |
|
| 331 |
elapsed = time.time() - t0
|
|
|
|
| 406 |
sample_layers = candidate_layers[::step]
|
| 407 |
|
| 408 |
polyhedral_count = 0
|
| 409 |
+
all_results = []
|
| 410 |
best_cone_result = None
|
| 411 |
best_strength = 0.0
|
| 412 |
|
|
|
|
| 420 |
layer_idx=layer_idx,
|
| 421 |
)
|
| 422 |
|
| 423 |
+
all_results.append(result)
|
| 424 |
if result.is_polyhedral:
|
| 425 |
polyhedral_count += 1
|
| 426 |
|
| 427 |
+
# Track the strongest layer's cone analysis for per-category directions
|
| 428 |
general_strength = result.general_direction.norm().item() if result.general_direction.numel() > 1 else 0
|
| 429 |
if general_strength > best_strength:
|
| 430 |
best_strength = general_strength
|
| 431 |
best_cone_result = result
|
| 432 |
|
| 433 |
+
if all_results:
|
| 434 |
+
# Aggregate cone geometry across sampled layers (majority vote +
|
| 435 |
+
# mean dimensionality) instead of relying on a single layer.
|
| 436 |
+
n_sampled = len(all_results)
|
| 437 |
+
is_polyhedral = polyhedral_count > n_sampled / 2
|
| 438 |
+
avg_dimensionality = sum(r.cone_dimensionality for r in all_results) / n_sampled
|
| 439 |
+
avg_pairwise_cos = sum(r.mean_pairwise_cosine for r in all_results) / n_sampled
|
| 440 |
+
|
| 441 |
+
self._insights.cone_is_polyhedral = is_polyhedral
|
| 442 |
+
self._insights.cone_dimensionality = avg_dimensionality
|
| 443 |
+
self._insights.mean_pairwise_cosine = avg_pairwise_cos
|
| 444 |
+
|
| 445 |
+
# Store per-category directions from the strongest layer
|
| 446 |
+
if best_cone_result is not None:
|
| 447 |
+
for cd in best_cone_result.category_directions:
|
| 448 |
+
self._insights.per_category_directions[cd.category] = cd.direction
|
| 449 |
+
self._insights.direction_specificity[cd.category] = cd.specificity
|
| 450 |
+
|
| 451 |
+
cone_type = "POLYHEDRAL" if is_polyhedral else "LINEAR"
|
| 452 |
+
self.log(f" Cone type: {cone_type} (majority vote: {polyhedral_count}/{n_sampled} layers)")
|
| 453 |
+
self.log(f" Avg dimensionality: {avg_dimensionality:.2f}")
|
| 454 |
+
self.log(f" Avg pairwise cosine: {avg_pairwise_cos:.3f}")
|
| 455 |
+
if best_cone_result is not None:
|
| 456 |
+
self.log(f" Categories detected: {best_cone_result.category_count}")
|
| 457 |
+
|
| 458 |
+
for cd in sorted(best_cone_result.category_directions, key=lambda x: -x.strength)[:5]:
|
| 459 |
+
self.log(f" {cd.category:15s} DSI={cd.specificity:.3f} str={cd.strength:.3f}")
|
| 460 |
else:
|
| 461 |
self.log(" No cone results — using default linear assumption")
|
| 462 |
|
|
|
|
| 541 |
self.log(f" Most entangled layers: {emap.most_entangled_layers}")
|
| 542 |
self.log(f" Cleanest layers: {emap.least_entangled_layers}")
|
| 543 |
|
| 544 |
+
def _analyze_sparsity(self):
|
| 545 |
+
"""Compute Refusal Sparsity Index to decide sparse vs dense excision."""
|
| 546 |
+
self.log("\n[5/5] Refusal Sparsity Analysis")
|
| 547 |
+
self.log("-" * 40)
|
| 548 |
+
|
| 549 |
+
from obliteratus.analysis.sparse_surgery import SparseDirectionSurgeon
|
| 550 |
+
from obliteratus.strategies.utils import (
|
| 551 |
+
get_ffn_module,
|
| 552 |
+
get_layer_modules,
|
| 553 |
+
)
|
| 554 |
+
|
| 555 |
+
# Need refusal directions — use quick diff-in-means
|
| 556 |
+
quick_directions = {}
|
| 557 |
+
for idx in sorted(self._harmful_means.keys()):
|
| 558 |
+
diff = (self._harmful_means[idx] - self._harmless_means[idx]).squeeze()
|
| 559 |
+
norm = diff.norm().item()
|
| 560 |
+
if norm > 1e-10:
|
| 561 |
+
quick_directions[idx] = diff / diff.norm()
|
| 562 |
+
|
| 563 |
+
if not quick_directions:
|
| 564 |
+
self.log(" No refusal directions — skipping sparsity analysis")
|
| 565 |
+
return
|
| 566 |
+
|
| 567 |
+
# Gather FFN output weights for representative layers (sample for speed)
|
| 568 |
+
layers = get_layer_modules(self.handle)
|
| 569 |
+
arch = self.handle.architecture
|
| 570 |
+
n_layers = len(layers)
|
| 571 |
+
sample_idxs = sorted(quick_directions.keys())
|
| 572 |
+
step = max(1, len(sample_idxs) // 8)
|
| 573 |
+
sample_idxs = sample_idxs[::step]
|
| 574 |
+
|
| 575 |
+
weights = {}
|
| 576 |
+
sampled_dirs = {}
|
| 577 |
+
for idx in sample_idxs:
|
| 578 |
+
if idx >= n_layers:
|
| 579 |
+
continue
|
| 580 |
+
try:
|
| 581 |
+
ffn = get_ffn_module(layers[idx], arch)
|
| 582 |
+
for name in ["down_proj", "c_proj", "dense_4h_to_h", "fc_out", "fc2", "w2"]:
|
| 583 |
+
proj = getattr(ffn, name, None)
|
| 584 |
+
if proj is not None and hasattr(proj, "weight"):
|
| 585 |
+
W = proj.weight.data
|
| 586 |
+
d = quick_directions[idx]
|
| 587 |
+
if W.shape[-1] == d.shape[0]:
|
| 588 |
+
weights[idx] = W
|
| 589 |
+
sampled_dirs[idx] = d
|
| 590 |
+
break
|
| 591 |
+
except (AttributeError, RuntimeError):
|
| 592 |
+
continue
|
| 593 |
+
|
| 594 |
+
if not weights:
|
| 595 |
+
self.log(" Could not access FFN weights — skipping sparsity analysis")
|
| 596 |
+
return
|
| 597 |
+
|
| 598 |
+
surgeon = SparseDirectionSurgeon(auto_sparsity=True)
|
| 599 |
+
plan = surgeon.plan_surgery(weights, sampled_dirs)
|
| 600 |
+
|
| 601 |
+
self._insights.mean_refusal_sparsity_index = plan.mean_refusal_sparsity_index
|
| 602 |
+
self._insights.recommended_sparsity = plan.recommended_sparsity
|
| 603 |
+
|
| 604 |
+
self.log(f" Mean RSI: {plan.mean_refusal_sparsity_index:.3f}")
|
| 605 |
+
self.log(f" Recommended sparsity: {plan.recommended_sparsity:.1%}")
|
| 606 |
+
self.log(f" Most sparse layer: {plan.most_sparse_layer}")
|
| 607 |
+
self.log(f" Most dense layer: {plan.most_dense_layer}")
|
| 608 |
+
|
| 609 |
# ── Configuration Derivation ─────────────────────────────────────
|
| 610 |
|
| 611 |
def _derive_configuration(self):
|
|
|
|
| 617 |
self.log("-" * 50)
|
| 618 |
insights = self._insights
|
| 619 |
|
| 620 |
+
# 1. n_directions + direction_method: based on cone geometry
|
| 621 |
+
# Default: single direction via diff-of-means (proven most robust).
|
| 622 |
+
# Only escalate to multi-direction when analysis confirms polyhedral geometry.
|
| 623 |
+
if insights.cone_is_polyhedral and insights.cone_dimensionality > 2.0:
|
| 624 |
+
# Clearly polyhedral cone → use multiple directions via SVD
|
| 625 |
n_dirs = max(4, min(8, int(insights.cone_dimensionality * 2)))
|
| 626 |
+
self.direction_method = "svd"
|
| 627 |
+
self.use_whitened_svd = True
|
| 628 |
self.log(f" Polyhedral cone (dim={insights.cone_dimensionality:.1f}) "
|
| 629 |
+
f"→ n_directions={n_dirs}, method=svd (whitened)")
|
| 630 |
+
elif insights.cone_is_polyhedral:
|
| 631 |
+
# Mildly polyhedral → LEACE gives better single-direction erasure
|
| 632 |
+
n_dirs = 1
|
| 633 |
+
self.direction_method = "leace"
|
| 634 |
+
self.use_whitened_svd = False
|
| 635 |
+
self.log(f" Mildly polyhedral (dim={insights.cone_dimensionality:.1f}) "
|
| 636 |
+
f"→ n_directions=1, method=leace")
|
| 637 |
else:
|
| 638 |
+
# Linear cone → single direction via diff-of-means (simplest, most robust)
|
| 639 |
+
n_dirs = 1
|
| 640 |
+
self.direction_method = "diff_means"
|
| 641 |
+
self.use_whitened_svd = False
|
| 642 |
self.log(f" Linear cone (dim={insights.cone_dimensionality:.1f}) "
|
| 643 |
+
f"→ n_directions=1, method=diff_means")
|
| 644 |
insights.recommended_n_directions = n_dirs
|
| 645 |
+
insights.recommended_direction_method = self.direction_method
|
| 646 |
self.n_directions = n_dirs
|
| 647 |
|
| 648 |
# 2. regularization: based on alignment method + entanglement
|
|
|
|
| 689 |
|
| 690 |
# 4. Layer selection: cluster-aware + entanglement-gated
|
| 691 |
if insights.cluster_representative_layers:
|
| 692 |
+
# Start from cluster representatives (strongest per cluster)
|
| 693 |
base_layers = list(insights.cluster_representative_layers)
|
| 694 |
|
| 695 |
+
# Conservative expansion: for each cluster, add at most the top-2
|
| 696 |
+
# strongest layers (by refusal norm) beyond the representative,
|
| 697 |
+
# to avoid over-modifying weak layers in large clusters.
|
| 698 |
+
norms = {}
|
| 699 |
+
for idx in self._harmful_means:
|
| 700 |
+
if idx in self._harmless_means:
|
| 701 |
+
norms[idx] = (self._harmful_means[idx] - self._harmless_means[idx]).squeeze().norm().item()
|
| 702 |
for cluster in insights.direction_clusters:
|
| 703 |
+
ranked = sorted(cluster, key=lambda ly: norms.get(ly, 0), reverse=True)
|
| 704 |
+
# Add up to 2 additional strong layers per cluster
|
| 705 |
+
for ly in ranked[:3]: # representative + up to 2 more
|
| 706 |
+
base_layers.append(ly)
|
| 707 |
+
base_layers = sorted(set(base_layers))
|
| 708 |
|
| 709 |
# Gate: remove highly entangled layers
|
| 710 |
skip = set()
|
|
|
|
| 731 |
self.log(f" RSI={insights.mean_refusal_sparsity_index:.2f} "
|
| 732 |
f"→ standard dense projection")
|
| 733 |
|
| 734 |
+
# 6. Direction method summary (already set in step 1)
|
| 735 |
+
self.log(f" Direction method: {self.direction_method} "
|
| 736 |
+
f"(whitened_svd={'on' if self.use_whitened_svd else 'off'})")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 737 |
|
| 738 |
# ── Informed DISTILL ─────────────────────────────────────────────
|
| 739 |
|
|
|
|
| 756 |
n_layers = len(self._harmful_means)
|
| 757 |
norms: dict[int, float] = {}
|
| 758 |
|
| 759 |
+
# ── Small-model direction cap (matching base _distill) ────────
|
| 760 |
+
# On small models, each SVD direction removes a proportionally
|
| 761 |
+
# larger fraction of weight energy. Cap to prevent over-ablation.
|
| 762 |
+
hidden_size = self.handle.hidden_size if self.handle else 0
|
| 763 |
+
total_params = getattr(self.handle, 'total_params', 0) if self.handle else 0
|
| 764 |
+
if total_params == 0 and self.handle:
|
| 765 |
+
try:
|
| 766 |
+
total_params = sum(p.numel() for p in self.handle.model.parameters())
|
| 767 |
+
except Exception:
|
| 768 |
+
pass
|
| 769 |
+
if self.n_directions > 1 and (
|
| 770 |
+
(0 < hidden_size < 2048)
|
| 771 |
+
or (0 < total_params < 2_000_000_000)
|
| 772 |
+
or n_layers <= 16
|
| 773 |
+
):
|
| 774 |
+
max_dirs = max(1, min(self.n_directions, 2))
|
| 775 |
+
if max_dirs < self.n_directions:
|
| 776 |
+
self.log(
|
| 777 |
+
f"Capped n_directions from {self.n_directions} to {max_dirs} "
|
| 778 |
+
f"for small model (hidden={hidden_size}, "
|
| 779 |
+
f"params={total_params / 1e9:.1f}B, layers={n_layers})"
|
| 780 |
+
)
|
| 781 |
+
self.n_directions = max_dirs
|
| 782 |
+
|
| 783 |
+
# LEACE extractor for optimal concept erasure
|
| 784 |
+
leace_extractor = None
|
| 785 |
+
if self.direction_method == "leace":
|
| 786 |
+
from obliteratus.analysis.leace import LEACEExtractor
|
| 787 |
+
leace_extractor = LEACEExtractor()
|
| 788 |
+
self.log(f"Using LEACE (closed-form optimal concept erasure)")
|
| 789 |
+
|
| 790 |
+
if self.use_whitened_svd and self.n_directions > 1 and leace_extractor is None:
|
| 791 |
from obliteratus.analysis.whitened_svd import WhitenedSVDExtractor
|
| 792 |
whitened_extractor = WhitenedSVDExtractor()
|
| 793 |
self.log(f"Using whitened SVD with {self.n_directions} directions")
|
|
|
|
| 795 |
whitened_extractor = None
|
| 796 |
|
| 797 |
for idx in range(n_layers):
|
| 798 |
+
# LEACE path: theoretically optimal single-direction erasure
|
| 799 |
+
if leace_extractor is not None:
|
| 800 |
+
if idx in self._harmful_acts and idx in self._harmless_acts:
|
| 801 |
+
try:
|
| 802 |
+
l_result = leace_extractor.extract(
|
| 803 |
+
self._harmful_acts[idx],
|
| 804 |
+
self._harmless_acts[idx],
|
| 805 |
+
layer_idx=idx,
|
| 806 |
+
)
|
| 807 |
+
self.refusal_directions[idx] = l_result.direction
|
| 808 |
+
self.refusal_subspaces[idx] = l_result.direction.unsqueeze(0)
|
| 809 |
+
norms[idx] = l_result.generalized_eigenvalue
|
| 810 |
+
|
| 811 |
+
if idx < 5 or idx == n_layers - 1:
|
| 812 |
+
self.log(
|
| 813 |
+
f" layer {idx}: LEACE eigenvalue={l_result.generalized_eigenvalue:.4f}, "
|
| 814 |
+
f"erasure_loss={l_result.erasure_loss:.4f}"
|
| 815 |
+
)
|
| 816 |
+
continue
|
| 817 |
+
except Exception as e:
|
| 818 |
+
if idx < 5:
|
| 819 |
+
self.log(f" layer {idx}: LEACE failed ({e}), falling back")
|
| 820 |
+
|
| 821 |
if self.n_directions == 1:
|
| 822 |
diff = (self._harmful_means[idx] - self._harmless_means[idx]).squeeze(0)
|
| 823 |
norm = diff.norm().item()
|
|
|
|
| 851 |
self.refusal_directions[idx] = primary / primary.norm()
|
| 852 |
norms[idx] = S[:k].sum().item()
|
| 853 |
|
| 854 |
+
# Enrich subspaces with per-category cone directions when available.
|
| 855 |
+
# This uses the actual refusal cone generators instead of purely
|
| 856 |
+
# data-agnostic SVD components.
|
| 857 |
+
cat_dirs = self._insights.per_category_directions
|
| 858 |
+
if cat_dirs and self._insights.cone_is_polyhedral and self.n_directions > 1:
|
| 859 |
+
cat_tensors = list(cat_dirs.values())
|
| 860 |
+
# Stack and orthogonalize category directions
|
| 861 |
+
cat_stack = torch.stack(cat_tensors) # (n_cats, hidden)
|
| 862 |
+
cat_norms = cat_stack.norm(dim=1, keepdim=True).clamp(min=1e-8)
|
| 863 |
+
cat_stack = cat_stack / cat_norms
|
| 864 |
+
# Blend into strong-signal layers: replace later SVD components
|
| 865 |
+
# with category directions (which are geometrically meaningful)
|
| 866 |
+
n_cat = cat_stack.shape[0]
|
| 867 |
+
for idx in norms:
|
| 868 |
+
sub = self.refusal_subspaces.get(idx)
|
| 869 |
+
if sub is None or sub.shape[0] <= 1:
|
| 870 |
+
continue
|
| 871 |
+
# Keep the first SVD direction (strongest), replace remaining
|
| 872 |
+
# with category directions projected to be orthogonal to it
|
| 873 |
+
primary = sub[0:1] # (1, hidden)
|
| 874 |
+
# Project category directions orthogonal to primary
|
| 875 |
+
cos = (cat_stack @ primary.squeeze(0)) # (n_cat,)
|
| 876 |
+
ortho_cats = cat_stack - cos.unsqueeze(1) * primary
|
| 877 |
+
ortho_norms = ortho_cats.norm(dim=1)
|
| 878 |
+
# Keep only directions that survived orthogonalization
|
| 879 |
+
valid = ortho_norms > 0.1
|
| 880 |
+
if valid.sum() > 0:
|
| 881 |
+
ortho_cats = ortho_cats[valid]
|
| 882 |
+
ortho_cats = ortho_cats / ortho_cats.norm(dim=1, keepdim=True)
|
| 883 |
+
# Take up to (n_directions - 1) category directions
|
| 884 |
+
n_take = min(self.n_directions - 1, ortho_cats.shape[0])
|
| 885 |
+
new_sub = torch.cat([primary, ortho_cats[:n_take]], dim=0)
|
| 886 |
+
self.refusal_subspaces[idx] = new_sub
|
| 887 |
+
self.log(f"Enriched subspaces with {n_cat} per-category cone directions")
|
| 888 |
+
|
| 889 |
# Layer selection: use analysis-recommended layers if available,
|
| 890 |
# otherwise fall back to knee detection
|
| 891 |
if self._insights.recommended_layers:
|
|
|
|
| 923 |
def _excise_informed(self):
|
| 924 |
"""Excise refusal directions with analysis-informed strategy.
|
| 925 |
|
| 926 |
+
Uses Bayesian optimization (when available) with analysis-derived
|
| 927 |
+
warm-start parameters, falling back to sparse surgery or standard
|
| 928 |
+
projection. This is the key integration: analysis maps the geometry,
|
| 929 |
+
Bayesian optimization finds the optimal projection strength.
|
| 930 |
"""
|
| 931 |
if self._insights.use_sparse_surgery:
|
| 932 |
self._excise_sparse()
|
| 933 |
+
return
|
| 934 |
+
|
| 935 |
+
# Enable Bayesian optimization using analysis insights for warm-start.
|
| 936 |
+
# The analysis provides much better initial parameters than the default
|
| 937 |
+
# heuristic (strongest-layer-based peak), dramatically narrowing the
|
| 938 |
+
# search space and improving convergence.
|
| 939 |
+
self._configure_bayesian_warm_start()
|
| 940 |
+
self._excise()
|
| 941 |
+
|
| 942 |
+
def _configure_bayesian_warm_start(self):
|
| 943 |
+
"""Configure Bayesian optimization with analysis-derived warm-start.
|
| 944 |
+
|
| 945 |
+
Translates analysis insights into a much tighter search space:
|
| 946 |
+
- peak_position from cluster representative layers
|
| 947 |
+
- spread from cluster structure (narrow clusters → narrow spread)
|
| 948 |
+
- component scaling from entanglement analysis
|
| 949 |
+
- KL budget from alignment method detection
|
| 950 |
+
"""
|
| 951 |
+
insights = self._insights
|
| 952 |
+
|
| 953 |
+
# Enable Bayesian optimization (50 trials default, same as heretic)
|
| 954 |
+
self._bayesian_trials = 50
|
| 955 |
+
|
| 956 |
+
# Also set heretic-compatible flags on the pipeline so the base
|
| 957 |
+
# _excise_inner() picks them up during Bayesian optimization.
|
| 958 |
+
self.layer_adaptive_strength = True
|
| 959 |
+
self.float_layer_interpolation = True
|
| 960 |
+
self.use_kl_optimization = True
|
| 961 |
+
|
| 962 |
+
# KL budget: tighter for methods that are fragile (CAI, RLHF),
|
| 963 |
+
# looser for concentrated methods (DPO, SFT).
|
| 964 |
+
method = insights.detected_alignment_method
|
| 965 |
+
if method == "dpo":
|
| 966 |
+
self.kl_budget = 0.5
|
| 967 |
+
elif method == "rlhf":
|
| 968 |
+
self.kl_budget = 0.3
|
| 969 |
+
elif method == "cai":
|
| 970 |
+
self.kl_budget = 0.2
|
| 971 |
+
elif method == "sft":
|
| 972 |
+
self.kl_budget = 0.4
|
| 973 |
+
else:
|
| 974 |
+
self.kl_budget = 0.35
|
| 975 |
+
|
| 976 |
+
self.log(f"Bayesian optimization enabled (50 trials, KL budget={self.kl_budget})")
|
| 977 |
+
self.log("Analysis insights will warm-start the optimizer")
|
| 978 |
+
|
| 979 |
+
# Compute analysis-derived warm-start for the parametric kernel.
|
| 980 |
+
# The Bayesian optimizer reads these from the pipeline if present.
|
| 981 |
+
n_layers = len(self._harmful_means) if self._harmful_means else 32
|
| 982 |
+
if insights.cluster_representative_layers and n_layers > 1:
|
| 983 |
+
# Peak position: normalized position of the strongest cluster rep
|
| 984 |
+
norms = {}
|
| 985 |
+
for idx in self._harmful_means:
|
| 986 |
+
if idx in self._harmless_means:
|
| 987 |
+
norms[idx] = (self._harmful_means[idx] - self._harmless_means[idx]).squeeze().norm().item()
|
| 988 |
+
reps = insights.cluster_representative_layers
|
| 989 |
+
if norms:
|
| 990 |
+
best_rep = max(reps, key=lambda ly: norms.get(ly, 0))
|
| 991 |
+
else:
|
| 992 |
+
best_rep = reps[len(reps) // 2]
|
| 993 |
+
warm_peak = best_rep / max(n_layers - 1, 1)
|
| 994 |
+
|
| 995 |
+
# Spread: narrow if clusters are tight, wide if clusters span many layers
|
| 996 |
+
if insights.direction_clusters:
|
| 997 |
+
cluster_widths = [
|
| 998 |
+
(max(c) - min(c)) / max(n_layers - 1, 1)
|
| 999 |
+
for c in insights.direction_clusters if len(c) > 1
|
| 1000 |
+
]
|
| 1001 |
+
warm_spread = max(0.1, min(0.6, sum(cluster_widths) / len(cluster_widths) if cluster_widths else 0.3))
|
| 1002 |
+
else:
|
| 1003 |
+
warm_spread = 0.3
|
| 1004 |
+
|
| 1005 |
+
# Min weight: higher if high persistence (refusal spread across all layers)
|
| 1006 |
+
warm_min = min(0.3, max(0.0, insights.direction_persistence * 0.2))
|
| 1007 |
+
|
| 1008 |
+
# Attn/MLP scaling: reduce MLP scaling if entanglement is high
|
| 1009 |
+
# (MLP projections cause more capability damage)
|
| 1010 |
+
if insights.entanglement_score > 0.5:
|
| 1011 |
+
warm_mlp = 0.4
|
| 1012 |
+
warm_attn = 0.7
|
| 1013 |
+
else:
|
| 1014 |
+
warm_mlp = 0.6
|
| 1015 |
+
warm_attn = 0.8
|
| 1016 |
else:
|
| 1017 |
+
warm_peak = 0.5
|
| 1018 |
+
warm_spread = 0.3
|
| 1019 |
+
warm_min = 0.05
|
| 1020 |
+
warm_mlp = 0.6
|
| 1021 |
+
warm_attn = 0.8
|
| 1022 |
+
|
| 1023 |
+
# Store warm-start params for the Bayesian optimizer to pick up
|
| 1024 |
+
self._informed_warm_start = {
|
| 1025 |
+
"max_weight": 0.9,
|
| 1026 |
+
"peak_position": warm_peak,
|
| 1027 |
+
"min_weight": warm_min,
|
| 1028 |
+
"spread": warm_spread,
|
| 1029 |
+
"attn_scale": warm_attn,
|
| 1030 |
+
"mlp_scale": warm_mlp,
|
| 1031 |
+
"dir_idx": 0.0,
|
| 1032 |
+
}
|
| 1033 |
+
self.log(
|
| 1034 |
+
f" Warm-start: peak={warm_peak:.2f}, spread={warm_spread:.2f}, "
|
| 1035 |
+
f"min={warm_min:.2f}, attn={warm_attn:.2f}, mlp={warm_mlp:.2f}"
|
| 1036 |
+
)
|
| 1037 |
|
| 1038 |
def _excise_sparse(self):
|
| 1039 |
"""Sparse direction surgery — only modifies high-projection rows."""
|
|
|
|
| 1122 |
1. Residual refusal signal (via activation probing)
|
| 1123 |
2. Self-repair / Ouroboros effect (via defense robustness)
|
| 1124 |
3. Triggers additional targeted passes at compensating layers
|
| 1125 |
+
|
| 1126 |
+
KL-gated: stops early if model damage (KL divergence) is getting
|
| 1127 |
+
worse even though refusal persists. This prevents the death spiral
|
| 1128 |
+
where each pass damages the model without removing refusal.
|
| 1129 |
"""
|
| 1130 |
# Run standard verification first
|
| 1131 |
self._verify()
|
| 1132 |
|
| 1133 |
# Check if Ouroboros compensation is needed
|
| 1134 |
refusal_rate = self._quality_metrics.get("refusal_rate", 0.0)
|
| 1135 |
+
prev_kl = self._quality_metrics.get("kl_divergence", 0.0)
|
| 1136 |
ouroboros_pass = 0
|
| 1137 |
|
| 1138 |
+
# KL budget: stop if KL exceeds this threshold (model too damaged)
|
| 1139 |
+
kl_ceiling = getattr(self, "kl_budget", 0.5) * 2.0 # 2x budget as hard ceiling
|
| 1140 |
+
|
| 1141 |
while (refusal_rate > self._ouroboros_threshold
|
| 1142 |
and ouroboros_pass < self._max_ouroboros_passes):
|
| 1143 |
ouroboros_pass += 1
|
|
|
|
| 1154 |
self._distill_inner()
|
| 1155 |
self.log(f"Found {len(self._strong_layers)} layers with residual refusal")
|
| 1156 |
|
| 1157 |
+
# Re-excise at the new strong layers using informed strategy
|
| 1158 |
if self._strong_layers:
|
| 1159 |
+
self._excise_informed()
|
| 1160 |
else:
|
| 1161 |
self.log("No strong layers found — stopping Ouroboros compensation")
|
| 1162 |
break
|
|
|
|
| 1164 |
# Re-verify
|
| 1165 |
self._verify()
|
| 1166 |
refusal_rate = self._quality_metrics.get("refusal_rate", 0.0)
|
| 1167 |
+
current_kl = self._quality_metrics.get("kl_divergence", 0.0)
|
| 1168 |
+
self.log(f"After Ouroboros pass {ouroboros_pass}: refusal={refusal_rate:.0%}, KL={current_kl:.4f}")
|
| 1169 |
+
|
| 1170 |
+
# KL-gated early stopping: if KL is rising and exceeds ceiling,
|
| 1171 |
+
# the model is being damaged faster than refusal is being removed.
|
| 1172 |
+
if current_kl > kl_ceiling:
|
| 1173 |
+
self.log(
|
| 1174 |
+
f"KL divergence {current_kl:.4f} exceeds ceiling {kl_ceiling:.4f} — "
|
| 1175 |
+
f"stopping to prevent further model damage"
|
| 1176 |
+
)
|
| 1177 |
+
break
|
| 1178 |
+
if ouroboros_pass > 1 and current_kl > prev_kl * 1.5 and refusal_rate > 0.3:
|
| 1179 |
+
self.log(
|
| 1180 |
+
f"KL rising sharply ({prev_kl:.4f} → {current_kl:.4f}) with "
|
| 1181 |
+
f"refusal still at {refusal_rate:.0%} — stopping (diminishing returns)"
|
| 1182 |
+
)
|
| 1183 |
+
break
|
| 1184 |
+
prev_kl = current_kl
|
| 1185 |
|
| 1186 |
self._report.ouroboros_passes = ouroboros_pass
|
| 1187 |
self._report.final_refusal_rate = refusal_rate
|
|
|
|
| 1225 |
},
|
| 1226 |
"derived_config": {
|
| 1227 |
"n_directions": insights.recommended_n_directions,
|
| 1228 |
+
"direction_method": insights.recommended_direction_method,
|
| 1229 |
"regularization": insights.recommended_regularization,
|
| 1230 |
"refinement_passes": insights.recommended_refinement_passes,
|
| 1231 |
"layers_used": insights.recommended_layers,
|
|
|
|
| 1304 |
|
| 1305 |
lines.append("Derived Configuration:")
|
| 1306 |
lines.append(f" n_directions: {insights.recommended_n_directions}")
|
| 1307 |
+
lines.append(f" direction_method: {insights.recommended_direction_method}")
|
| 1308 |
lines.append(f" regularization: {insights.recommended_regularization}")
|
| 1309 |
lines.append(f" refinement_passes: {insights.recommended_refinement_passes}")
|
| 1310 |
lines.append(f" sparse surgery: {insights.use_sparse_surgery}")
|
scripts/run_benchmark_remote.sh
CHANGED
|
@@ -144,12 +144,18 @@ def _patched_collect(self, layer_modules, prompts, label):
|
|
| 144 |
torch.cuda.mem_get_info(i)[0] / (1024 ** 3)
|
| 145 |
for i in range(torch.cuda.device_count())
|
| 146 |
)
|
| 147 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
max_length = 64
|
| 149 |
-
self.log(f" Low GPU memory ({free_gb:.1f} GB free), using max_length={max_length}")
|
| 150 |
-
elif free_gb <
|
| 151 |
max_length = 128
|
| 152 |
-
self.log(f" Tight GPU memory ({free_gb:.1f} GB free), using max_length={max_length}")
|
| 153 |
|
| 154 |
device = self._get_model_device(model)
|
| 155 |
|
|
|
|
| 144 |
torch.cuda.mem_get_info(i)[0] / (1024 ** 3)
|
| 145 |
for i in range(torch.cuda.device_count())
|
| 146 |
)
|
| 147 |
+
# Scale thresholds by model size (baseline: 7B with hidden=4096, 32 layers)
|
| 148 |
+
_h = self.handle.hidden_size if self.handle else 4096
|
| 149 |
+
_l = n_layers if n_layers else 32
|
| 150 |
+
_ms = (_h / 4096) * (_l / 32)
|
| 151 |
+
_tight = max(4.0 * _ms, 0.5)
|
| 152 |
+
_low = max(2.0 * _ms, 0.25)
|
| 153 |
+
if free_gb < _low:
|
| 154 |
max_length = 64
|
| 155 |
+
self.log(f" Low GPU memory ({free_gb:.1f} GB free, threshold {_low:.1f} GB), using max_length={max_length}")
|
| 156 |
+
elif free_gb < _tight:
|
| 157 |
max_length = 128
|
| 158 |
+
self.log(f" Tight GPU memory ({free_gb:.1f} GB free, threshold {_tight:.1f} GB), using max_length={max_length}")
|
| 159 |
|
| 160 |
device = self._get_model_device(model)
|
| 161 |
|
tests/test_informed_pipeline.py
CHANGED
|
@@ -50,7 +50,8 @@ class TestAnalysisInsights:
|
|
| 50 |
assert insights.cluster_count == 0
|
| 51 |
assert insights.direction_persistence == 0.0
|
| 52 |
assert insights.use_sparse_surgery is False
|
| 53 |
-
assert insights.recommended_n_directions ==
|
|
|
|
| 54 |
assert insights.recommended_regularization == 0.0
|
| 55 |
assert insights.recommended_refinement_passes == 2
|
| 56 |
assert insights.recommended_layers == []
|
|
@@ -86,12 +87,16 @@ class TestInformedMethod:
|
|
| 86 |
assert cfg["norm_preserve"] is True
|
| 87 |
assert cfg["project_biases"] is True
|
| 88 |
assert cfg["use_chat_template"] is True
|
| 89 |
-
assert cfg["use_whitened_svd"] is
|
| 90 |
assert cfg["true_iterative_refinement"] is True
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
def test_informed_method_standalone(self):
|
| 93 |
assert INFORMED_METHOD["label"] == "Informed (Analysis-Guided)"
|
| 94 |
-
assert INFORMED_METHOD["n_directions"] ==
|
|
|
|
| 95 |
assert INFORMED_METHOD["norm_preserve"] is True
|
| 96 |
|
| 97 |
|
|
@@ -121,8 +126,10 @@ class TestPipelineInit:
|
|
| 121 |
assert pipeline.norm_preserve is True
|
| 122 |
assert pipeline.project_biases is True
|
| 123 |
assert pipeline.use_chat_template is True
|
| 124 |
-
assert pipeline.use_whitened_svd is
|
| 125 |
assert pipeline.true_iterative_refinement is True
|
|
|
|
|
|
|
| 126 |
|
| 127 |
def test_custom_flags(self):
|
| 128 |
p = InformedAbliterationPipeline(
|
|
@@ -162,17 +169,31 @@ class TestConfigurationDerivation:
|
|
| 162 |
cone_dimensionality=3.5,
|
| 163 |
)
|
| 164 |
p._derive_configuration()
|
| 165 |
-
#
|
| 166 |
assert p.n_directions == 7
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
|
| 168 |
-
def
|
| 169 |
p = self._make_pipeline_with_insights(
|
| 170 |
cone_is_polyhedral=False,
|
| 171 |
cone_dimensionality=1.0,
|
| 172 |
)
|
| 173 |
p._derive_configuration()
|
| 174 |
-
# Linear
|
| 175 |
-
assert p.n_directions ==
|
|
|
|
|
|
|
| 176 |
|
| 177 |
def test_dpo_zero_regularization(self):
|
| 178 |
p = self._make_pipeline_with_insights(
|
|
@@ -282,6 +303,7 @@ class TestConfigurationDerivation:
|
|
| 282 |
p._derive_configuration()
|
| 283 |
assert p.n_directions > 1
|
| 284 |
assert p.use_whitened_svd is True
|
|
|
|
| 285 |
|
| 286 |
def test_no_whitened_svd_for_single_direction(self):
|
| 287 |
p = self._make_pipeline_with_insights(
|
|
@@ -289,9 +311,9 @@ class TestConfigurationDerivation:
|
|
| 289 |
cone_dimensionality=0.5,
|
| 290 |
)
|
| 291 |
p._derive_configuration()
|
| 292 |
-
# dim 0.5 → max(1, min(4, int(0.5+1))) = 1
|
| 293 |
assert p.n_directions == 1
|
| 294 |
assert p.use_whitened_svd is False
|
|
|
|
| 295 |
|
| 296 |
|
| 297 |
# ---------------------------------------------------------------------------
|
|
@@ -320,10 +342,12 @@ class TestFormatInsights:
|
|
| 320 |
|
| 321 |
def test_format_includes_derived_config(self, insights):
|
| 322 |
insights.recommended_n_directions = 6
|
|
|
|
| 323 |
insights.recommended_regularization = 0.2
|
| 324 |
insights.recommended_refinement_passes = 3
|
| 325 |
text = InformedAbliterationPipeline.format_insights(insights)
|
| 326 |
assert "n_directions: 6" in text
|
|
|
|
| 327 |
assert "regularization: 0.2" in text
|
| 328 |
assert "refinement_passes: 3" in text
|
| 329 |
|
|
@@ -372,14 +396,16 @@ class TestEdgeCases:
|
|
| 372 |
model_name="test",
|
| 373 |
on_log=lambda m: None,
|
| 374 |
)
|
| 375 |
-
# Very high dimensionality
|
| 376 |
p._insights.cone_is_polyhedral = True
|
| 377 |
p._insights.cone_dimensionality = 10.0
|
| 378 |
p._derive_configuration()
|
| 379 |
assert p.n_directions <= 8 # capped
|
|
|
|
| 380 |
|
| 381 |
-
# Very low dimensionality
|
| 382 |
p._insights.cone_is_polyhedral = False
|
| 383 |
p._insights.cone_dimensionality = 0.1
|
| 384 |
p._derive_configuration()
|
| 385 |
-
assert p.n_directions
|
|
|
|
|
|
| 50 |
assert insights.cluster_count == 0
|
| 51 |
assert insights.direction_persistence == 0.0
|
| 52 |
assert insights.use_sparse_surgery is False
|
| 53 |
+
assert insights.recommended_n_directions == 1
|
| 54 |
+
assert insights.recommended_direction_method == "diff_means"
|
| 55 |
assert insights.recommended_regularization == 0.0
|
| 56 |
assert insights.recommended_refinement_passes == 2
|
| 57 |
assert insights.recommended_layers == []
|
|
|
|
| 87 |
assert cfg["norm_preserve"] is True
|
| 88 |
assert cfg["project_biases"] is True
|
| 89 |
assert cfg["use_chat_template"] is True
|
| 90 |
+
assert cfg["use_whitened_svd"] is False
|
| 91 |
assert cfg["true_iterative_refinement"] is True
|
| 92 |
+
assert cfg["n_directions"] == 1
|
| 93 |
+
assert cfg["direction_method"] == "diff_means"
|
| 94 |
+
assert cfg["use_kl_optimization"] is True
|
| 95 |
|
| 96 |
def test_informed_method_standalone(self):
|
| 97 |
assert INFORMED_METHOD["label"] == "Informed (Analysis-Guided)"
|
| 98 |
+
assert INFORMED_METHOD["n_directions"] == 1
|
| 99 |
+
assert INFORMED_METHOD["direction_method"] == "diff_means"
|
| 100 |
assert INFORMED_METHOD["norm_preserve"] is True
|
| 101 |
|
| 102 |
|
|
|
|
| 126 |
assert pipeline.norm_preserve is True
|
| 127 |
assert pipeline.project_biases is True
|
| 128 |
assert pipeline.use_chat_template is True
|
| 129 |
+
assert pipeline.use_whitened_svd is False
|
| 130 |
assert pipeline.true_iterative_refinement is True
|
| 131 |
+
assert pipeline.direction_method == "diff_means"
|
| 132 |
+
assert pipeline.n_directions == 1
|
| 133 |
|
| 134 |
def test_custom_flags(self):
|
| 135 |
p = InformedAbliterationPipeline(
|
|
|
|
| 169 |
cone_dimensionality=3.5,
|
| 170 |
)
|
| 171 |
p._derive_configuration()
|
| 172 |
+
# Clearly polyhedral with dim 3.5 > 2.0 → SVD multi-direction
|
| 173 |
assert p.n_directions == 7
|
| 174 |
+
assert p.direction_method == "svd"
|
| 175 |
+
assert p.use_whitened_svd is True
|
| 176 |
+
|
| 177 |
+
def test_mildly_polyhedral_uses_leace(self):
|
| 178 |
+
p = self._make_pipeline_with_insights(
|
| 179 |
+
cone_is_polyhedral=True,
|
| 180 |
+
cone_dimensionality=1.5,
|
| 181 |
+
)
|
| 182 |
+
p._derive_configuration()
|
| 183 |
+
# Mildly polyhedral (dim <= 2.0) → single LEACE direction
|
| 184 |
+
assert p.n_directions == 1
|
| 185 |
+
assert p.direction_method == "leace"
|
| 186 |
|
| 187 |
+
def test_linear_cone_uses_diff_means(self):
|
| 188 |
p = self._make_pipeline_with_insights(
|
| 189 |
cone_is_polyhedral=False,
|
| 190 |
cone_dimensionality=1.0,
|
| 191 |
)
|
| 192 |
p._derive_configuration()
|
| 193 |
+
# Linear cone → single diff-of-means direction
|
| 194 |
+
assert p.n_directions == 1
|
| 195 |
+
assert p.direction_method == "diff_means"
|
| 196 |
+
assert p.use_whitened_svd is False
|
| 197 |
|
| 198 |
def test_dpo_zero_regularization(self):
|
| 199 |
p = self._make_pipeline_with_insights(
|
|
|
|
| 303 |
p._derive_configuration()
|
| 304 |
assert p.n_directions > 1
|
| 305 |
assert p.use_whitened_svd is True
|
| 306 |
+
assert p.direction_method == "svd"
|
| 307 |
|
| 308 |
def test_no_whitened_svd_for_single_direction(self):
|
| 309 |
p = self._make_pipeline_with_insights(
|
|
|
|
| 311 |
cone_dimensionality=0.5,
|
| 312 |
)
|
| 313 |
p._derive_configuration()
|
|
|
|
| 314 |
assert p.n_directions == 1
|
| 315 |
assert p.use_whitened_svd is False
|
| 316 |
+
assert p.direction_method == "diff_means"
|
| 317 |
|
| 318 |
|
| 319 |
# ---------------------------------------------------------------------------
|
|
|
|
| 342 |
|
| 343 |
def test_format_includes_derived_config(self, insights):
|
| 344 |
insights.recommended_n_directions = 6
|
| 345 |
+
insights.recommended_direction_method = "svd"
|
| 346 |
insights.recommended_regularization = 0.2
|
| 347 |
insights.recommended_refinement_passes = 3
|
| 348 |
text = InformedAbliterationPipeline.format_insights(insights)
|
| 349 |
assert "n_directions: 6" in text
|
| 350 |
+
assert "direction_method: svd" in text
|
| 351 |
assert "regularization: 0.2" in text
|
| 352 |
assert "refinement_passes: 3" in text
|
| 353 |
|
|
|
|
| 396 |
model_name="test",
|
| 397 |
on_log=lambda m: None,
|
| 398 |
)
|
| 399 |
+
# Very high dimensionality → multi-direction SVD, capped at 8
|
| 400 |
p._insights.cone_is_polyhedral = True
|
| 401 |
p._insights.cone_dimensionality = 10.0
|
| 402 |
p._derive_configuration()
|
| 403 |
assert p.n_directions <= 8 # capped
|
| 404 |
+
assert p.direction_method == "svd"
|
| 405 |
|
| 406 |
+
# Very low dimensionality → single diff-of-means
|
| 407 |
p._insights.cone_is_polyhedral = False
|
| 408 |
p._insights.cone_dimensionality = 0.1
|
| 409 |
p._derive_configuration()
|
| 410 |
+
assert p.n_directions == 1
|
| 411 |
+
assert p.direction_method == "diff_means"
|
tests/test_leace.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for LEACE (LEAst-squares Concept Erasure) direction extraction."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import pytest
|
| 6 |
+
import torch
|
| 7 |
+
|
| 8 |
+
from obliteratus.analysis.leace import LEACEExtractor, LEACEResult
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# ---------------------------------------------------------------------------
|
| 12 |
+
# Fixtures
|
| 13 |
+
# ---------------------------------------------------------------------------
|
| 14 |
+
|
| 15 |
+
@pytest.fixture
|
| 16 |
+
def extractor():
|
| 17 |
+
return LEACEExtractor(regularization_eps=1e-4)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@pytest.fixture
|
| 21 |
+
def separable_data():
|
| 22 |
+
"""Generate clearly separable harmful/harmless activations."""
|
| 23 |
+
torch.manual_seed(42)
|
| 24 |
+
d = 64
|
| 25 |
+
n = 20
|
| 26 |
+
|
| 27 |
+
# Harmful activations: cluster around [1, 0, 0, ...]
|
| 28 |
+
harmful_dir = torch.zeros(d)
|
| 29 |
+
harmful_dir[0] = 1.0
|
| 30 |
+
harmful = [harmful_dir + 0.1 * torch.randn(d) for _ in range(n)]
|
| 31 |
+
|
| 32 |
+
# Harmless activations: cluster around [-1, 0, 0, ...]
|
| 33 |
+
harmless = [-harmful_dir + 0.1 * torch.randn(d) for _ in range(n)]
|
| 34 |
+
|
| 35 |
+
return harmful, harmless
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
@pytest.fixture
|
| 39 |
+
def isotropic_data():
|
| 40 |
+
"""Data where classes differ only in mean, with isotropic variance."""
|
| 41 |
+
torch.manual_seed(123)
|
| 42 |
+
d = 32
|
| 43 |
+
n = 30
|
| 44 |
+
|
| 45 |
+
direction = torch.randn(d)
|
| 46 |
+
direction = direction / direction.norm()
|
| 47 |
+
|
| 48 |
+
harmful = [direction * 2.0 + torch.randn(d) for _ in range(n)]
|
| 49 |
+
harmless = [-direction * 2.0 + torch.randn(d) for _ in range(n)]
|
| 50 |
+
|
| 51 |
+
return harmful, harmless, direction
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# ---------------------------------------------------------------------------
|
| 55 |
+
# LEACEResult
|
| 56 |
+
# ---------------------------------------------------------------------------
|
| 57 |
+
|
| 58 |
+
class TestLEACEResult:
|
| 59 |
+
def test_result_fields(self, extractor, separable_data):
|
| 60 |
+
harmful, harmless = separable_data
|
| 61 |
+
result = extractor.extract(harmful, harmless, layer_idx=5)
|
| 62 |
+
|
| 63 |
+
assert isinstance(result, LEACEResult)
|
| 64 |
+
assert result.layer_idx == 5
|
| 65 |
+
assert result.direction.shape == (64,)
|
| 66 |
+
assert result.generalized_eigenvalue > 0
|
| 67 |
+
assert result.within_class_condition > 0
|
| 68 |
+
assert result.mean_diff_norm > 0
|
| 69 |
+
assert result.erasure_loss >= 0
|
| 70 |
+
|
| 71 |
+
def test_direction_is_unit_vector(self, extractor, separable_data):
|
| 72 |
+
harmful, harmless = separable_data
|
| 73 |
+
result = extractor.extract(harmful, harmless)
|
| 74 |
+
norm = result.direction.norm().item()
|
| 75 |
+
assert abs(norm - 1.0) < 1e-5
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
# ---------------------------------------------------------------------------
|
| 79 |
+
# Direction quality
|
| 80 |
+
# ---------------------------------------------------------------------------
|
| 81 |
+
|
| 82 |
+
class TestDirectionQuality:
|
| 83 |
+
def test_finds_true_direction(self, extractor, separable_data):
|
| 84 |
+
"""LEACE should find a direction aligned with the true separation axis."""
|
| 85 |
+
harmful, harmless = separable_data
|
| 86 |
+
result = extractor.extract(harmful, harmless)
|
| 87 |
+
|
| 88 |
+
# True direction is [1, 0, 0, ...]
|
| 89 |
+
true_dir = torch.zeros(64)
|
| 90 |
+
true_dir[0] = 1.0
|
| 91 |
+
|
| 92 |
+
cosine = (result.direction @ true_dir).abs().item()
|
| 93 |
+
# With 20 samples in 64 dims, some noise is expected
|
| 94 |
+
assert cosine > 0.5, f"LEACE direction not aligned with true direction: {cosine}"
|
| 95 |
+
|
| 96 |
+
def test_isotropic_matches_diff_of_means(self, extractor, isotropic_data):
|
| 97 |
+
"""With isotropic noise, LEACE should roughly match diff-of-means."""
|
| 98 |
+
harmful, harmless, true_dir = isotropic_data
|
| 99 |
+
result = extractor.extract(harmful, harmless)
|
| 100 |
+
|
| 101 |
+
# Diff of means
|
| 102 |
+
diff = torch.stack(harmful).mean(0) - torch.stack(harmless).mean(0)
|
| 103 |
+
diff_normalized = diff / diff.norm()
|
| 104 |
+
|
| 105 |
+
cosine = (result.direction @ diff_normalized).abs().item()
|
| 106 |
+
# With finite samples and regularization, some deviation is expected
|
| 107 |
+
assert cosine > 0.5
|
| 108 |
+
|
| 109 |
+
def test_leace_differs_from_diff_means_with_anisotropic_noise(self):
|
| 110 |
+
"""With anisotropic noise, LEACE should find a better direction than diff-of-means."""
|
| 111 |
+
torch.manual_seed(77)
|
| 112 |
+
d = 64
|
| 113 |
+
n = 50
|
| 114 |
+
|
| 115 |
+
# True refusal direction
|
| 116 |
+
true_dir = torch.zeros(d)
|
| 117 |
+
true_dir[0] = 1.0
|
| 118 |
+
|
| 119 |
+
# Add anisotropic noise: high variance in dim 1 (NOT the refusal direction)
|
| 120 |
+
noise_scale = torch.ones(d) * 0.1
|
| 121 |
+
noise_scale[1] = 5.0 # Rogue dimension
|
| 122 |
+
|
| 123 |
+
harmful = [true_dir * 0.5 + torch.randn(d) * noise_scale for _ in range(n)]
|
| 124 |
+
harmless = [-true_dir * 0.5 + torch.randn(d) * noise_scale for _ in range(n)]
|
| 125 |
+
|
| 126 |
+
extractor = LEACEExtractor()
|
| 127 |
+
result = extractor.extract(harmful, harmless)
|
| 128 |
+
|
| 129 |
+
cosine_to_true = (result.direction @ true_dir).abs().item()
|
| 130 |
+
# LEACE should still find the true direction, not be distracted by rogue dim
|
| 131 |
+
assert cosine_to_true > 0.5, f"LEACE distracted by rogue dimension: {cosine_to_true}"
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
# ---------------------------------------------------------------------------
|
| 135 |
+
# Comparison with diff-of-means
|
| 136 |
+
# ---------------------------------------------------------------------------
|
| 137 |
+
|
| 138 |
+
class TestCompareWithDiffOfMeans:
|
| 139 |
+
def test_comparison_output(self, extractor, separable_data):
|
| 140 |
+
harmful, harmless = separable_data
|
| 141 |
+
result = extractor.extract(harmful, harmless)
|
| 142 |
+
|
| 143 |
+
harmful_mean = torch.stack(harmful).mean(0)
|
| 144 |
+
harmless_mean = torch.stack(harmless).mean(0)
|
| 145 |
+
|
| 146 |
+
comparison = LEACEExtractor.compare_with_diff_of_means(
|
| 147 |
+
result, harmful_mean, harmless_mean,
|
| 148 |
+
)
|
| 149 |
+
|
| 150 |
+
assert "cosine_similarity" in comparison
|
| 151 |
+
assert "leace_eigenvalue" in comparison
|
| 152 |
+
assert "leace_erasure_loss" in comparison
|
| 153 |
+
assert "within_class_condition" in comparison
|
| 154 |
+
assert "mean_diff_norm" in comparison
|
| 155 |
+
assert 0 <= comparison["cosine_similarity"] <= 1.0
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
# ---------------------------------------------------------------------------
|
| 159 |
+
# Multi-layer extraction
|
| 160 |
+
# ---------------------------------------------------------------------------
|
| 161 |
+
|
| 162 |
+
class TestMultiLayer:
|
| 163 |
+
def test_extract_all_layers(self, extractor):
|
| 164 |
+
torch.manual_seed(42)
|
| 165 |
+
d = 32
|
| 166 |
+
n = 15
|
| 167 |
+
|
| 168 |
+
harmful_acts = {}
|
| 169 |
+
harmless_acts = {}
|
| 170 |
+
for layer in [0, 1, 2, 5]:
|
| 171 |
+
harmful_acts[layer] = [torch.randn(d) + 0.5 for _ in range(n)]
|
| 172 |
+
harmless_acts[layer] = [torch.randn(d) - 0.5 for _ in range(n)]
|
| 173 |
+
|
| 174 |
+
results = extractor.extract_all_layers(harmful_acts, harmless_acts)
|
| 175 |
+
|
| 176 |
+
assert set(results.keys()) == {0, 1, 2, 5}
|
| 177 |
+
for idx, result in results.items():
|
| 178 |
+
assert result.layer_idx == idx
|
| 179 |
+
assert result.direction.shape == (d,)
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
# ---------------------------------------------------------------------------
|
| 183 |
+
# Edge cases
|
| 184 |
+
# ---------------------------------------------------------------------------
|
| 185 |
+
|
| 186 |
+
class TestEdgeCases:
|
| 187 |
+
def test_single_sample(self, extractor):
|
| 188 |
+
"""Should handle single sample per class gracefully."""
|
| 189 |
+
d = 32
|
| 190 |
+
harmful = [torch.randn(d)]
|
| 191 |
+
harmless = [torch.randn(d)]
|
| 192 |
+
|
| 193 |
+
result = extractor.extract(harmful, harmless)
|
| 194 |
+
assert result.direction.shape == (d,)
|
| 195 |
+
assert torch.isfinite(result.direction).all()
|
| 196 |
+
|
| 197 |
+
def test_identical_activations(self, extractor):
|
| 198 |
+
"""Should handle case where harmful == harmless."""
|
| 199 |
+
d = 32
|
| 200 |
+
x = torch.randn(d)
|
| 201 |
+
harmful = [x.clone() for _ in range(5)]
|
| 202 |
+
harmless = [x.clone() for _ in range(5)]
|
| 203 |
+
|
| 204 |
+
result = extractor.extract(harmful, harmless)
|
| 205 |
+
assert result.direction.shape == (d,)
|
| 206 |
+
# Direction norm should be ~0 or direction is a fallback
|
| 207 |
+
assert torch.isfinite(result.direction).all()
|
| 208 |
+
|
| 209 |
+
def test_3d_input_squeezed(self, extractor):
|
| 210 |
+
"""Should handle (n, 1, d) shaped inputs."""
|
| 211 |
+
d = 32
|
| 212 |
+
harmful = [torch.randn(1, d) for _ in range(10)]
|
| 213 |
+
harmless = [torch.randn(1, d) for _ in range(10)]
|
| 214 |
+
|
| 215 |
+
result = extractor.extract(harmful, harmless)
|
| 216 |
+
assert result.direction.shape == (d,)
|
| 217 |
+
|
| 218 |
+
def test_shrinkage(self):
|
| 219 |
+
"""Shrinkage should produce valid results."""
|
| 220 |
+
torch.manual_seed(42)
|
| 221 |
+
d = 64
|
| 222 |
+
n = 10 # n < d → need shrinkage
|
| 223 |
+
|
| 224 |
+
harmful = [torch.randn(d) + 0.3 for _ in range(n)]
|
| 225 |
+
harmless = [torch.randn(d) - 0.3 for _ in range(n)]
|
| 226 |
+
|
| 227 |
+
extractor = LEACEExtractor(shrinkage=0.5)
|
| 228 |
+
result = extractor.extract(harmful, harmless)
|
| 229 |
+
assert result.direction.shape == (d,)
|
| 230 |
+
assert torch.isfinite(result.direction).all()
|