pliny-the-prompter committed on
Commit
54c44c0
·
verified ·
1 Parent(s): a46d378

Upload 128 files

Browse files
README.md CHANGED
@@ -526,7 +526,7 @@ Works with any HuggingFace transformer, including: GPT-2, LLaMA, Mistral, Falcon
526
  ## References
527
 
528
  - Arditi et al. (2024). *Refusal in Language Models Is Mediated by a Single Direction.* [arXiv:2406.11717](https://arxiv.org/abs/2406.11717)
529
- - Gulmez, G. (2025). *Gabliteration: SVD-Based Multi-Direction Refusal Removal.* [arXiv:2512.18901](https://arxiv.org/abs/2512.18901)
530
  - grimjim (2025). *Norm-Preserving Biprojected Abliteration.* [HuggingFace](https://huggingface.co/grimjim)
531
  - Turner et al. (2023). *Activation Addition: Steering Language Models Without Optimization.* [arXiv:2308.10248](https://arxiv.org/abs/2308.10248)
532
  - Rimsky et al. (2024). *Steering Llama 2 via Contrastive Activation Addition.* [arXiv:2312.06681](https://arxiv.org/abs/2312.06681)
 
526
  ## References
527
 
528
  - Arditi et al. (2024). *Refusal in Language Models Is Mediated by a Single Direction.* [arXiv:2406.11717](https://arxiv.org/abs/2406.11717)
529
+ - Gülmez, G. (2026). *Gabliteration: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models.* [arXiv:2512.18901](https://arxiv.org/abs/2512.18901)
530
  - grimjim (2025). *Norm-Preserving Biprojected Abliteration.* [HuggingFace](https://huggingface.co/grimjim)
531
  - Turner et al. (2023). *Activation Addition: Steering Language Models Without Optimization.* [arXiv:2308.10248](https://arxiv.org/abs/2308.10248)
532
  - Rimsky et al. (2024). *Steering Llama 2 via Contrastive Activation Addition.* [arXiv:2312.06681](https://arxiv.org/abs/2312.06681)
app.py CHANGED
@@ -117,7 +117,7 @@ _obliterate_counter: int = 0
117
 
118
  # Flag to suppress session_model_dd.change when obliterate programmatically
119
  # sets the dropdown value (prevents wasteful GPU re-allocation on ZeroGPU)
120
- _skip_session_load: bool = False
121
 
122
  # ---------------------------------------------------------------------------
123
  # Model presets — 100+ models organized by provider
@@ -1870,7 +1870,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
1870
  # Set skip flag so the .change handler doesn't trigger a wasteful
1871
  # GPU re-allocation — the model is already loaded.
1872
  global _skip_session_load
1873
- _skip_session_load = True
1874
  _dd_update = gr.update(
1875
  choices=_get_session_model_choices(),
1876
  value=_last_obliterated_label or None,
@@ -1950,13 +1950,17 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
1950
  # ZeroGPU safety: detect whether we need to reload from checkpoint.
1951
  # Between GPU allocations, ZeroGPU may deallocate GPU memory, leaving
1952
  # model as None (garbage-collected) or with stale/meta tensors.
 
 
1953
  _needs_reload = model is None or tokenizer is None
1954
  if not _needs_reload:
1955
  try:
1956
  dev = next(model.parameters()).device
1957
- if torch.cuda.is_available() and dev.type != "cuda":
 
 
1958
  model.to("cuda")
1959
- except (StopIteration, RuntimeError):
1960
  _needs_reload = True
1961
 
1962
  # Reload from saved checkpoint if model is missing or stale
@@ -2114,8 +2118,8 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()):
2114
  # Skip if the obliterate function just set the dropdown value — the model
2115
  # is already loaded and we'd just waste GPU quota re-allocating.
2116
  global _skip_session_load
2117
- if _skip_session_load:
2118
- _skip_session_load = False
2119
  if choice and _state.get("status") == "ready":
2120
  yield (
2121
  f"**Ready!** `{choice}` is loaded — just type in the chat below.",
@@ -2362,13 +2366,16 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
2362
 
2363
  # ZeroGPU safety: detect whether we need to reload from checkpoint.
2364
  # Model may be None (garbage-collected after GPU deallocation) or stale.
 
2365
  _needs_reload = abliterated_model is None or tokenizer is None
2366
  if not _needs_reload:
2367
  try:
2368
  dev = next(abliterated_model.parameters()).device
2369
- if torch.cuda.is_available() and dev.type != "cuda":
 
 
2370
  abliterated_model.to("cuda")
2371
- except (StopIteration, RuntimeError):
2372
  _needs_reload = True
2373
 
2374
  if _needs_reload:
 
117
 
118
  # Flag to suppress session_model_dd.change when obliterate programmatically
119
  # sets the dropdown value (prevents wasteful GPU re-allocation on ZeroGPU)
120
+ _skip_session_load: int = 0 # counter (not bool) — obliterate sets to 2 for both dropdowns
121
 
122
  # ---------------------------------------------------------------------------
123
  # Model presets — 100+ models organized by provider
 
1870
  # Set skip flag so the .change handler doesn't trigger a wasteful
1871
  # GPU re-allocation — the model is already loaded.
1872
  global _skip_session_load
1873
+ _skip_session_load = 2 # both session_model_dd and ab_session_model_dd fire .change
1874
  _dd_update = gr.update(
1875
  choices=_get_session_model_choices(),
1876
  value=_last_obliterated_label or None,
 
1950
  # ZeroGPU safety: detect whether we need to reload from checkpoint.
1951
  # Between GPU allocations, ZeroGPU may deallocate GPU memory, leaving
1952
  # model as None (garbage-collected) or with stale/meta tensors.
1953
+ # Meta tensors raise NotImplementedError on .to(), not RuntimeError,
1954
+ # so we catch Exception broadly here.
1955
  _needs_reload = model is None or tokenizer is None
1956
  if not _needs_reload:
1957
  try:
1958
  dev = next(model.parameters()).device
1959
+ if dev.type == "meta":
1960
+ _needs_reload = True
1961
+ elif torch.cuda.is_available() and dev.type != "cuda":
1962
  model.to("cuda")
1963
+ except Exception:
1964
  _needs_reload = True
1965
 
1966
  # Reload from saved checkpoint if model is missing or stale
 
2118
  # Skip if the obliterate function just set the dropdown value — the model
2119
  # is already loaded and we'd just waste GPU quota re-allocating.
2120
  global _skip_session_load
2121
+ if _skip_session_load > 0:
2122
+ _skip_session_load -= 1
2123
  if choice and _state.get("status") == "ready":
2124
  yield (
2125
  f"**Ready!** `{choice}` is loaded — just type in the chat below.",
 
2366
 
2367
  # ZeroGPU safety: detect whether we need to reload from checkpoint.
2368
  # Model may be None (garbage-collected after GPU deallocation) or stale.
2369
+ # Meta tensors raise NotImplementedError on .to(), so catch broadly.
2370
  _needs_reload = abliterated_model is None or tokenizer is None
2371
  if not _needs_reload:
2372
  try:
2373
  dev = next(abliterated_model.parameters()).device
2374
+ if dev.type == "meta":
2375
+ _needs_reload = True
2376
+ elif torch.cuda.is_available() and dev.type != "cuda":
2377
  abliterated_model.to("cuda")
2378
+ except Exception:
2379
  _needs_reload = True
2380
 
2381
  if _needs_reload:
docs/RESEARCH_SURVEY.md CHANGED
@@ -116,7 +116,7 @@ The paper also analyzes how adversarial suffixes (e.g., GCG-generated) suppress
116
 
117
  ## 2. Gabliteration (arXiv:2512.18901) — Multi-Direction Subspace Approach {#2-gabliteration}
118
 
119
- **Author:** Gokdeniz Gulmez (independent research)
120
  **arXiv:** [2512.18901](https://arxiv.org/abs/2512.18901)
121
  **Version:** v3, revised January 28, 2026
122
  **Models:** [Hugging Face collection](https://huggingface.co/collections/Goekdeniz-Guelmez/gabliteration)
@@ -733,7 +733,7 @@ MI research helps make AI safe but could be used adversarially. The same techniq
733
 
734
  1. Arditi, A., Obeso, O., Syed, A., Paleka, D., Panickssery, N., Gurnee, W., & Nanda, N. (2024). Refusal in Language Models Is Mediated by a Single Direction. NeurIPS 2024. [arXiv:2406.11717](https://arxiv.org/abs/2406.11717)
735
 
736
- 2. Gulmez, G. (2025). Gabliteration: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models. [arXiv:2512.18901](https://arxiv.org/abs/2512.18901)
737
 
738
  3. grimjim. (2025). Norm-Preserving Biprojected Abliteration / MPOA. [HuggingFace Blog](https://huggingface.co/blog/grimjim/norm-preserving-biprojected-abliteration) | [Projected Abliteration](https://huggingface.co/blog/grimjim/projected-abliteration) | [Code](https://github.com/jim-plus/llm-abliteration)
739
 
 
116
 
117
  ## 2. Gabliteration (arXiv:2512.18901) — Multi-Direction Subspace Approach {#2-gabliteration}
118
 
119
+ **Author:** Gökdeniz Gülmez (independent research)
120
  **arXiv:** [2512.18901](https://arxiv.org/abs/2512.18901)
121
  **Version:** v3, revised January 28, 2026
122
  **Models:** [Hugging Face collection](https://huggingface.co/collections/Goekdeniz-Guelmez/gabliteration)
 
733
 
734
  1. Arditi, A., Obeso, O., Syed, A., Paleka, D., Panickssery, N., Gurnee, W., & Nanda, N. (2024). Refusal in Language Models Is Mediated by a Single Direction. NeurIPS 2024. [arXiv:2406.11717](https://arxiv.org/abs/2406.11717)
735
 
736
+ 2. Gülmez, G. (2026). Gabliteration: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models. [arXiv:2512.18901](https://arxiv.org/abs/2512.18901)
737
 
738
  3. grimjim. (2025). Norm-Preserving Biprojected Abliteration / MPOA. [HuggingFace Blog](https://huggingface.co/blog/grimjim/norm-preserving-biprojected-abliteration) | [Projected Abliteration](https://huggingface.co/blog/grimjim/projected-abliteration) | [Code](https://github.com/jim-plus/llm-abliteration)
739
 
docs/theory_journal.md CHANGED
@@ -1802,7 +1802,7 @@ implementations diverge from the closed-form GRRO solution.
1802
  ## References
1803
 
1804
  1. Arditi, A. et al. (2024). Refusal in Language Models Is Mediated by a Single Direction. NeurIPS 2024.
1805
- 2. Gulmez, G. (2025). Gabliteration. arXiv:2512.18901.
1806
  3. grimjim (2025). Norm-Preserving Biprojected Abliteration (MPOA). HuggingFace.
1807
  4. Wollschlager, T. et al. (2025). The Geometry of Refusal. ICML 2025.
1808
  5. Joad et al. (2026). There Is More to Refusal than a Single Direction. arXiv:2602.02132.
 
1802
  ## References
1803
 
1804
  1. Arditi, A. et al. (2024). Refusal in Language Models Is Mediated by a Single Direction. NeurIPS 2024.
1805
+ 2. Gülmez, G. (2026). Gabliteration: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models. arXiv:2512.18901.
1806
  3. grimjim (2025). Norm-Preserving Biprojected Abliteration (MPOA). HuggingFace.
1807
  4. Wollschlager, T. et al. (2025). The Geometry of Refusal. ICML 2025.
1808
  5. Joad et al. (2026). There Is More to Refusal than a Single Direction. arXiv:2602.02132.
notebooks/abliterate.ipynb CHANGED
@@ -53,7 +53,7 @@
53
  "id": "install"
54
  },
55
  "outputs": [],
56
- "source": "!pip install -q git+https://github.com/elder-plinius/OBLITERATUS.git\n!pip install -q accelerate bitsandbytes\n\nimport torch\nprint(f\"PyTorch {torch.__version__}\")\nprint(f\"CUDA available: {torch.cuda.is_available()}\")\nif torch.cuda.is_available():\n print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n print(f\"VRAM: {torch.cuda.get_device_properties(0).total_mem / 1024**3:.1f} GB\")"
57
  },
58
  {
59
  "cell_type": "markdown",
 
53
  "id": "install"
54
  },
55
  "outputs": [],
56
+ "source": "!pip install -q git+https://github.com/elder-plinius/OBLITERATUS.git\n!pip install -q accelerate bitsandbytes\n\nimport torch\nprint(f\"PyTorch {torch.__version__}\")\nprint(f\"CUDA available: {torch.cuda.is_available()}\")\nif torch.cuda.is_available():\n print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB\")"
57
  },
58
  {
59
  "cell_type": "markdown",
obliteratus/abliterate.py CHANGED
@@ -334,7 +334,7 @@ METHODS = {
334
  "layer_selection": "middle60",
335
  },
336
  "gabliteration": {
337
- "label": "Gabliteration (Gulmez 2025 Baseline)",
338
  "description": (
339
  "Faithful reproduction of Gabliteration (arXiv:2512.18901). "
340
  "SVD-based multi-direction extraction (top-4), ridge-regularized "
@@ -2494,7 +2494,7 @@ class AbliterationPipeline:
2494
 
2495
  References:
2496
  - SteerMoE (Fayyaz et al., 2025): expert activation frequency analysis
2497
- - Gabliteration (Gulmez, 2025): multi-direction SVD abliteration
2498
  - SAFEx (Lai et al., NeurIPS 2025): safety expert identification
2499
  """
2500
  if not self._routing_harmful or not self._routing_harmless:
 
334
  "layer_selection": "middle60",
335
  },
336
  "gabliteration": {
337
+ "label": "Gabliteration (Gülmez 2026 Baseline)",
338
  "description": (
339
  "Faithful reproduction of Gabliteration (arXiv:2512.18901). "
340
  "SVD-based multi-direction extraction (top-4), ridge-regularized "
 
2494
 
2495
  References:
2496
  - SteerMoE (Fayyaz et al., 2025): expert activation frequency analysis
2497
+ - Gabliteration (Gülmez, 2026): multi-direction SVD abliteration
2498
  - SAFEx (Lai et al., NeurIPS 2025): safety expert identification
2499
  """
2500
  if not self._routing_harmful or not self._routing_harmless:
obliteratus/interactive.py CHANGED
@@ -24,7 +24,7 @@ def _detect_compute_tier() -> str:
24
  import torch
25
 
26
  if torch.cuda.is_available():
27
- vram_gb = torch.cuda.get_device_properties(0).total_mem / (1024**3)
28
  if vram_gb >= 20:
29
  return "large"
30
  elif vram_gb >= 8:
 
24
  import torch
25
 
26
  if torch.cuda.is_available():
27
+ vram_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)
28
  if vram_gb >= 20:
29
  return "large"
30
  elif vram_gb >= 8:
obliteratus/local_ui.py CHANGED
@@ -10,6 +10,7 @@ Usage:
10
  from __future__ import annotations
11
 
12
  import os
 
13
  import platform
14
  import shutil
15
  import sys
@@ -48,7 +49,7 @@ def _detect_gpu() -> list[dict]:
48
  {
49
  "index": i,
50
  "name": props.name,
51
- "vram_gb": round(props.total_mem / 1024**3, 1),
52
  "compute": f"{props.major}.{props.minor}",
53
  }
54
  )
@@ -292,6 +293,12 @@ def launch_local_ui(
292
  console.print("[dim]Loading OBLITERATUS UI (this may take a moment on first run)...[/dim]")
293
  start = time.time()
294
 
 
 
 
 
 
 
295
  from app import launch as app_launch
296
 
297
  elapsed = time.time() - start
 
10
  from __future__ import annotations
11
 
12
  import os
13
+ import pathlib
14
  import platform
15
  import shutil
16
  import sys
 
49
  {
50
  "index": i,
51
  "name": props.name,
52
+ "vram_gb": round(props.total_memory / 1024**3, 1),
53
  "compute": f"{props.major}.{props.minor}",
54
  }
55
  )
 
293
  console.print("[dim]Loading OBLITERATUS UI (this may take a moment on first run)...[/dim]")
294
  start = time.time()
295
 
296
+ # app.py lives at the project root, one level above this package.
297
+ # When installed via pip the root isn't on sys.path, so add it.
298
+ _project_root = str(pathlib.Path(__file__).resolve().parent.parent)
299
+ if _project_root not in sys.path:
300
+ sys.path.insert(0, _project_root)
301
+
302
  from app import launch as app_launch
303
 
304
  elapsed = time.time() - start
paper/references.bib CHANGED
@@ -7,11 +7,14 @@
7
  year={2024}
8
  }
9
 
10
- @article{gabliteration2024,
11
- title={{Gabliteration}: {SVD}-Based Multi-Direction Refusal Removal},
12
- author={Gabriel, Saul and {contributors}},
13
- journal={arXiv preprint arXiv:2512.18901},
14
- year={2024}
 
 
 
15
  }
16
 
17
  @misc{grimjim2025,
 
7
  year={2024}
8
  }
9
 
10
+ @misc{gabliteration2024,
11
+ title={{Gabliteration}: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models},
12
+ author={G\"{o}kdeniz G\"{u}lmez},
13
+ year={2026},
14
+ eprint={2512.18901},
15
+ archivePrefix={arXiv},
16
+ primaryClass={cs.AI},
17
+ url={https://arxiv.org/abs/2512.18901}
18
  }
19
 
20
  @misc{grimjim2025,
scripts/benchmark_sota_comparison.py CHANGED
@@ -7,7 +7,7 @@ comparison tables with standardized community metrics.
7
 
8
  Baselines included:
9
  1. FailSpy/abliterator (2024) — Community workhorse baseline
10
- 2. Gabliteration (Gulmez 2025) — SVD multi-direction + ridge regularization
11
  3. Heretic / p-e-w (2025) — Bayesian TPE auto-tuning (current SOTA for quality)
12
  4. Wollschlager RDO (ICML 2025) — Gradient-based direction optimization
13
 
 
7
 
8
  Baselines included:
9
  1. FailSpy/abliterator (2024) — Community workhorse baseline
10
+ 2. Gabliteration (Gülmez 2026) — SVD multi-direction + ridge regularization
11
  3. Heretic / p-e-w (2025) — Bayesian TPE auto-tuning (current SOTA for quality)
12
  4. Wollschlager RDO (ICML 2025) — Gradient-based direction optimization
13