Spaces:
Running on Zero
Running on Zero
Upload 128 files
Browse files
- README.md +1 -1
- app.py +15 -8
- docs/RESEARCH_SURVEY.md +2 -2
- docs/theory_journal.md +1 -1
- notebooks/abliterate.ipynb +1 -1
- obliteratus/abliterate.py +2 -2
- obliteratus/interactive.py +1 -1
- obliteratus/local_ui.py +8 -1
- paper/references.bib +8 -5
- scripts/benchmark_sota_comparison.py +1 -1
README.md
CHANGED
|
@@ -526,7 +526,7 @@ Works with any HuggingFace transformer, including: GPT-2, LLaMA, Mistral, Falcon
|
|
| 526 |
## References
|
| 527 |
|
| 528 |
- Arditi et al. (2024). *Refusal in Language Models Is Mediated by a Single Direction.* [arXiv:2406.11717](https://arxiv.org/abs/2406.11717)
|
| 529 |
-
-
|
| 530 |
- grimjim (2025). *Norm-Preserving Biprojected Abliteration.* [HuggingFace](https://huggingface.co/grimjim)
|
| 531 |
- Turner et al. (2023). *Activation Addition: Steering Language Models Without Optimization.* [arXiv:2308.10248](https://arxiv.org/abs/2308.10248)
|
| 532 |
- Rimsky et al. (2024). *Steering Llama 2 via Contrastive Activation Addition.* [arXiv:2312.06681](https://arxiv.org/abs/2312.06681)
|
|
|
|
| 526 |
## References
|
| 527 |
|
| 528 |
- Arditi et al. (2024). *Refusal in Language Models Is Mediated by a Single Direction.* [arXiv:2406.11717](https://arxiv.org/abs/2406.11717)
|
| 529 |
+
- Gülmez, G. (2026). *Gabliteration: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models.* [arXiv:2512.18901](https://arxiv.org/abs/2512.18901)
|
| 530 |
- grimjim (2025). *Norm-Preserving Biprojected Abliteration.* [HuggingFace](https://huggingface.co/grimjim)
|
| 531 |
- Turner et al. (2023). *Activation Addition: Steering Language Models Without Optimization.* [arXiv:2308.10248](https://arxiv.org/abs/2308.10248)
|
| 532 |
- Rimsky et al. (2024). *Steering Llama 2 via Contrastive Activation Addition.* [arXiv:2312.06681](https://arxiv.org/abs/2312.06681)
|
app.py
CHANGED
|
@@ -117,7 +117,7 @@ _obliterate_counter: int = 0
|
|
| 117 |
|
| 118 |
# Flag to suppress session_model_dd.change when obliterate programmatically
|
| 119 |
# sets the dropdown value (prevents wasteful GPU re-allocation on ZeroGPU)
|
| 120 |
-
_skip_session_load:
|
| 121 |
|
| 122 |
# ---------------------------------------------------------------------------
|
| 123 |
# Model presets — 100+ models organized by provider
|
|
@@ -1870,7 +1870,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
|
|
| 1870 |
# Set skip flag so the .change handler doesn't trigger a wasteful
|
| 1871 |
# GPU re-allocation — the model is already loaded.
|
| 1872 |
global _skip_session_load
|
| 1873 |
-
_skip_session_load =
|
| 1874 |
_dd_update = gr.update(
|
| 1875 |
choices=_get_session_model_choices(),
|
| 1876 |
value=_last_obliterated_label or None,
|
|
@@ -1950,13 +1950,17 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
|
|
| 1950 |
# ZeroGPU safety: detect whether we need to reload from checkpoint.
|
| 1951 |
# Between GPU allocations, ZeroGPU may deallocate GPU memory, leaving
|
| 1952 |
# model as None (garbage-collected) or with stale/meta tensors.
|
|
|
|
|
|
|
| 1953 |
_needs_reload = model is None or tokenizer is None
|
| 1954 |
if not _needs_reload:
|
| 1955 |
try:
|
| 1956 |
dev = next(model.parameters()).device
|
| 1957 |
-
if
|
|
|
|
|
|
|
| 1958 |
model.to("cuda")
|
| 1959 |
-
except
|
| 1960 |
_needs_reload = True
|
| 1961 |
|
| 1962 |
# Reload from saved checkpoint if model is missing or stale
|
|
@@ -2114,8 +2118,8 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()):
|
|
| 2114 |
# Skip if the obliterate function just set the dropdown value — the model
|
| 2115 |
# is already loaded and we'd just waste GPU quota re-allocating.
|
| 2116 |
global _skip_session_load
|
| 2117 |
-
if _skip_session_load:
|
| 2118 |
-
_skip_session_load =
|
| 2119 |
if choice and _state.get("status") == "ready":
|
| 2120 |
yield (
|
| 2121 |
f"**Ready!** `{choice}` is loaded — just type in the chat below.",
|
|
@@ -2362,13 +2366,16 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
|
|
| 2362 |
|
| 2363 |
# ZeroGPU safety: detect whether we need to reload from checkpoint.
|
| 2364 |
# Model may be None (garbage-collected after GPU deallocation) or stale.
|
|
|
|
| 2365 |
_needs_reload = abliterated_model is None or tokenizer is None
|
| 2366 |
if not _needs_reload:
|
| 2367 |
try:
|
| 2368 |
dev = next(abliterated_model.parameters()).device
|
| 2369 |
-
if
|
|
|
|
|
|
|
| 2370 |
abliterated_model.to("cuda")
|
| 2371 |
-
except
|
| 2372 |
_needs_reload = True
|
| 2373 |
|
| 2374 |
if _needs_reload:
|
|
|
|
| 117 |
|
| 118 |
# Flag to suppress session_model_dd.change when obliterate programmatically
|
| 119 |
# sets the dropdown value (prevents wasteful GPU re-allocation on ZeroGPU)
|
| 120 |
+
_skip_session_load: int = 0 # counter (not bool) — obliterate sets to 2 for both dropdowns
|
| 121 |
|
| 122 |
# ---------------------------------------------------------------------------
|
| 123 |
# Model presets — 100+ models organized by provider
|
|
|
|
| 1870 |
# Set skip flag so the .change handler doesn't trigger a wasteful
|
| 1871 |
# GPU re-allocation — the model is already loaded.
|
| 1872 |
global _skip_session_load
|
| 1873 |
+
_skip_session_load = 2 # both session_model_dd and ab_session_model_dd fire .change
|
| 1874 |
_dd_update = gr.update(
|
| 1875 |
choices=_get_session_model_choices(),
|
| 1876 |
value=_last_obliterated_label or None,
|
|
|
|
| 1950 |
# ZeroGPU safety: detect whether we need to reload from checkpoint.
|
| 1951 |
# Between GPU allocations, ZeroGPU may deallocate GPU memory, leaving
|
| 1952 |
# model as None (garbage-collected) or with stale/meta tensors.
|
| 1953 |
+
# Meta tensors raise NotImplementedError on .to(), not RuntimeError,
|
| 1954 |
+
# so we catch Exception broadly here.
|
| 1955 |
_needs_reload = model is None or tokenizer is None
|
| 1956 |
if not _needs_reload:
|
| 1957 |
try:
|
| 1958 |
dev = next(model.parameters()).device
|
| 1959 |
+
if dev.type == "meta":
|
| 1960 |
+
_needs_reload = True
|
| 1961 |
+
elif torch.cuda.is_available() and dev.type != "cuda":
|
| 1962 |
model.to("cuda")
|
| 1963 |
+
except Exception:
|
| 1964 |
_needs_reload = True
|
| 1965 |
|
| 1966 |
# Reload from saved checkpoint if model is missing or stale
|
|
|
|
| 2118 |
# Skip if the obliterate function just set the dropdown value — the model
|
| 2119 |
# is already loaded and we'd just waste GPU quota re-allocating.
|
| 2120 |
global _skip_session_load
|
| 2121 |
+
if _skip_session_load > 0:
|
| 2122 |
+
_skip_session_load -= 1
|
| 2123 |
if choice and _state.get("status") == "ready":
|
| 2124 |
yield (
|
| 2125 |
f"**Ready!** `{choice}` is loaded — just type in the chat below.",
|
|
|
|
| 2366 |
|
| 2367 |
# ZeroGPU safety: detect whether we need to reload from checkpoint.
|
| 2368 |
# Model may be None (garbage-collected after GPU deallocation) or stale.
|
| 2369 |
+
# Meta tensors raise NotImplementedError on .to(), so catch broadly.
|
| 2370 |
_needs_reload = abliterated_model is None or tokenizer is None
|
| 2371 |
if not _needs_reload:
|
| 2372 |
try:
|
| 2373 |
dev = next(abliterated_model.parameters()).device
|
| 2374 |
+
if dev.type == "meta":
|
| 2375 |
+
_needs_reload = True
|
| 2376 |
+
elif torch.cuda.is_available() and dev.type != "cuda":
|
| 2377 |
abliterated_model.to("cuda")
|
| 2378 |
+
except Exception:
|
| 2379 |
_needs_reload = True
|
| 2380 |
|
| 2381 |
if _needs_reload:
|
docs/RESEARCH_SURVEY.md
CHANGED
|
@@ -116,7 +116,7 @@ The paper also analyzes how adversarial suffixes (e.g., GCG-generated) suppress
|
|
| 116 |
|
| 117 |
## 2. Gabliteration (arXiv:2512.18901) — Multi-Direction Subspace Approach {#2-gabliteration}
|
| 118 |
|
| 119 |
-
**Author:**
|
| 120 |
**arXiv:** [2512.18901](https://arxiv.org/abs/2512.18901)
|
| 121 |
**Version:** v3, revised January 28, 2026
|
| 122 |
**Models:** [Hugging Face collection](https://huggingface.co/collections/Goekdeniz-Guelmez/gabliteration)
|
|
@@ -733,7 +733,7 @@ MI research helps make AI safe but could be used adversarially. The same techniq
|
|
| 733 |
|
| 734 |
1. Arditi, A., Obeso, O., Syed, A., Paleka, D., Panickssery, N., Gurnee, W., & Nanda, N. (2024). Refusal in Language Models Is Mediated by a Single Direction. NeurIPS 2024. [arXiv:2406.11717](https://arxiv.org/abs/2406.11717)
|
| 735 |
|
| 736 |
-
2.
|
| 737 |
|
| 738 |
3. grimjim. (2025). Norm-Preserving Biprojected Abliteration / MPOA. [HuggingFace Blog](https://huggingface.co/blog/grimjim/norm-preserving-biprojected-abliteration) | [Projected Abliteration](https://huggingface.co/blog/grimjim/projected-abliteration) | [Code](https://github.com/jim-plus/llm-abliteration)
|
| 739 |
|
|
|
|
| 116 |
|
| 117 |
## 2. Gabliteration (arXiv:2512.18901) — Multi-Direction Subspace Approach {#2-gabliteration}
|
| 118 |
|
| 119 |
+
**Author:** Gökdeniz Gülmez (independent research)
|
| 120 |
**arXiv:** [2512.18901](https://arxiv.org/abs/2512.18901)
|
| 121 |
**Version:** v3, revised January 28, 2026
|
| 122 |
**Models:** [Hugging Face collection](https://huggingface.co/collections/Goekdeniz-Guelmez/gabliteration)
|
|
|
|
| 733 |
|
| 734 |
1. Arditi, A., Obeso, O., Syed, A., Paleka, D., Panickssery, N., Gurnee, W., & Nanda, N. (2024). Refusal in Language Models Is Mediated by a Single Direction. NeurIPS 2024. [arXiv:2406.11717](https://arxiv.org/abs/2406.11717)
|
| 735 |
|
| 736 |
+
2. Gülmez, G. (2026). Gabliteration: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models. [arXiv:2512.18901](https://arxiv.org/abs/2512.18901)
|
| 737 |
|
| 738 |
3. grimjim. (2025). Norm-Preserving Biprojected Abliteration / MPOA. [HuggingFace Blog](https://huggingface.co/blog/grimjim/norm-preserving-biprojected-abliteration) | [Projected Abliteration](https://huggingface.co/blog/grimjim/projected-abliteration) | [Code](https://github.com/jim-plus/llm-abliteration)
|
| 739 |
|
docs/theory_journal.md
CHANGED
|
@@ -1802,7 +1802,7 @@ implementations diverge from the closed-form GRRO solution.
|
|
| 1802 |
## References
|
| 1803 |
|
| 1804 |
1. Arditi, A. et al. (2024). Refusal in Language Models Is Mediated by a Single Direction. NeurIPS 2024.
|
| 1805 |
-
2.
|
| 1806 |
3. grimjim (2025). Norm-Preserving Biprojected Abliteration (MPOA). HuggingFace.
|
| 1807 |
4. Wollschlager, T. et al. (2025). The Geometry of Refusal. ICML 2025.
|
| 1808 |
5. Joad et al. (2026). There Is More to Refusal than a Single Direction. arXiv:2602.02132.
|
|
|
|
| 1802 |
## References
|
| 1803 |
|
| 1804 |
1. Arditi, A. et al. (2024). Refusal in Language Models Is Mediated by a Single Direction. NeurIPS 2024.
|
| 1805 |
+
2. Gülmez, G. (2026). Gabliteration: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models. arXiv:2512.18901.
|
| 1806 |
3. grimjim (2025). Norm-Preserving Biprojected Abliteration (MPOA). HuggingFace.
|
| 1807 |
4. Wollschlager, T. et al. (2025). The Geometry of Refusal. ICML 2025.
|
| 1808 |
5. Joad et al. (2026). There Is More to Refusal than a Single Direction. arXiv:2602.02132.
|
notebooks/abliterate.ipynb
CHANGED
|
@@ -53,7 +53,7 @@
|
|
| 53 |
"id": "install"
|
| 54 |
},
|
| 55 |
"outputs": [],
|
| 56 |
-
"source": "!pip install -q git+https://github.com/elder-plinius/OBLITERATUS.git\n!pip install -q accelerate bitsandbytes\n\nimport torch\nprint(f\"PyTorch {torch.__version__}\")\nprint(f\"CUDA available: {torch.cuda.is_available()}\")\nif torch.cuda.is_available():\n print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n print(f\"VRAM: {torch.cuda.get_device_properties(0).
|
| 57 |
},
|
| 58 |
{
|
| 59 |
"cell_type": "markdown",
|
|
|
|
| 53 |
"id": "install"
|
| 54 |
},
|
| 55 |
"outputs": [],
|
| 56 |
+
"source": "!pip install -q git+https://github.com/elder-plinius/OBLITERATUS.git\n!pip install -q accelerate bitsandbytes\n\nimport torch\nprint(f\"PyTorch {torch.__version__}\")\nprint(f\"CUDA available: {torch.cuda.is_available()}\")\nif torch.cuda.is_available():\n print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB\")"
|
| 57 |
},
|
| 58 |
{
|
| 59 |
"cell_type": "markdown",
|
obliteratus/abliterate.py
CHANGED
|
@@ -334,7 +334,7 @@ METHODS = {
|
|
| 334 |
"layer_selection": "middle60",
|
| 335 |
},
|
| 336 |
"gabliteration": {
|
| 337 |
-
"label": "Gabliteration (
|
| 338 |
"description": (
|
| 339 |
"Faithful reproduction of Gabliteration (arXiv:2512.18901). "
|
| 340 |
"SVD-based multi-direction extraction (top-4), ridge-regularized "
|
|
@@ -2494,7 +2494,7 @@ class AbliterationPipeline:
|
|
| 2494 |
|
| 2495 |
References:
|
| 2496 |
- SteerMoE (Fayyaz et al., 2025): expert activation frequency analysis
|
| 2497 |
-
- Gabliteration (
|
| 2498 |
- SAFEx (Lai et al., NeurIPS 2025): safety expert identification
|
| 2499 |
"""
|
| 2500 |
if not self._routing_harmful or not self._routing_harmless:
|
|
|
|
| 334 |
"layer_selection": "middle60",
|
| 335 |
},
|
| 336 |
"gabliteration": {
|
| 337 |
+
"label": "Gabliteration (Gülmez 2026 Baseline)",
|
| 338 |
"description": (
|
| 339 |
"Faithful reproduction of Gabliteration (arXiv:2512.18901). "
|
| 340 |
"SVD-based multi-direction extraction (top-4), ridge-regularized "
|
|
|
|
| 2494 |
|
| 2495 |
References:
|
| 2496 |
- SteerMoE (Fayyaz et al., 2025): expert activation frequency analysis
|
| 2497 |
+
- Gabliteration (Gülmez, 2026): multi-direction SVD abliteration
|
| 2498 |
- SAFEx (Lai et al., NeurIPS 2025): safety expert identification
|
| 2499 |
"""
|
| 2500 |
if not self._routing_harmful or not self._routing_harmless:
|
obliteratus/interactive.py
CHANGED
|
@@ -24,7 +24,7 @@ def _detect_compute_tier() -> str:
|
|
| 24 |
import torch
|
| 25 |
|
| 26 |
if torch.cuda.is_available():
|
| 27 |
-
vram_gb = torch.cuda.get_device_properties(0).
|
| 28 |
if vram_gb >= 20:
|
| 29 |
return "large"
|
| 30 |
elif vram_gb >= 8:
|
|
|
|
| 24 |
import torch
|
| 25 |
|
| 26 |
if torch.cuda.is_available():
|
| 27 |
+
vram_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)
|
| 28 |
if vram_gb >= 20:
|
| 29 |
return "large"
|
| 30 |
elif vram_gb >= 8:
|
obliteratus/local_ui.py
CHANGED
|
@@ -10,6 +10,7 @@ Usage:
|
|
| 10 |
from __future__ import annotations
|
| 11 |
|
| 12 |
import os
|
|
|
|
| 13 |
import platform
|
| 14 |
import shutil
|
| 15 |
import sys
|
|
@@ -48,7 +49,7 @@ def _detect_gpu() -> list[dict]:
|
|
| 48 |
{
|
| 49 |
"index": i,
|
| 50 |
"name": props.name,
|
| 51 |
-
"vram_gb": round(props.
|
| 52 |
"compute": f"{props.major}.{props.minor}",
|
| 53 |
}
|
| 54 |
)
|
|
@@ -292,6 +293,12 @@ def launch_local_ui(
|
|
| 292 |
console.print("[dim]Loading OBLITERATUS UI (this may take a moment on first run)...[/dim]")
|
| 293 |
start = time.time()
|
| 294 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
from app import launch as app_launch
|
| 296 |
|
| 297 |
elapsed = time.time() - start
|
|
|
|
| 10 |
from __future__ import annotations
|
| 11 |
|
| 12 |
import os
|
| 13 |
+
import pathlib
|
| 14 |
import platform
|
| 15 |
import shutil
|
| 16 |
import sys
|
|
|
|
| 49 |
{
|
| 50 |
"index": i,
|
| 51 |
"name": props.name,
|
| 52 |
+
"vram_gb": round(props.total_memory / 1024**3, 1),
|
| 53 |
"compute": f"{props.major}.{props.minor}",
|
| 54 |
}
|
| 55 |
)
|
|
|
|
| 293 |
console.print("[dim]Loading OBLITERATUS UI (this may take a moment on first run)...[/dim]")
|
| 294 |
start = time.time()
|
| 295 |
|
| 296 |
+
# app.py lives at the project root, one level above this package.
|
| 297 |
+
# When installed via pip the root isn't on sys.path, so add it.
|
| 298 |
+
_project_root = str(pathlib.Path(__file__).resolve().parent.parent)
|
| 299 |
+
if _project_root not in sys.path:
|
| 300 |
+
sys.path.insert(0, _project_root)
|
| 301 |
+
|
| 302 |
from app import launch as app_launch
|
| 303 |
|
| 304 |
elapsed = time.time() - start
|
paper/references.bib
CHANGED
|
@@ -7,11 +7,14 @@
|
|
| 7 |
year={2024}
|
| 8 |
}
|
| 9 |
|
| 10 |
-
@
|
| 11 |
-
title={{Gabliteration}:
|
| 12 |
-
author={
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
| 15 |
}
|
| 16 |
|
| 17 |
@misc{grimjim2025,
|
|
|
|
| 7 |
year={2024}
|
| 8 |
}
|
| 9 |
|
| 10 |
+
@misc{gabliteration2026,
|
| 11 |
+
title={{Gabliteration}: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models},
|
| 12 |
+
author={G\"{o}kdeniz G\"{u}lmez},
|
| 13 |
+
year={2026},
|
| 14 |
+
eprint={2512.18901},
|
| 15 |
+
archivePrefix={arXiv},
|
| 16 |
+
primaryClass={cs.AI},
|
| 17 |
+
url={https://arxiv.org/abs/2512.18901}
|
| 18 |
}
|
| 19 |
|
| 20 |
@misc{grimjim2025,
|
scripts/benchmark_sota_comparison.py
CHANGED
|
@@ -7,7 +7,7 @@ comparison tables with standardized community metrics.
|
|
| 7 |
|
| 8 |
Baselines included:
|
| 9 |
1. FailSpy/abliterator (2024) — Community workhorse baseline
|
| 10 |
-
2. Gabliteration (
|
| 11 |
3. Heretic / p-e-w (2025) — Bayesian TPE auto-tuning (current SOTA for quality)
|
| 12 |
4. Wollschlager RDO (ICML 2025) — Gradient-based direction optimization
|
| 13 |
|
|
|
|
| 7 |
|
| 8 |
Baselines included:
|
| 9 |
1. FailSpy/abliterator (2024) — Community workhorse baseline
|
| 10 |
+
2. Gabliteration (Gülmez 2026) — SVD multi-direction + ridge regularization
|
| 11 |
3. Heretic / p-e-w (2025) — Bayesian TPE auto-tuning (current SOTA for quality)
|
| 12 |
4. Wollschlager RDO (ICML 2025) — Gradient-based direction optimization
|
| 13 |
|