pliny-the-prompter committed on
Commit
54c44c0
·
verified ·
1 Parent(s): a46d378

Upload 128 files

Browse files
README.md CHANGED
@@ -526,7 +526,7 @@ Works with any HuggingFace transformer, including: GPT-2, LLaMA, Mistral, Falcon
526
  ## References
527
 
528
  - Arditi et al. (2024). *Refusal in Language Models Is Mediated by a Single Direction.* [arXiv:2406.11717](https://arxiv.org/abs/2406.11717)
529
- - Gulmez, G. (2025). *Gabliteration: SVD-Based Multi-Direction Refusal Removal.* [arXiv:2512.18901](https://arxiv.org/abs/2512.18901)
530
  - grimjim (2025). *Norm-Preserving Biprojected Abliteration.* [HuggingFace](https://huggingface.co/grimjim)
531
  - Turner et al. (2023). *Activation Addition: Steering Language Models Without Optimization.* [arXiv:2308.10248](https://arxiv.org/abs/2308.10248)
532
  - Rimsky et al. (2024). *Steering Llama 2 via Contrastive Activation Addition.* [arXiv:2312.06681](https://arxiv.org/abs/2312.06681)
 
526
  ## References
527
 
528
  - Arditi et al. (2024). *Refusal in Language Models Is Mediated by a Single Direction.* [arXiv:2406.11717](https://arxiv.org/abs/2406.11717)
529
+ - Gülmez, G. (2026). *Gabliteration: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models.* [arXiv:2512.18901](https://arxiv.org/abs/2512.18901)
530
  - grimjim (2025). *Norm-Preserving Biprojected Abliteration.* [HuggingFace](https://huggingface.co/grimjim)
531
  - Turner et al. (2023). *Activation Addition: Steering Language Models Without Optimization.* [arXiv:2308.10248](https://arxiv.org/abs/2308.10248)
532
  - Rimsky et al. (2024). *Steering Llama 2 via Contrastive Activation Addition.* [arXiv:2312.06681](https://arxiv.org/abs/2312.06681)
app.py CHANGED
@@ -117,7 +117,7 @@ _obliterate_counter: int = 0
117
 
118
  # Flag to suppress session_model_dd.change when obliterate programmatically
119
  # sets the dropdown value (prevents wasteful GPU re-allocation on ZeroGPU)
120
- _skip_session_load: bool = False
121
 
122
  # ---------------------------------------------------------------------------
123
  # Model presets — 100+ models organized by provider
@@ -1870,7 +1870,7 @@ def obliterate(model_choice: str, method_choice: str, hub_repo: str,
1870
  # Set skip flag so the .change handler doesn't trigger a wasteful
1871
  # GPU re-allocation — the model is already loaded.
1872
  global _skip_session_load
1873
- _skip_session_load = True
1874
  _dd_update = gr.update(
1875
  choices=_get_session_model_choices(),
1876
  value=_last_obliterated_label or None,
@@ -1950,13 +1950,17 @@ def chat_respond(message: str, history: list[dict], system_prompt: str,
1950
  # ZeroGPU safety: detect whether we need to reload from checkpoint.
1951
  # Between GPU allocations, ZeroGPU may deallocate GPU memory, leaving
1952
  # model as None (garbage-collected) or with stale/meta tensors.
 
 
1953
  _needs_reload = model is None or tokenizer is None
1954
  if not _needs_reload:
1955
  try:
1956
  dev = next(model.parameters()).device
1957
- if torch.cuda.is_available() and dev.type != "cuda":
 
 
1958
  model.to("cuda")
1959
- except (StopIteration, RuntimeError):
1960
  _needs_reload = True
1961
 
1962
  # Reload from saved checkpoint if model is missing or stale
@@ -2114,8 +2118,8 @@ def load_bench_into_chat(choice: str, progress=gr.Progress()):
2114
  # Skip if the obliterate function just set the dropdown value — the model
2115
  # is already loaded and we'd just waste GPU quota re-allocating.
2116
  global _skip_session_load
2117
- if _skip_session_load:
2118
- _skip_session_load = False
2119
  if choice and _state.get("status") == "ready":
2120
  yield (
2121
  f"**Ready!** `{choice}` is loaded — just type in the chat below.",
@@ -2362,13 +2366,16 @@ def ab_chat_respond(message: str, history_left: list[dict], history_right: list[
2362
 
2363
  # ZeroGPU safety: detect whether we need to reload from checkpoint.
2364
  # Model may be None (garbage-collected after GPU deallocation) or stale.
 
2365
  _needs_reload = abliterated_model is None or tokenizer is None
2366
  if not _needs_reload:
2367
  try:
2368
  dev = next(abliterated_model.parameters()).device
2369
- if torch.cuda.is_available() and dev.type != "cuda":
 
 
2370
  abliterated_model.to("cuda")
2371
- except (StopIteration, RuntimeError):
2372
  _needs_reload = True
2373
 
2374
  if _needs_reload:
 
117
 
118
  # Flag to suppress session_model_dd.change when obliterate programmatically
119
  # sets the dropdown value (prevents wasteful GPU re-allocation on ZeroGPU)
120
+ _skip_session_load: int = 0 # counter (not bool) — obliterate sets to 2 for both dropdowns
121
 
122
  # ---------------------------------------------------------------------------
123
  # Model presets — 100+ models organized by provider
 
1870
  # Set skip flag so the .change handler doesn't trigger a wasteful
1871
  # GPU re-allocation — the model is already loaded.
1872
  global _skip_session_load
1873
+ _skip_session_load = 2 # both session_model_dd and ab_session_model_dd fire .change
1874
  _dd_update = gr.update(
1875
  choices=_get_session_model_choices(),
1876
  value=_last_obliterated_label or None,
 
1950
  # ZeroGPU safety: detect whether we need to reload from checkpoint.
1951
  # Between GPU allocations, ZeroGPU may deallocate GPU memory, leaving
1952
  # model as None (garbage-collected) or with stale/meta tensors.
1953
+ # Meta tensors raise NotImplementedError on .to(), not RuntimeError,
1954
+ # so we catch Exception broadly here.
1955
  _needs_reload = model is None or tokenizer is None
1956
  if not _needs_reload:
1957
  try:
1958
  dev = next(model.parameters()).device
1959
+ if dev.type == "meta":
1960
+ _needs_reload = True
1961
+ elif torch.cuda.is_available() and dev.type != "cuda":
1962
  model.to("cuda")
1963
+ except Exception:
1964
  _needs_reload = True
1965
 
1966
  # Reload from saved checkpoint if model is missing or stale
 
2118
  # Skip if the obliterate function just set the dropdown value — the model
2119
  # is already loaded and we'd just waste GPU quota re-allocating.
2120
  global _skip_session_load
2121
+ if _skip_session_load > 0:
2122
+ _skip_session_load -= 1
2123
  if choice and _state.get("status") == "ready":
2124
  yield (
2125
  f"**Ready!** `{choice}` is loaded — just type in the chat below.",
 
2366
 
2367
  # ZeroGPU safety: detect whether we need to reload from checkpoint.
2368
  # Model may be None (garbage-collected after GPU deallocation) or stale.
2369
+ # Meta tensors raise NotImplementedError on .to(), so catch broadly.
2370
  _needs_reload = abliterated_model is None or tokenizer is None
2371
  if not _needs_reload:
2372
  try:
2373
  dev = next(abliterated_model.parameters()).device
2374
+ if dev.type == "meta":
2375
+ _needs_reload = True
2376
+ elif torch.cuda.is_available() and dev.type != "cuda":
2377
  abliterated_model.to("cuda")
2378
+ except Exception:
2379
  _needs_reload = True
2380
 
2381
  if _needs_reload:
docs/RESEARCH_SURVEY.md CHANGED
@@ -116,7 +116,7 @@ The paper also analyzes how adversarial suffixes (e.g., GCG-generated) suppress
116
 
117
  ## 2. Gabliteration (arXiv:2512.18901) — Multi-Direction Subspace Approach {#2-gabliteration}
118
 
119
- **Author:** Gokdeniz Gulmez (independent research)
120
  **arXiv:** [2512.18901](https://arxiv.org/abs/2512.18901)
121
  **Version:** v3, revised January 28, 2026
122
  **Models:** [Hugging Face collection](https://huggingface.co/collections/Goekdeniz-Guelmez/gabliteration)
@@ -733,7 +733,7 @@ MI research helps make AI safe but could be used adversarially. The same techniq
733
 
734
  1. Arditi, A., Obeso, O., Syed, A., Paleka, D., Panickssery, N., Gurnee, W., & Nanda, N. (2024). Refusal in Language Models Is Mediated by a Single Direction. NeurIPS 2024. [arXiv:2406.11717](https://arxiv.org/abs/2406.11717)
735
 
736
- 2. Gulmez, G. (2025). Gabliteration: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models. [arXiv:2512.18901](https://arxiv.org/abs/2512.18901)
737
 
738
  3. grimjim. (2025). Norm-Preserving Biprojected Abliteration / MPOA. [HuggingFace Blog](https://huggingface.co/blog/grimjim/norm-preserving-biprojected-abliteration) | [Projected Abliteration](https://huggingface.co/blog/grimjim/projected-abliteration) | [Code](https://github.com/jim-plus/llm-abliteration)
739
 
 
116
 
117
  ## 2. Gabliteration (arXiv:2512.18901) — Multi-Direction Subspace Approach {#2-gabliteration}
118
 
119
+ **Author:** Gökdeniz Gülmez (independent research)
120
  **arXiv:** [2512.18901](https://arxiv.org/abs/2512.18901)
121
  **Version:** v3, revised January 28, 2026
122
  **Models:** [Hugging Face collection](https://huggingface.co/collections/Goekdeniz-Guelmez/gabliteration)
 
733
 
734
  1. Arditi, A., Obeso, O., Syed, A., Paleka, D., Panickssery, N., Gurnee, W., & Nanda, N. (2024). Refusal in Language Models Is Mediated by a Single Direction. NeurIPS 2024. [arXiv:2406.11717](https://arxiv.org/abs/2406.11717)
735
 
736
+ 2. Gülmez, G. (2026). Gabliteration: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models. [arXiv:2512.18901](https://arxiv.org/abs/2512.18901)
737
 
738
  3. grimjim. (2025). Norm-Preserving Biprojected Abliteration / MPOA. [HuggingFace Blog](https://huggingface.co/blog/grimjim/norm-preserving-biprojected-abliteration) | [Projected Abliteration](https://huggingface.co/blog/grimjim/projected-abliteration) | [Code](https://github.com/jim-plus/llm-abliteration)
739
 
docs/theory_journal.md CHANGED
@@ -1802,7 +1802,7 @@ implementations diverge from the closed-form GRRO solution.
1802
  ## References
1803
 
1804
  1. Arditi, A. et al. (2024). Refusal in Language Models Is Mediated by a Single Direction. NeurIPS 2024.
1805
- 2. Gulmez, G. (2025). Gabliteration. arXiv:2512.18901.
1806
  3. grimjim (2025). Norm-Preserving Biprojected Abliteration (MPOA). HuggingFace.
1807
  4. Wollschlager, T. et al. (2025). The Geometry of Refusal. ICML 2025.
1808
  5. Joad et al. (2026). There Is More to Refusal than a Single Direction. arXiv:2602.02132.
 
1802
  ## References
1803
 
1804
  1. Arditi, A. et al. (2024). Refusal in Language Models Is Mediated by a Single Direction. NeurIPS 2024.
1805
+ 2. Gülmez, G. (2026). Gabliteration: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models. arXiv:2512.18901.
1806
  3. grimjim (2025). Norm-Preserving Biprojected Abliteration (MPOA). HuggingFace.
1807
  4. Wollschlager, T. et al. (2025). The Geometry of Refusal. ICML 2025.
1808
  5. Joad et al. (2026). There Is More to Refusal than a Single Direction. arXiv:2602.02132.
notebooks/abliterate.ipynb CHANGED
@@ -53,7 +53,7 @@
53
  "id": "install"
54
  },
55
  "outputs": [],
56
- "source": "!pip install -q git+https://github.com/elder-plinius/OBLITERATUS.git\n!pip install -q accelerate bitsandbytes\n\nimport torch\nprint(f\"PyTorch {torch.__version__}\")\nprint(f\"CUDA available: {torch.cuda.is_available()}\")\nif torch.cuda.is_available():\n print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n print(f\"VRAM: {torch.cuda.get_device_properties(0).total_mem / 1024**3:.1f} GB\")"
57
  },
58
  {
59
  "cell_type": "markdown",
 
53
  "id": "install"
54
  },
55
  "outputs": [],
56
+ "source": "!pip install -q git+https://github.com/elder-plinius/OBLITERATUS.git\n!pip install -q accelerate bitsandbytes\n\nimport torch\nprint(f\"PyTorch {torch.__version__}\")\nprint(f\"CUDA available: {torch.cuda.is_available()}\")\nif torch.cuda.is_available():\n print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n print(f\"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB\")"
57
  },
58
  {
59
  "cell_type": "markdown",
obliteratus/abliterate.py CHANGED
@@ -334,7 +334,7 @@ METHODS = {
334
  "layer_selection": "middle60",
335
  },
336
  "gabliteration": {
337
- "label": "Gabliteration (Gulmez 2025 Baseline)",
338
  "description": (
339
  "Faithful reproduction of Gabliteration (arXiv:2512.18901). "
340
  "SVD-based multi-direction extraction (top-4), ridge-regularized "
@@ -2494,7 +2494,7 @@ class AbliterationPipeline:
2494
 
2495
  References:
2496
  - SteerMoE (Fayyaz et al., 2025): expert activation frequency analysis
2497
- - Gabliteration (Gulmez, 2025): multi-direction SVD abliteration
2498
  - SAFEx (Lai et al., NeurIPS 2025): safety expert identification
2499
  """
2500
  if not self._routing_harmful or not self._routing_harmless:
 
334
  "layer_selection": "middle60",
335
  },
336
  "gabliteration": {
337
+ "label": "Gabliteration (Gülmez 2026 Baseline)",
338
  "description": (
339
  "Faithful reproduction of Gabliteration (arXiv:2512.18901). "
340
  "SVD-based multi-direction extraction (top-4), ridge-regularized "
 
2494
 
2495
  References:
2496
  - SteerMoE (Fayyaz et al., 2025): expert activation frequency analysis
2497
+ - Gabliteration (Gülmez, 2026): multi-direction SVD abliteration
2498
  - SAFEx (Lai et al., NeurIPS 2025): safety expert identification
2499
  """
2500
  if not self._routing_harmful or not self._routing_harmless:
obliteratus/interactive.py CHANGED
@@ -24,7 +24,7 @@ def _detect_compute_tier() -> str:
24
  import torch
25
 
26
  if torch.cuda.is_available():
27
- vram_gb = torch.cuda.get_device_properties(0).total_mem / (1024**3)
28
  if vram_gb >= 20:
29
  return "large"
30
  elif vram_gb >= 8:
 
24
  import torch
25
 
26
  if torch.cuda.is_available():
27
+ vram_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)
28
  if vram_gb >= 20:
29
  return "large"
30
  elif vram_gb >= 8:
obliteratus/local_ui.py CHANGED
@@ -10,6 +10,7 @@ Usage:
10
  from __future__ import annotations
11
 
12
  import os
 
13
  import platform
14
  import shutil
15
  import sys
@@ -48,7 +49,7 @@ def _detect_gpu() -> list[dict]:
48
  {
49
  "index": i,
50
  "name": props.name,
51
- "vram_gb": round(props.total_mem / 1024**3, 1),
52
  "compute": f"{props.major}.{props.minor}",
53
  }
54
  )
@@ -292,6 +293,12 @@ def launch_local_ui(
292
  console.print("[dim]Loading OBLITERATUS UI (this may take a moment on first run)...[/dim]")
293
  start = time.time()
294
 
 
 
 
 
 
 
295
  from app import launch as app_launch
296
 
297
  elapsed = time.time() - start
 
10
  from __future__ import annotations
11
 
12
  import os
13
+ import pathlib
14
  import platform
15
  import shutil
16
  import sys
 
49
  {
50
  "index": i,
51
  "name": props.name,
52
+ "vram_gb": round(props.total_memory / 1024**3, 1),
53
  "compute": f"{props.major}.{props.minor}",
54
  }
55
  )
 
293
  console.print("[dim]Loading OBLITERATUS UI (this may take a moment on first run)...[/dim]")
294
  start = time.time()
295
 
296
+ # app.py lives at the project root, one level above this package.
297
+ # When installed via pip the root isn't on sys.path, so add it.
298
+ _project_root = str(pathlib.Path(__file__).resolve().parent.parent)
299
+ if _project_root not in sys.path:
300
+ sys.path.insert(0, _project_root)
301
+
302
  from app import launch as app_launch
303
 
304
  elapsed = time.time() - start
paper/references.bib CHANGED
@@ -7,11 +7,14 @@
7
  year={2024}
8
  }
9
 
10
- @article{gabliteration2024,
11
- title={{Gabliteration}: {SVD}-Based Multi-Direction Refusal Removal},
12
- author={Gabriel, Saul and {contributors}},
13
- journal={arXiv preprint arXiv:2512.18901},
14
- year={2024}
 
 
 
15
  }
16
 
17
  @misc{grimjim2025,
 
7
  year={2024}
8
  }
9
 
10
+ @misc{gabliteration2024,
11
+ title={{Gabliteration}: Adaptive Multi-Directional Neural Weight Modification for Selective Behavioral Alteration in Large Language Models},
12
+ author={G\"{o}kdeniz G\"{u}lmez},
13
+ year={2026},
14
+ eprint={2512.18901},
15
+ archivePrefix={arXiv},
16
+ primaryClass={cs.AI},
17
+ url={https://arxiv.org/abs/2512.18901}
18
  }
19
 
20
  @misc{grimjim2025,
scripts/benchmark_sota_comparison.py CHANGED
@@ -7,7 +7,7 @@ comparison tables with standardized community metrics.
7
 
8
  Baselines included:
9
  1. FailSpy/abliterator (2024) — Community workhorse baseline
10
- 2. Gabliteration (Gulmez 2025) — SVD multi-direction + ridge regularization
11
  3. Heretic / p-e-w (2025) — Bayesian TPE auto-tuning (current SOTA for quality)
12
  4. Wollschlager RDO (ICML 2025) — Gradient-based direction optimization
13
 
 
7
 
8
  Baselines included:
9
  1. FailSpy/abliterator (2024) — Community workhorse baseline
10
+ 2. Gabliteration (Gülmez 2026) — SVD multi-direction + ridge regularization
11
  3. Heretic / p-e-w (2025) — Bayesian TPE auto-tuning (current SOTA for quality)
12
  4. Wollschlager RDO (ICML 2025) — Gradient-based direction optimization
13