pliny-the-prompter committed on
Commit
7fa1eee
·
verified ·
1 Parent(s): 4f13809

Upload 133 files

Browse files
app.py CHANGED
@@ -1089,16 +1089,20 @@ def _generate_analysis_figs(pipeline, model_label: str = "") -> list:
1089
 
1090
  suffix = f" — {model_label}" if model_label else ""
1091
 
 
 
1092
  heatmap_fig = plot_cross_layer_heatmap(
1093
  result,
1094
- output_path=tempfile.mktemp(suffix=".png"),
1095
  title=f"Cross-Layer Direction Alignment{suffix}",
1096
  )
1097
  figs.append(heatmap_fig)
1098
 
 
 
1099
  drift_fig = plot_angular_drift(
1100
  result,
1101
- output_path=tempfile.mktemp(suffix=".png"),
1102
  title=f"Refusal Direction Angular Drift{suffix}",
1103
  )
1104
  figs.append(drift_fig)
@@ -1121,9 +1125,11 @@ def _generate_analysis_figs(pipeline, model_label: str = "") -> list:
1121
  proxy_harmless[idx] = torch.zeros_like(d_f).unsqueeze(0)
1122
  proxy_harmful[idx] = (d_f * norm).unsqueeze(0)
1123
 
 
 
1124
  topo_fig = plot_refusal_topology(
1125
  directions, proxy_harmful, proxy_harmless, list(strong_layers),
1126
- output_path=tempfile.mktemp(suffix=".png"),
1127
  title=f"Refusal Topology Map{suffix}",
1128
  )
1129
  figs.append(topo_fig)
@@ -5081,7 +5087,7 @@ The winner is saved locally — push it to HuggingFace Hub from the **Push to Hu
5081
  Download all intermediate data from your last obliteration run as a ZIP archive.
5082
 
5083
  **Contents:**
5084
- - `refusal_directions.pt` — Per-layer refusal direction tensors (load with `torch.load()`)
5085
  - `config.json` — Full pipeline configuration, strong layers, direction dimensions
5086
  - `results.csv` — Quality metrics (perplexity, coherence, refusal rate)
5087
  - `pipeline_log.txt` — Complete pipeline execution log
@@ -5540,6 +5546,20 @@ if __name__ == "__main__":
5540
  _parser.add_argument("--auth", type=str, default=None, help="Basic auth as user:pass")
5541
  _args = _parser.parse_args()
5542
  _auth = tuple(_args.auth.split(":", 1)) if _args.auth else None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5543
  launch(
5544
  server_name=_args.host,
5545
  server_port=_args.port,
 
1089
 
1090
  suffix = f" — {model_label}" if model_label else ""
1091
 
1092
+ _fd1, _heatmap_path = tempfile.mkstemp(suffix=".png")
1093
+ os.close(_fd1)
1094
  heatmap_fig = plot_cross_layer_heatmap(
1095
  result,
1096
+ output_path=_heatmap_path,
1097
  title=f"Cross-Layer Direction Alignment{suffix}",
1098
  )
1099
  figs.append(heatmap_fig)
1100
 
1101
+ _fd2, _drift_path = tempfile.mkstemp(suffix=".png")
1102
+ os.close(_fd2)
1103
  drift_fig = plot_angular_drift(
1104
  result,
1105
+ output_path=_drift_path,
1106
  title=f"Refusal Direction Angular Drift{suffix}",
1107
  )
1108
  figs.append(drift_fig)
 
1125
  proxy_harmless[idx] = torch.zeros_like(d_f).unsqueeze(0)
1126
  proxy_harmful[idx] = (d_f * norm).unsqueeze(0)
1127
 
1128
+ _fd3, _topo_path = tempfile.mkstemp(suffix=".png")
1129
+ os.close(_fd3)
1130
  topo_fig = plot_refusal_topology(
1131
  directions, proxy_harmful, proxy_harmless, list(strong_layers),
1132
+ output_path=_topo_path,
1133
  title=f"Refusal Topology Map{suffix}",
1134
  )
1135
  figs.append(topo_fig)
 
5087
  Download all intermediate data from your last obliteration run as a ZIP archive.
5088
 
5089
  **Contents:**
5090
+ - `refusal_directions.pt` — Per-layer refusal direction tensors (load with `torch.load(..., weights_only=True)`)
5091
  - `config.json` — Full pipeline configuration, strong layers, direction dimensions
5092
  - `results.csv` — Quality metrics (perplexity, coherence, refusal rate)
5093
  - `pipeline_log.txt` — Complete pipeline execution log
 
5546
  _parser.add_argument("--auth", type=str, default=None, help="Basic auth as user:pass")
5547
  _args = _parser.parse_args()
5548
  _auth = tuple(_args.auth.split(":", 1)) if _args.auth else None
5549
+ if _args.share and _auth is None:
5550
+ import warnings as _w
5551
+ _w.warn(
5552
+ "WARNING: --share creates a public link without authentication. "
5553
+ "Anyone with the link can access the UI. Use --auth user:pass to restrict access.",
5554
+ stacklevel=1,
5555
+ )
5556
+ if _args.host == "0.0.0.0" and _auth is None and not os.environ.get("SPACE_ID"):
5557
+ import warnings as _w
5558
+ _w.warn(
5559
+ "WARNING: Binding to 0.0.0.0 exposes the UI to all network interfaces without authentication. "
5560
+ "Use --auth user:pass or --host 127.0.0.1 for local-only access.",
5561
+ stacklevel=1,
5562
+ )
5563
  launch(
5564
  server_name=_args.host,
5565
  server_port=_args.port,
docs/index.html CHANGED
@@ -2015,7 +2015,14 @@ function setAblMethod(m) {
2015
  ablMethod = m;
2016
  document.querySelectorAll('.method-radio').forEach(el => el.classList.remove('selected'));
2017
  document.getElementById('method-' + m).classList.add('selected');
2018
- document.getElementById('method-details').innerHTML = METHOD_INFO[m].desc;
 
 
 
 
 
 
 
2019
  updateCmdDisplay();
2020
  }
2021
 
@@ -2127,7 +2134,11 @@ async function simulatePipeline() {
2127
  stages.forEach(s => s.classList.remove('active','done'));
2128
  connectors.forEach(c => c.classList.remove('active'));
2129
  const modelName = ablSelectedModel || 'meta-llama/Llama-3.1-8B-Instruct';
2130
- logEl.innerHTML = `<div class="log-line stage-line">[ ABLITERATION PIPELINE — ${modelName} ]</div>`;
 
 
 
 
2131
 
2132
  function addLog(text, cls='') {
2133
  const line = document.createElement('div');
 
2015
  ablMethod = m;
2016
  document.querySelectorAll('.method-radio').forEach(el => el.classList.remove('selected'));
2017
  document.getElementById('method-' + m).classList.add('selected');
2018
+ const detailsEl = document.getElementById('method-details');
2019
+ // METHOD_INFO descriptions are hardcoded constants — safe for innerHTML.
2020
+ // Guard against unknown or inherited keys (e.g. 'constructor') so only METHOD_INFO's own entries are rendered.
2021
+ if (Object.prototype.hasOwnProperty.call(METHOD_INFO, m)) {
2022
+ detailsEl.innerHTML = METHOD_INFO[m].desc;
2023
+ } else {
2024
+ detailsEl.textContent = '';
2025
+ }
2026
  updateCmdDisplay();
2027
  }
2028
 
 
2134
  stages.forEach(s => s.classList.remove('active','done'));
2135
  connectors.forEach(c => c.classList.remove('active'));
2136
  const modelName = ablSelectedModel || 'meta-llama/Llama-3.1-8B-Instruct';
2137
+ logEl.textContent = '';
2138
+ const headerLine = document.createElement('div');
2139
+ headerLine.className = 'log-line stage-line';
2140
+ headerLine.textContent = `[ ABLITERATION PIPELINE — ${modelName} ]`;
2141
+ logEl.appendChild(headerLine);
2142
 
2143
  function addLog(text, cls='') {
2144
  const line = document.createElement('div');
obliteratus/adaptive_defaults.py CHANGED
@@ -428,10 +428,6 @@ def get_adaptive_recommendation(
428
  (arch_class, reasoning_class, param_bucket),
429
  ]
430
 
431
- # Also check model-specific records (exact model name match)
432
- # This is for the future when we have enough data per-model
433
- model_short = model_name.split("/")[-1].lower() if model_name else ""
434
-
435
  bucket = None
436
  used_key = None
437
  for key in candidates:
@@ -689,8 +685,6 @@ def format_recommendation(rec: AdaptiveRecommendation) -> str:
689
  lines.append("| Rank | Method | Mean Score | Runs |")
690
  lines.append("|------|--------|------------|------|")
691
  for i, (name, score) in enumerate(rec.method_ranking[:8], 1):
692
- ms_runs = 0
693
- # Get run count from the knowledge (not stored directly, but we have n_method_records for winner)
694
  lines.append(f"| {i} | `{name}` | {score:.4f} | — |")
695
  lines.append("")
696
 
@@ -828,7 +822,7 @@ def refresh_knowledge_base() -> dict[str, Any] | None:
828
  return None
829
 
830
  knowledge = build_knowledge_base(records)
831
- snapshot_path = save_snapshot(knowledge)
832
 
833
  # Also compute and log global insights for visibility
834
  insights = get_global_insights(knowledge)
 
428
  (arch_class, reasoning_class, param_bucket),
429
  ]
430
 
 
 
 
 
431
  bucket = None
432
  used_key = None
433
  for key in candidates:
 
685
  lines.append("| Rank | Method | Mean Score | Runs |")
686
  lines.append("|------|--------|------------|------|")
687
  for i, (name, score) in enumerate(rec.method_ranking[:8], 1):
 
 
688
  lines.append(f"| {i} | `{name}` | {score:.4f} | — |")
689
  lines.append("")
690
 
 
822
  return None
823
 
824
  knowledge = build_knowledge_base(records)
825
+ save_snapshot(knowledge)
826
 
827
  # Also compute and log global insights for visibility
828
  insights = get_global_insights(knowledge)
obliteratus/architecture_profiles.py CHANGED
@@ -20,7 +20,10 @@ import logging
20
  import re
21
  from dataclasses import dataclass, field
22
  from enum import Enum
23
- from typing import Any
 
 
 
24
 
25
  logger = logging.getLogger(__name__)
26
 
 
20
  import re
21
  from dataclasses import dataclass, field
22
  from enum import Enum
23
+ from typing import Any, TYPE_CHECKING
24
+
25
+ if TYPE_CHECKING:
26
+ from obliteratus.adaptive_defaults import AdaptiveRecommendation
27
 
28
  logger = logging.getLogger(__name__)
29
 
obliteratus/bayesian_optimizer.py CHANGED
@@ -30,7 +30,6 @@ References:
30
  from __future__ import annotations
31
 
32
  import logging
33
- import math
34
  from typing import TYPE_CHECKING
35
 
36
  import torch
 
30
  from __future__ import annotations
31
 
32
  import logging
 
33
  from typing import TYPE_CHECKING
34
 
35
  import torch
obliteratus/cli.py CHANGED
@@ -483,7 +483,7 @@ def _cmd_recommend(args):
483
 
484
 
485
  def _cmd_tourney(args):
486
- from obliteratus.tourney import TourneyRunner, render_bracket
487
 
488
  def on_log(msg):
489
  console.print(msg)
 
483
 
484
 
485
  def _cmd_tourney(args):
486
+ from obliteratus.tourney import TourneyRunner
487
 
488
  def on_log(msg):
489
  console.print(msg)
obliteratus/evaluation/heretic_eval.py CHANGED
@@ -639,6 +639,7 @@ def _run_lm_eval_python(
639
  output_dir: str | None,
640
  ) -> dict:
641
  """Run lm-evaluation-harness via Python API."""
 
642
  import lm_eval
643
 
644
  # Build per-task num_fewshot overrides
 
639
  output_dir: str | None,
640
  ) -> dict:
641
  """Run lm-evaluation-harness via Python API."""
642
+ model_path = _sanitize_model_path(model_path)
643
  import lm_eval
644
 
645
  # Build per-task num_fewshot overrides
obliteratus/informed_pipeline.py CHANGED
@@ -785,7 +785,7 @@ class InformedAbliterationPipeline(AbliterationPipeline):
785
  if self.direction_method == "leace":
786
  from obliteratus.analysis.leace import LEACEExtractor
787
  leace_extractor = LEACEExtractor()
788
- self.log(f"Using LEACE (closed-form optimal concept erasure)")
789
 
790
  if self.use_whitened_svd and self.n_directions > 1 and leace_extractor is None:
791
  from obliteratus.analysis.whitened_svd import WhitenedSVDExtractor
 
785
  if self.direction_method == "leace":
786
  from obliteratus.analysis.leace import LEACEExtractor
787
  leace_extractor = LEACEExtractor()
788
+ self.log("Using LEACE (closed-form optimal concept erasure)")
789
 
790
  if self.use_whitened_svd and self.n_directions > 1 and leace_extractor is None:
791
  from obliteratus.analysis.whitened_svd import WhitenedSVDExtractor
obliteratus/mlx_backend.py CHANGED
@@ -387,7 +387,6 @@ def save_model(
387
  """
388
  _require_mlx()
389
 
390
- from mlx_lm import convert # type: ignore[import-untyped]
391
 
392
  out = Path(output_dir)
393
  out.mkdir(parents=True, exist_ok=True)
@@ -426,7 +425,6 @@ def torch_tensor_to_mlx(tensor: "torch.Tensor") -> Any: # noqa: F821
426
  """Convert a PyTorch tensor to an MLX array."""
427
  _require_mlx()
428
  import mlx.core as mx # type: ignore[import-untyped]
429
- import numpy as np
430
 
431
  # Move to CPU and convert via numpy
432
  np_array = tensor.detach().cpu().float().numpy()
 
387
  """
388
  _require_mlx()
389
 
 
390
 
391
  out = Path(output_dir)
392
  out.mkdir(parents=True, exist_ok=True)
 
425
  """Convert a PyTorch tensor to an MLX array."""
426
  _require_mlx()
427
  import mlx.core as mx # type: ignore[import-untyped]
 
428
 
429
  # Move to CPU and convert via numpy
430
  np_array = tensor.detach().cpu().float().numpy()
obliteratus/telemetry.py CHANGED
@@ -367,20 +367,31 @@ def _sync_to_hub_bg() -> None:
367
  ensuring all data lands in the same dataset repository.
368
  Uses _sync_in_progress event to prevent overlapping uploads.
369
  """
370
- if _sync_in_progress.is_set():
371
- return # Another sync is already running
372
- _sync_in_progress.set()
 
 
 
373
  try:
374
  repo = _TELEMETRY_REPO
375
  if not repo:
 
376
  return
377
  if not TELEMETRY_FILE.exists():
 
 
 
 
 
 
378
  return
379
 
380
  from huggingface_hub import HfApi
381
  if not _ensure_hub_repo(repo):
 
382
  return
383
- api = HfApi(token=os.environ.get("HF_TOKEN"))
384
  slug = _instance_slug()
385
  api.upload_file(
386
  path_or_fileobj=str(TELEMETRY_FILE),
@@ -389,13 +400,16 @@ def _sync_to_hub_bg() -> None:
389
  repo_type="dataset",
390
  commit_message=f"Auto-sync telemetry from {slug}",
391
  )
392
- logger.info(f"Synced telemetry to {repo}/data/{slug}.jsonl")
393
  except Exception as e:
394
- logger.warning(f"Hub sync failed: {e}")
395
  finally:
396
  _sync_in_progress.clear()
397
 
398
 
 
 
 
399
  def _schedule_hub_sync() -> None:
400
  """Schedule a debounced background sync of local telemetry to Hub.
401
 
@@ -404,7 +418,7 @@ def _schedule_hub_sync() -> None:
404
  - Telemetry is disabled
405
  - Last sync was less than _HUB_SYNC_INTERVAL seconds ago
406
  """
407
- global _hub_sync_last
408
  if not _TELEMETRY_REPO:
409
  return
410
  if not is_enabled():
@@ -418,6 +432,23 @@ def _schedule_hub_sync() -> None:
418
 
419
  t = threading.Thread(target=_sync_to_hub_bg, daemon=True)
420
  t.start()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
421
 
422
 
423
  def fetch_hub_records(max_records: int = 10000) -> list[dict[str, Any]]:
@@ -515,8 +546,14 @@ def _fetch_via_git_clone(repo: str, max_records: int) -> list[dict[str, Any]]:
515
  clone_dir = Path(tempfile.mkdtemp(prefix="obliteratus_telemetry_"))
516
 
517
  try:
518
- env = dict(os.environ)
 
 
 
 
 
519
  env["GIT_LFS_SKIP_SMUDGE"] = "1"
 
520
  result = subprocess.run(
521
  ["git", "clone", "--depth", "1", clone_url, str(clone_dir)],
522
  capture_output=True, text=True, timeout=60, env=env,
 
367
  ensuring all data lands in the same dataset repository.
368
  Uses _sync_in_progress event to prevent overlapping uploads.
369
  """
370
+ # Atomic check-and-set to prevent concurrent syncs (Event lacks
371
+ # compare-and-swap, so we use a lock for correctness).
372
+ with _hub_sync_lock:
373
+ if _sync_in_progress.is_set():
374
+ return # Another sync is already running
375
+ _sync_in_progress.set()
376
  try:
377
  repo = _TELEMETRY_REPO
378
  if not repo:
379
+ logger.debug("Hub sync skipped: no telemetry repo configured")
380
  return
381
  if not TELEMETRY_FILE.exists():
382
+ logger.debug("Hub sync skipped: telemetry file does not exist")
383
+ return
384
+
385
+ token = os.environ.get("HF_TOKEN")
386
+ if not token:
387
+ logger.warning("Hub sync skipped: HF_TOKEN not set — auto-sync requires a write token")
388
  return
389
 
390
  from huggingface_hub import HfApi
391
  if not _ensure_hub_repo(repo):
392
+ logger.warning("Hub sync skipped: could not verify repo %s exists", repo)
393
  return
394
+ api = HfApi(token=token)
395
  slug = _instance_slug()
396
  api.upload_file(
397
  path_or_fileobj=str(TELEMETRY_FILE),
 
400
  repo_type="dataset",
401
  commit_message=f"Auto-sync telemetry from {slug}",
402
  )
403
+ logger.info("Synced telemetry to %s/data/%s.jsonl", repo, slug)
404
  except Exception as e:
405
+ logger.warning("Hub sync failed (will retry on next benchmark): %s", e)
406
  finally:
407
  _sync_in_progress.clear()
408
 
409
 
410
+ _active_sync_thread: threading.Thread | None = None
411
+
412
+
413
  def _schedule_hub_sync() -> None:
414
  """Schedule a debounced background sync of local telemetry to Hub.
415
 
 
418
  - Telemetry is disabled
419
  - Last sync was less than _HUB_SYNC_INTERVAL seconds ago
420
  """
421
+ global _hub_sync_last, _active_sync_thread
422
  if not _TELEMETRY_REPO:
423
  return
424
  if not is_enabled():
 
432
 
433
  t = threading.Thread(target=_sync_to_hub_bg, daemon=True)
434
  t.start()
435
+ _active_sync_thread = t
436
+
437
+
438
+ def _flush_sync_on_exit() -> None:
439
+ """Atexit handler: wait for any in-flight Hub sync to finish.
440
+
441
+ On ZeroGPU Spaces the worker process is killed after each request.
442
+ Without this, the daemon sync thread gets killed mid-upload and
443
+ telemetry silently fails to reach the Hub.
444
+ """
445
+ t = _active_sync_thread
446
+ if t is not None and t.is_alive():
447
+ t.join(timeout=30)
448
+
449
+
450
+ import atexit
451
+ atexit.register(_flush_sync_on_exit)
452
 
453
 
454
  def fetch_hub_records(max_records: int = 10000) -> list[dict[str, Any]]:
 
546
  clone_dir = Path(tempfile.mkdtemp(prefix="obliteratus_telemetry_"))
547
 
548
  try:
549
+ # Only pass necessary env vars to subprocess — avoid leaking secrets
550
+ _safe_keys = {"PATH", "HOME", "USER", "LANG", "LC_ALL", "TMPDIR",
551
+ "GIT_TERMINAL_PROMPT", "GIT_LFS_SKIP_SMUDGE",
552
+ "HTTP_PROXY", "HTTPS_PROXY", "NO_PROXY",
553
+ "http_proxy", "https_proxy", "no_proxy"}
554
+ env = {k: v for k, v in os.environ.items() if k in _safe_keys}
555
  env["GIT_LFS_SKIP_SMUDGE"] = "1"
556
+ env["GIT_TERMINAL_PROMPT"] = "0"
557
  result = subprocess.run(
558
  ["git", "clone", "--depth", "1", clone_url, str(clone_dir)],
559
  capture_output=True, text=True, timeout=60, env=env,
obliteratus/tourney.py CHANGED
@@ -871,7 +871,6 @@ class TourneyRunner:
871
  verify_sample_size: int = 30,
872
  ) -> Contender:
873
  """Run a single abliteration method and return its Contender result."""
874
- import torch
875
 
876
  t0 = time.time()
877
  contender = Contender(method=method)
@@ -1017,7 +1016,7 @@ class TourneyRunner:
1017
  )
1018
 
1019
  n_methods = len(self.methods)
1020
- self.log(f"OBLITERATUS TOURNEY")
1021
  self.log(f"Model: {self.model_name}")
1022
  self.log(f"Contenders: {n_methods} methods")
1023
  self.log(f"Dataset: {self.dataset_key}")
@@ -1247,7 +1246,7 @@ class TourneyRunner:
1247
  if resuming and resume_round_spec:
1248
  # We have an interrupted round to finish — schedule it first,
1249
  # then let the dynamic scheduling add subsequent rounds.
1250
- ir = resume_round_spec
1251
  skip_completed_rounds = len(result.rounds)
1252
  else:
1253
  skip_completed_rounds = 0
@@ -1357,11 +1356,11 @@ class TourneyRunner:
1357
  quantization=self.quantization,
1358
  methods=self.methods,
1359
  )
1360
- self.log(f"\nGPU SESSION INTERRUPTED — checkpoint saved")
1361
  self.log(f" Reason: {exc}")
1362
  self.log(f" Completed: {len(rnd.contenders)} methods in round {round_num}")
1363
  self.log(f" Remaining: {len(still_remaining)} methods")
1364
- self.log(f" Click Run again to resume automatically.")
1365
  raise
1366
 
1367
  rnd.contenders.append(contender)
 
871
  verify_sample_size: int = 30,
872
  ) -> Contender:
873
  """Run a single abliteration method and return its Contender result."""
 
874
 
875
  t0 = time.time()
876
  contender = Contender(method=method)
 
1016
  )
1017
 
1018
  n_methods = len(self.methods)
1019
+ self.log("OBLITERATUS TOURNEY")
1020
  self.log(f"Model: {self.model_name}")
1021
  self.log(f"Contenders: {n_methods} methods")
1022
  self.log(f"Dataset: {self.dataset_key}")
 
1246
  if resuming and resume_round_spec:
1247
  # We have an interrupted round to finish — schedule it first,
1248
  # then let the dynamic scheduling add subsequent rounds.
1249
+ _ = resume_round_spec # noqa: F841 — consumed by dynamic scheduling below
1250
  skip_completed_rounds = len(result.rounds)
1251
  else:
1252
  skip_completed_rounds = 0
 
1356
  quantization=self.quantization,
1357
  methods=self.methods,
1358
  )
1359
+ self.log("\nGPU SESSION INTERRUPTED — checkpoint saved")
1360
  self.log(f" Reason: {exc}")
1361
  self.log(f" Completed: {len(rnd.contenders)} methods in round {round_num}")
1362
  self.log(f" Remaining: {len(still_remaining)} methods")
1363
+ self.log(" Click Run again to resume automatically.")
1364
  raise
1365
 
1366
  rnd.contenders.append(contender)