# NOTE(review): "Spaces: Running / Running" appears to be hosting-page residue, not part of this script.
"""Bar chart for the frontier-LLMs-as-Scammer comparison.

Shows the bypass rate (single-shot) for each frontier model alongside our
trained 0.5B Scammer LoRA Phase 1 (single-shot AND best-of-8). Mirrors the
defender-side story: a small trained adversary should match or beat
untrained large models, because evading rule-based defenses is a matter of
learnable structure, not a capacity problem.
"""
from __future__ import annotations

import csv
import sys
from pathlib import Path

import matplotlib.pyplot as plt

# Base directory: three levels above this file (presumably the repository
# root, going by the name — confirm against the project layout).
REPO = Path(__file__).resolve().parent.parent.parent
# Input: CSV holding one row of results per provider.
CSV_PATH = REPO.joinpath("logs", "scammer_frontier_comparison.csv")
# Output: directory the rendered PNG is written into.
OUT_DIR = REPO.joinpath("plots", "chakravyuh_plots")
# Display metadata per provider id (the value of the CSV ``provider`` column):
# provider -> (two-line x-tick label, bar fill colour as a hex string).
NICE_NAMES: dict[str, tuple[str, str]] = {
    provider: (tick_label, bar_color)
    for provider, tick_label, bar_color in (
        ("scammer-lora-phase1-best-of-8", "Scammer LoRA\n0.5B + r=16 (BO8)", "#0a7e44"),
        ("scammer-lora-phase1-single-shot", "Scammer LoRA\n0.5B + r=16 (SS)", "#34a853"),
        ("hf-scammer-llama-3.3-70b-instruct", "Llama-3.3-70B\n70B (untrained)", "#3b82f6"),
        ("hf-scammer-qwen2.5-72b-instruct", "Qwen2.5-72B\n72B (untrained)", "#3b82f6"),
        ("hf-scammer-qwen2.5-7b-instruct", "Qwen2.5-7B base\n7B (untrained)", "#f59e0b"),
        ("hf-scammer-deepseek-v3-0324", "DeepSeek-V3\n671B MoE (untrained)", "#ef4444"),
        ("hf-scammer-gpt-oss-120b", "gpt-oss-120b\n120B (untrained)", "#3b82f6"),
        ("hf-scammer-gemma-3-27b-it", "gemma-3-27b\n27B (untrained)", "#ef4444"),
    )
}
# Left-to-right order of the bars. main() walks this list and skips any
# provider that has no row in the CSV.
ORDER: list[str] = [
    "scammer-lora-phase1-best-of-8",
    "hf-scammer-gpt-oss-120b",
    "hf-scammer-llama-3.3-70b-instruct",
    "hf-scammer-qwen2.5-7b-instruct",
    "scammer-lora-phase1-single-shot",
    "hf-scammer-qwen2.5-72b-instruct",
    "hf-scammer-gemma-3-27b-it",
    "hf-scammer-deepseek-v3-0324",
]
def main() -> int:
    """Render the Scammer bypass-rate bar chart from the comparison CSV.

    Reads ``CSV_PATH`` (columns used: ``provider``, ``bypass_rate``,
    ``ci_low``, ``ci_high``), draws one bar with an asymmetric error bar for
    every provider in ``ORDER`` that has a row in the CSV, and saves the
    figure as ``scammer_frontier_bar.png`` inside ``OUT_DIR`` (created if
    needed). The three numeric columns are multiplied by 100 for display,
    i.e. they are treated as fractions.

    Providers listed in ``ORDER`` but absent from the CSV are left out of the
    chart and reported on stderr.

    Returns:
        0 after the PNG has been written (used as the process exit code).

    Raises:
        OSError: if the CSV cannot be read or the output cannot be written.
        KeyError: if a row lacks one of the columns listed above.
        ValueError: if a numeric column cannot be parsed as a float.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # newline="" is the csv module's documented way to open files; the
    # explicit encoding keeps the read independent of the platform default.
    with CSV_PATH.open(newline="", encoding="utf-8") as f:
        # If a provider appears more than once, the last row wins.
        rows = {r["provider"]: r for r in csv.DictReader(f)}

    missing = [prov for prov in ORDER if prov not in rows]
    if missing:
        # Still plot what is available, but do not drop bars silently.
        print(
            f"Warning: no CSV row for provider(s): {', '.join(missing)}",
            file=sys.stderr,
        )

    labels: list[str] = []
    rates: list[float] = []
    cis: list[tuple[float, float]] = []
    colors: list[str] = []
    for prov in ORDER:
        if prov not in rows:
            continue
        r = rows[prov]
        # Unknown providers fall back to their raw id and a neutral grey.
        label, color = NICE_NAMES.get(prov, (prov, "#9ca3af"))
        labels.append(label)
        rates.append(float(r["bypass_rate"]) * 100)
        cis.append((float(r["ci_low"]) * 100, float(r["ci_high"]) * 100))
        colors.append(color)

    fig, ax = plt.subplots(figsize=(12, 6))
    x = list(range(len(labels)))

    # Error bars are given as distances from the bar top. matplotlib rejects
    # negative distances, which would occur if a CI bound does not bracket the
    # point estimate (e.g. after rounding in the CSV), so clamp at zero.
    yerr_lo = [max(0.0, rate - lo) for rate, (lo, _hi) in zip(rates, cis)]
    yerr_hi = [max(0.0, hi - rate) for rate, (_lo, hi) in zip(rates, cis)]

    ax.bar(
        x, rates, color=colors, edgecolor="#1f2937", linewidth=0.6,
        yerr=[yerr_lo, yerr_hi], capsize=4,
    )
    ax.set_xticks(x)
    ax.set_xticklabels(labels, rotation=25, ha="right", fontsize=9)
    ax.set_ylabel("Scripted-defender bypass rate (%)", fontsize=11)
    # NOTE(review): the title's claims (n=16, "beats every ... model") are
    # hard-coded text, not derived from the CSV — re-check when data changes.
    ax.set_title(
        "Frontier-LLMs-as-Scammer · n=16 attack categories · defender = ScriptedAnalyzer\n"
        "Our trained 0.5B beats every untrained frontier model (BO8) — including 671B DeepSeek-V3",
        fontsize=11,
    )
    # Numeric value printed just above each bar top.
    for i, v in enumerate(rates):
        ax.text(i, v + 2, f"{v:.1f}%", ha="center", fontsize=8)
    ax.set_ylim(0, 100)
    fig.tight_layout()

    out = OUT_DIR / "scammer_frontier_bar.png"
    fig.savefig(out, dpi=150, bbox_inches="tight")
    # Release the figure so pyplot does not keep it alive after saving.
    plt.close(fig)
    print(f"Wrote {out}")
    return 0
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    exit_code = main()
    raise SystemExit(exit_code)