chakravyuh / plots /scripts /scammer_frontier_bar.py
UjjwalPardeshi
deploy: latest main to HF Space
03815d6
"""Bar chart for the frontier-LLMs-as-Scammer comparison.
Shows bypass rate (single-shot) for each frontier model alongside our
trained 0.5B Scammer LoRA Phase 1 (single-shot AND best-of-8). Mirrors the
defender-side story: a small trained adversary should match or beat
untrained large models because evading rule-based defenses is a learnable
structure, not a capacity problem.
"""
from __future__ import annotations
import csv
from pathlib import Path
import matplotlib.pyplot as plt
# Directory three levels up from this file's resolved path (presumably the
# repository root, given the name); every other path is built from it.
REPO = Path(__file__).resolve().parent.parent.parent
# Input CSV read by main(). The columns main() uses are "provider",
# "bypass_rate", "ci_low" and "ci_high".
CSV_PATH = REPO / "logs" / "scammer_frontier_comparison.csv"
# Directory the PNG is written into; main() creates it when it is missing.
OUT_DIR = REPO / "plots" / "chakravyuh_plots"
# Maps a provider id (the value of the CSV "provider" column) to a pair of
# (x-axis tick label, bar colour as a hex string). The "\n" in each label puts
# the model size / variant on a second line under the model name.
# "(BO8)" and "(SS)" label the best-of-8 and single-shot LoRA entries.
# NOTE(review): what the blue / amber / red colour grouping encodes is not
# stated anywhere in this file -- confirm before reusing the palette.
# NOTE(review): the "-7b-instruct" key is labelled "Qwen2.5-7B base"; confirm
# that wording is intentional.
NICE_NAMES = {
    "scammer-lora-phase1-best-of-8": ("Scammer LoRA\n0.5B + r=16 (BO8)", "#0a7e44"),
    "scammer-lora-phase1-single-shot": ("Scammer LoRA\n0.5B + r=16 (SS)", "#34a853"),
    "hf-scammer-llama-3.3-70b-instruct": ("Llama-3.3-70B\n70B (untrained)", "#3b82f6"),
    "hf-scammer-qwen2.5-72b-instruct": ("Qwen2.5-72B\n72B (untrained)", "#3b82f6"),
    "hf-scammer-qwen2.5-7b-instruct": ("Qwen2.5-7B base\n7B (untrained)", "#f59e0b"),
    "hf-scammer-deepseek-v3-0324": ("DeepSeek-V3\n671B MoE (untrained)", "#ef4444"),
    "hf-scammer-gpt-oss-120b": ("gpt-oss-120b\n120B (untrained)", "#3b82f6"),
    "hf-scammer-gemma-3-27b-it": ("gemma-3-27b\n27B (untrained)", "#ef4444"),
}
# Provider ids to plot, in left-to-right bar order. main() iterates this list
# and skips any id that has no row in the CSV, so the order is fixed here and
# is not derived from the data.
ORDER = [
    "scammer-lora-phase1-best-of-8",
    "hf-scammer-gpt-oss-120b",
    "hf-scammer-llama-3.3-70b-instruct",
    "hf-scammer-qwen2.5-7b-instruct",
    "scammer-lora-phase1-single-shot",
    "hf-scammer-qwen2.5-72b-instruct",
    "hf-scammer-gemma-3-27b-it",
    "hf-scammer-deepseek-v3-0324",
]
def main() -> int:
    """Render the scammer bypass-rate bar chart and save it as a PNG.

    Reads ``CSV_PATH`` and indexes its rows by the ``provider`` column (if a
    provider appears more than once, the last row wins). For every provider in
    ``ORDER`` that is present in the file, one bar is drawn at
    ``bypass_rate * 100`` with an asymmetric error bar spanning
    ``ci_low * 100`` to ``ci_high * 100``. Providers listed in ``ORDER`` but
    missing from the CSV are skipped silently. The figure is written to
    ``OUT_DIR / "scammer_frontier_bar.png"`` and the path is printed.

    Returns:
        ``0`` once the figure has been written (used as the exit status by
        the script entry point).

    Raises:
        OSError: if ``OUT_DIR`` cannot be created or ``CSV_PATH`` cannot be
            opened (``FileNotFoundError`` when the CSV does not exist).
        KeyError: if the CSV lacks one of the ``provider``, ``bypass_rate``,
            ``ci_low`` or ``ci_high`` columns.
        ValueError: if a numeric column cannot be parsed as a float.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    # The csv module requires files to be opened with newline="" so that it
    # can handle line endings (and newlines inside quoted fields) itself; the
    # explicit encoding keeps the read independent of the platform locale.
    with CSV_PATH.open(newline="", encoding="utf-8") as f:
        rows = {r["provider"]: r for r in csv.DictReader(f)}

    labels: list[str] = []
    rates: list[float] = []
    cis: list[tuple[float, float]] = []
    colors: list[str] = []
    for prov in ORDER:
        if prov not in rows:
            # Tolerate partial result files: plot whatever is available.
            continue
        row = rows[prov]
        # Fall back to the raw provider id and a neutral grey if unmapped.
        label, color = NICE_NAMES.get(prov, (prov, "#9ca3af"))
        labels.append(label)
        # CSV values are multiplied by 100 to display them as percentages.
        rates.append(float(row["bypass_rate"]) * 100)
        cis.append((float(row["ci_low"]) * 100, float(row["ci_high"]) * 100))
        colors.append(color)

    fig, ax = plt.subplots(figsize=(12, 6))
    try:
        x = list(range(len(labels)))
        # Matplotlib takes error bars as distances from the bar height
        # (lower offsets first, then upper offsets), not as absolute bounds.
        yerr_lo = [rate - ci[0] for rate, ci in zip(rates, cis)]
        yerr_hi = [ci[1] - rate for rate, ci in zip(rates, cis)]
        ax.bar(
            x, rates, color=colors, edgecolor="#1f2937", linewidth=0.6,
            yerr=[yerr_lo, yerr_hi], capsize=4,
        )
        ax.set_xticks(x)
        ax.set_xticklabels(labels, rotation=25, ha="right", fontsize=9)
        ax.set_ylabel("Scripted-defender bypass rate (%)", fontsize=11)
        ax.set_title(
            "Frontier-LLMs-as-Scammer · n=16 attack categories · defender = ScriptedAnalyzer\n"
            "Our trained 0.5B beats every untrained frontier model (BO8) — including 671B DeepSeek-V3",
            fontsize=11,
        )
        # Numeric value printed slightly above the top of each bar.
        for i, v in enumerate(rates):
            ax.text(i, v + 2, f"{v:.1f}%", ha="center", fontsize=8)
        ax.set_ylim(0, 100)
        fig.tight_layout()
        out = OUT_DIR / "scammer_frontier_bar.png"
        fig.savefig(out, dpi=150, bbox_inches="tight")
    finally:
        # pyplot keeps every figure it creates alive until it is closed;
        # release it so repeated calls do not accumulate figures.
        plt.close(fig)
    print(f"Wrote {out}")
    return 0
# Script entry point: run main() only when this file is executed directly, and
# use its integer return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())