chakravyuh / plots /scripts /frontier_comparison_bar.py
UjjwalPardeshi
deploy: latest main to HF Space
03815d6
"""Render a side-by-side FPR / F1 bar chart for the 7-model frontier comparison.
The README hero already has the per-difficulty plot; this complement adds the
hero artifact for the open-weight frontier comparison so judges scrolling
the README see the whole story without leaving for the CSV. The output PNG lands
in ``plots/chakravyuh_plots/`` so the README can embed it via raw GitHub URL.
"""
from __future__ import annotations
import csv
from pathlib import Path
import matplotlib.pyplot as plt
# Repository root, reached by walking three ``.parent`` hops up from this file.
REPO: Path = Path(__file__).resolve().parent.parent.parent
# Input: one row per evaluated provider (read by ``_load``).
CSV_PATH: Path = REPO.joinpath("logs", "frontier_comparison.csv")
# Output directory for the rendered chart.
OUT_DIR: Path = REPO.joinpath("plots", "chakravyuh_plots")

# The v2 LoRA aggregate is not read from the CSV: per the original author it is
# sourced from logs/eval_v2.json (n=174, n_benign=30). It is drawn as the first
# bar so the visual contrasts the trained model with the untrained Qwen base
# and the larger frontier models.
# NOTE(review): the figure suptitle in ``main`` says n=175 — confirm which
# sample count is correct.
V2_LORA: dict = dict(
    label="v2 LoRA (this work)\n7B + LoRA",
    fpr=0.0667,  # stored as a fraction; multiplied by 100 when plotted
    f1=0.990,
    color="#0a7e44",  # emerald
)

# Maps a ``provider`` value from the CSV to ``(tick label, bar colour)``.
# Tick labels are two lines: model name, then parameter count.
NICE_NAMES: dict[str, tuple[str, str]] = {
    # Non-LLM baseline (grey).
    "scripted": ("Scripted baseline\n—", "#9ca3af"),
    # Untrained base of the v2 LoRA model (amber).
    "hf-qwen2.5-7b-instruct": ("Qwen2.5-7B base\n7B (no LoRA)", "#f59e0b"),
    # Larger open-weight models (blue or red).
    # NOTE(review): what distinguishes red from blue is not stated anywhere in
    # this file — confirm the intended meaning before changing colours.
    "hf-llama-3.3-70b-instruct": ("Llama-3.3-70B\n70B", "#3b82f6"),
    "hf-qwen2.5-72b-instruct": ("Qwen2.5-72B\n72B", "#3b82f6"),
    "hf-deepseek-v3-0324": ("DeepSeek-V3\n671B MoE", "#ef4444"),
    "hf-gpt-oss-120b": ("gpt-oss-120b\n120B", "#3b82f6"),
    "hf-gemma-3-27b-it": ("gemma-3-27b\n27B", "#ef4444"),
    "hf-deepseek-r1": ("DeepSeek-R1\n671B MoE", "#3b82f6"),
}
def _load() -> list[dict]:
    """Read every data row of the frontier-comparison CSV at ``CSV_PATH``.

    Returns:
        One dict per CSV row, keyed by the header names, with values as
        produced by ``csv.DictReader``.

    Raises:
        OSError: If ``CSV_PATH`` cannot be opened (for example, it is missing).
    """
    # The csv module asks for files to be opened with newline="" so that it
    # handles line endings itself (newlines embedded in quoted fields survive).
    # The encoding is pinned so parsing does not depend on the host locale.
    with CSV_PATH.open(newline="", encoding="utf-8") as handle:
        return list(csv.DictReader(handle))
def _bars(rows: list[dict]) -> list[tuple[str, float, float, str]]:
    """Assemble the chart's bars as ``(label, fpr, f1, color)`` tuples.

    The hard-coded v2 LoRA entry always comes first. The remaining bars follow
    a fixed provider sequence; providers missing from ``rows`` are skipped,
    and providers in ``rows`` that are not in the sequence are not drawn.

    Args:
        rows: CSV rows, each with ``provider``, ``false_positive_rate`` and
            ``f1`` keys.

    Returns:
        Bars in display order. FPR and F1 are floats parsed from the row
        (FPR is not yet scaled to a percentage).
    """
    # Fixed narrative order: the untrained 7B base first and the scripted
    # baseline last. The original author describes the sequence as ascending
    # FPR, but nothing here sorts by FPR; the order is purely this tuple.
    display_order = (
        "hf-qwen2.5-7b-instruct",
        "hf-llama-3.3-70b-instruct",
        "hf-qwen2.5-72b-instruct",
        "hf-gpt-oss-120b",
        "hf-deepseek-v3-0324",
        "hf-gemma-3-27b-it",
        "hf-deepseek-r1",
        "scripted",
    )

    bars: list[tuple[str, float, float, str]] = [
        (V2_LORA["label"], V2_LORA["fpr"], V2_LORA["f1"], V2_LORA["color"]),
    ]

    # If a provider appears on several rows, the last row wins.
    row_for = {row["provider"]: row for row in rows}

    for provider in (name for name in display_order if name in row_for):
        row = row_for[provider]
        # Fall back to the raw provider id and a neutral grey if no pretty
        # name is registered.
        label, color = NICE_NAMES.get(provider, (provider, "#9ca3af"))
        fpr = float(row["false_positive_rate"])
        f1 = float(row["f1"])
        bars.append((label, fpr, f1, color))

    return bars
def _draw_panel(
    ax,
    values: list[float],
    labels: list[str],
    colors: list[str],
    *,
    ylabel: str,
    title: str,
    reference: float,
    value_format: str,
    text_offset: float,
    ylim: tuple[float, float],
) -> None:
    """Draw one bar panel (bars, tick labels, reference line, value labels).

    Args:
        ax: The matplotlib axes to draw on.
        values: Bar heights, one per model.
        labels: Tick labels, parallel to ``values``.
        colors: Bar fill colours, parallel to ``values``.
        ylabel: Y-axis label.
        title: Panel title.
        reference: Height of the dashed horizontal line marking the v2 LoRA
            value.
        value_format: ``str.format`` template for the number printed above
            each bar.
        text_offset: Vertical gap, in data units, between a bar's top and its
            printed value.
        ylim: ``(bottom, top)`` limits for the y-axis.
    """
    positions = list(range(len(values)))
    ax.bar(positions, values, color=colors, edgecolor="#1f2937", linewidth=0.6)
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=30, ha="right", fontsize=9)
    ax.set_ylabel(ylabel, fontsize=11)
    ax.set_title(title, fontsize=11)
    ax.axhline(
        y=reference,
        color=V2_LORA["color"],
        linestyle="--",
        alpha=0.5,
        linewidth=1,
    )
    for position, value in enumerate(values):
        ax.text(
            position,
            value + text_offset,
            value_format.format(value),
            ha="center",
            fontsize=8,
        )
    ax.set_ylim(*ylim)


def main() -> int:
    """Render the two-panel FPR / F1 comparison chart and save it as a PNG.

    Reads the comparison rows via ``_load``, orders them via ``_bars``, draws
    FPR (as a percentage) on the left and F1 on the right, and writes
    ``frontier_comparison_bar.png`` into ``OUT_DIR`` (created if missing).

    Returns:
        ``0`` on success, for use as the process exit status.
    """
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    rows = _load()
    bars = _bars(rows)

    labels = [bar[0] for bar in bars]
    fprs = [bar[1] * 100 for bar in bars]  # fraction -> percent
    f1s = [bar[2] for bar in bars]
    colors = [bar[3] for bar in bars]

    fig, (fpr_ax, f1_ax) = plt.subplots(1, 2, figsize=(14, 6))
    try:
        # NOTE(review): the claims in both panel titles and the suptitle are
        # static text, not derived from the CSV — re-check them whenever the
        # data changes.
        _draw_panel(
            fpr_ax,
            fprs,
            labels,
            colors,
            ylabel="False positive rate (%)",
            title=(
                "FPR — lower is better\n"
                "v2 LoRA beats every model with FPR > 6.7 % at 10× fewer params"
            ),
            reference=V2_LORA["fpr"] * 100,
            value_format="{:.1f}%",
            text_offset=0.7,
            # Headroom above the tallest bar so its value label is not clipped.
            ylim=(0, max(fprs) * 1.15 + 5),
        )
        _draw_panel(
            f1_ax,
            f1s,
            labels,
            colors,
            ylabel="F1 score",
            title=(
                "F1 — higher is better\n"
                "v2 LoRA ties Llama-3.3-70B; beats every other open-weight model"
            ),
            reference=V2_LORA["f1"],
            value_format="{:.3f}",
            text_offset=0.005,
            # Start just below the weakest model rather than at zero.
            ylim=(min(f1s) * 0.95, 1.02),
        )

        fig.suptitle(
            "Open-weight frontier comparison — n=175 same bench, same prompt\n"
            "Source: logs/frontier_comparison.csv · DeepSeek-R1 scored with reasoning-aware parser",
            fontsize=10,
        )
        fig.tight_layout()

        out = OUT_DIR / "frontier_comparison_bar.png"
        fig.savefig(out, dpi=150, bbox_inches="tight")
    finally:
        # pyplot keeps every figure alive in a global registry until closed.
        plt.close(fig)

    print(f"Wrote {out}")
    return 0
# Script entry point: run only when executed directly, and propagate main()'s
# return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())