"""Scammer LoRA Showcase โ render the trained adversary's story for the demo UI.
Reads pre-computed artifacts:
- logs/scammer_frontier_comparison.json (frontier LLMs as scammers)
- logs/scammer_significance.json (OOD parity + best-of-8 lift)
- logs/b2_phase1_scammer_vs_v2_lora.json (co-evolution gap)
Zero model dependencies โ all data is pre-computed.
"""
from __future__ import annotations

import json
from functools import lru_cache
from pathlib import Path
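
# Pre-computed artifacts live in logs/, one level above this module's directory.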
_LOGS = Path(__file__).resolve().parent.parent / "logs"


@lru_cache(maxsize=1)
def _load_frontier() -> dict | None:
    p = _LOGS / "scammer_frontier_comparison.json"
    if not p.exists():
        return None
    return json.loads(p.read_text(encoding="utf-8"))


@lru_cache(maxsize=1)
def _load_significance() -> dict | None:
    p = _LOGS / "scammer_significance.json"
    if not p.exists():
        return None
    return json.loads(p.read_text(encoding="utf-8"))


@lru_cache(maxsize=1)
def _load_coevolution() -> dict | None:
    p = _LOGS / "b2_phase1_scammer_vs_v2_lora.json"
    if not p.exists():
        return None
    return json.loads(p.read_text(encoding="utf-8"))
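
# Artifact shapes (illustrative sketch, inferred from the accessors in the
# renderers below; not a schema guarantee):
#   scammer_frontier_comparison.json  -> {"trained_scammer_reference": {...},
#                                         "frontier_results": [{"model_id": ..., "bypass_rate": ...}, ...]}
#   scammer_significance.json         -> {"scammer_lora_meta": {...},
#                                         "test_1_train_vs_held_out": {...},
#                                         "test_2_single_shot_vs_best_of_8": {...}}
#   b2_phase1_scammer_vs_v2_lora.json -> {"aggregate": {"overall": {...}}}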


def render_scammer_hero() -> str:
    """Render the headline stats card for the trained Scammer LoRA."""
    sig = _load_significance()
    # Fallback display values, overridden by the significance log when present.
    bo8 = "93.75 %"
    held_out = "100 %"
    if sig:
        meta = sig.get("scammer_lora_meta", {})
        bo8_val = meta.get("best_of_8_bypass", 0.9375)
        held_out_val = meta.get("held_out_best_of_8_bypass", 1.0)
        bo8 = f"{bo8_val * 100:.2f} %"
        held_out = f"{held_out_val * 100:.0f} %"
    return f"""
<h3>🎭 Trained Scammer LoRA — Qwen2.5-0.5B + GRPO</h3>
<p>LoRA r=16 · 200 GRPO episodes · reward = 1 − ScriptedAnalyzer.score</p>
<div>
  <div><b>Best-of-8 bypass</b><br>{bo8}<br>vs scripted defense (n=64)</div>
  <div><b>Held-out novel</b><br>{held_out}<br>32/32 on unseen categories</div>
  <div><b>Parameters</b><br>0.5B<br>beats 671B DeepSeek-V3 at evasion</div>
  <div><b>Co-evolution gap</b><br>60 pp<br>93.75 % vs scripted → 32.8 % vs v2 LoRA</div>
</div>
"""


def render_frontier_table() -> str:
    """Render the frontier-LLMs-as-scammer comparison table."""
    data = _load_frontier()
    if not data:
        return "<p>Frontier scammer data not loaded.</p>"
    trained_ref = data.get("trained_scammer_reference", {})
    bo8 = trained_ref.get("scammer_lora_phase1_best_of_8", {})
    ss = trained_ref.get("scammer_lora_phase1_single_shot", {})
    frontier = data.get("frontier_results", [])

    rows_data = []
    rows_data.append({
        "name": "Chakravyuh Scammer LoRA (best-of-8)",
        "params": "0.5B + LoRA r=16",
        "bypass": bo8.get("bypass_rate", 0.9375),
        "ci": bo8.get("wilson_95ci", [0.85, 0.975]),
        "held_out": bo8.get("held_out_rate", 1.0),
        "highlight": True,
        "caveat": False,
    })
    # Frontier models, sorted by bypass rate (descending).
    for f in sorted(frontier, key=lambda x: -x.get("bypass_rate", 0)):
        model_id = f.get("model_id", "")
        short = model_id.split("/")[-1] if "/" in model_id else model_id
        params = _model_params(short)
        # gpt-oss "bypasses" largely via safety refusals, which the scripted
        # analyzer scores as benign; flag the row so the table can caveat it.
        is_safety_refusal = "gpt-oss" in short.lower()
        display_name = f"{short} (untrained)"
        if is_safety_refusal:
            display_name += " *"
        rows_data.append({
            "name": display_name,
            "params": params,
            "bypass": f.get("bypass_rate", 0),
            "ci": f.get("wilson_95ci", [0, 0]),
            "held_out": f.get("held_out", {}).get("rate", 0),
            "highlight": False,
            "caveat": is_safety_refusal,
        })
    rows_data.append({
        "name": "Chakravyuh Scammer LoRA (single-shot)",
        "params": "0.5B + LoRA r=16",
        "bypass": ss.get("bypass_rate", 0.59375),
        "ci": ss.get("wilson_95ci", [0.471, 0.705]),
        "held_out": ss.get("held_out_rate", 0.5625),
        "highlight": True,
        "caveat": False,
    })

    def _row(d: dict) -> str:
        """Render one <tr> for the comparison table."""
        is_caveat = d.get("caveat", False)
        if d["highlight"]:
            bg = "background:#381932;color:#fff;"
        elif is_caveat:
            bg = "background:#FFF3E6;color:rgba(0,0,0,0.55);font-style:italic;"
        else:
            bg = "color:#000;"
        name_style = "font-weight:800;" if d["highlight"] else "font-weight:600;"
        ci_lo, ci_hi = d["ci"]
        return (
            f'<tr style="{bg}">'
            f'<td style="{name_style}">{d["name"]}</td>'
            f"<td>{d['params']}</td>"
            f"<td>{d['bypass'] * 100:.1f}%</td>"
            f"<td>[{ci_lo * 100:.1f}%, {ci_hi * 100:.1f}%]</td>"
            f"<td>{d['held_out'] * 100:.1f}%</td>"
            "</tr>\n"
        )
rows_html = "\n".join(_row(d) for d in rows_data)
return f"""
Frontier LLMs as Scammers โ who evades the scripted defense best?
| Scammer model |
Params |
Bypass rate |
95% CI |
Held-out |
{rows_html}
All frontier models used the same 16 attack-category prompts (8 train + 8 held-out).
* gpt-oss-120b "bypasses" at 87.5% mostly via safety refusals
— the model refuses to generate scam text, which the analyzer scores as benign.
The trained 0.5B Scammer generates actual scam text that evades keyword
rules — a fundamentally different (and harder) capability.
"""


def render_significance_panel() -> str:
    """Render the statistical-significance panel (Fisher + McNemar tests)."""
    sig = _load_significance()
    if not sig:
        return ""
    t1 = sig.get("test_1_train_vs_held_out", {})
    t2 = sig.get("test_2_single_shot_vs_best_of_8", {})
    ss = t1.get("single_shot", {})
    bo8 = t1.get("best_of_8", {})
    return f"""
<h3>Statistical evidence — the Scammer generalizes, and best-of-8 is real</h3>
<div>
  <h4>OOD Generalization (Fisher's exact)</h4>
  <p>Train vs held-out bypass rates are not significantly different:</p>
  <ul>
    <li>Single-shot: p = {ss.get('fisher_two_sided_p', 0.80):.3f}</li>
    <li>Best-of-8: p = {bo8.get('fisher_two_sided_p', 0.11):.3f}</li>
  </ul>
  <p>Large p = the Scammer generalizes to unseen categories.</p>
</div>
<div>
  <h4>Best-of-8 Lift (McNemar exact)</h4>
  <p>Best-of-8 strictly dominates single-shot:
  p ≈ {t2.get('exact_two_sided_p', 4.77e-7):.1e}.
  Discordant: {t2.get('discordant_ss_miss_bo8_hit', 22)} cases where
  best-of-8 won but single-shot lost; 0 in reverse.</p>
  <p>Small p = the lift is real, not cherry-picking.</p>
</div>
"""


def render_coevolution_panel() -> str:
    """Render the co-evolution gap panel (scripted rules vs v2 LoRA defender)."""
    data = _load_coevolution()
    if not data:
        return ""
    overall = data.get("aggregate", {}).get("overall", {})
    n = overall.get("n", 64)
    scripted_bypass = overall.get("scripted_bypass_rate", 0.9375)
    v2_bypass = overall.get("v2_bypass_rate", 0.328125)
    gap = overall.get("gap_pp", (scripted_bypass - v2_bypass) * 100)
    return f"""
<h3>Co-evolution in one number — the same scam outputs vs two defenders</h3>
<div>
  <div><b>vs Scripted rules</b><br>{scripted_bypass * 100:.1f}%<br>bypass rate</div>
  <div>↓ {gap:.0f} pp ↓<br>defensive lift</div>
  <div><b>vs v2 LoRA defender</b><br>{v2_bypass * 100:.1f}%<br>bypass rate</div>
</div>
<p>The same {n} Scammer-generated outputs were scored by both defenders.
The trained v2 Analyzer LoRA catches {gap:.0f} percentage points more than
the rule-based baseline — that gap IS the multi-agent co-evolution, measured on identical inputs.
Browse individual samples in the Adversary Lab tab.</p>
"""


def _model_params(short_name: str) -> str:
    """Map a model's short name to a human-readable parameter count."""
    mapping = {
        "Llama-3.3-70B-Instruct": "70B",
        "Qwen2.5-72B-Instruct": "72B",
        "DeepSeek-V3-0324": "671B MoE",
        "Qwen2.5-7B-Instruct": "7B",
        "gpt-oss-120b": "120B",
        "gemma-3-27b-it": "27B",
    }
    for key, val in mapping.items():
        if key.lower() in short_name.lower():
            return val
    return "—"  # unknown model size