"""Scammer LoRA Showcase — render the trained adversary's story for the demo UI. Reads pre-computed artifacts: - logs/scammer_frontier_comparison.json (frontier LLMs as scammers) - logs/scammer_significance.json (OOD parity + best-of-8 lift) - logs/b2_phase1_scammer_vs_v2_lora.json (co-evolution gap) Zero model dependencies — all data is pre-computed. """ from __future__ import annotations import json from functools import lru_cache from pathlib import Path _LOGS = Path(__file__).resolve().parent.parent / "logs" @lru_cache(maxsize=1) def _load_frontier() -> dict | None: p = _LOGS / "scammer_frontier_comparison.json" if not p.exists(): return None return json.loads(p.read_text(encoding="utf-8")) @lru_cache(maxsize=1) def _load_significance() -> dict | None: p = _LOGS / "scammer_significance.json" if not p.exists(): return None return json.loads(p.read_text(encoding="utf-8")) @lru_cache(maxsize=1) def _load_coevolution() -> dict | None: p = _LOGS / "b2_phase1_scammer_vs_v2_lora.json" if not p.exists(): return None return json.loads(p.read_text(encoding="utf-8")) def render_scammer_hero() -> str: sig = _load_significance() bo8 = "93.75 %" held_out = "100 %" if sig: meta = sig.get("scammer_lora_meta", {}) bo8_val = meta.get("best_of_8_bypass", 0.9375) held_out_val = meta.get("held_out_best_of_8_bypass", 1.0) bo8 = f"{bo8_val * 100:.2f} %" held_out = f"{held_out_val * 100:.0f} %" return f"""

🎭

Trained Scammer LoRA — Qwen2.5-0.5B + GRPO

LoRA r=16 · 200 GRPO episodes · reward = 1 − ScriptedAnalyzer.score

Best-of-8 bypass

{bo8}

vs scripted defense (n=64)

Held-out novel

{held_out}

32/32 on unseen categories

Parameters

0.5B

beats 671B DeepSeek-V3 at evasion

Co-evolution gap

60 pp

93.75 % vs scripted → 32.8 % vs v2 LoRA

""" def render_frontier_table() -> str: """Render the frontier-LLMs-as-scammer comparison table.""" data = _load_frontier() if not data: return '

Frontier scammer data not loaded.

' trained_ref = data.get("trained_scammer_reference", {}) bo8 = trained_ref.get("scammer_lora_phase1_best_of_8", {}) ss = trained_ref.get("scammer_lora_phase1_single_shot", {}) frontier = data.get("frontier_results", []) rows_data = [] rows_data.append({ "name": "Chakravyuh Scammer LoRA (best-of-8)", "params": "0.5B + LoRA r=16", "bypass": bo8.get("bypass_rate", 0.9375), "ci": bo8.get("wilson_95ci", [0.85, 0.975]), "held_out": bo8.get("held_out_rate", 1.0), "highlight": True, "caveat": False, }) for f in sorted(frontier, key=lambda x: -x.get("bypass_rate", 0)): model_id = f.get("model_id", "") short = model_id.split("/")[-1] if "/" in model_id else model_id params = _model_params(short) is_safety_refusal = "gpt-oss" in short.lower() display_name = f"{short} (untrained)" if is_safety_refusal: display_name += " *" rows_data.append({ "name": display_name, "params": params, "bypass": f.get("bypass_rate", 0), "ci": f.get("wilson_95ci", [0, 0]), "held_out": f.get("held_out", {}).get("rate", 0), "highlight": False, "caveat": is_safety_refusal, }) rows_data.append({ "name": "Chakravyuh Scammer LoRA (single-shot)", "params": "0.5B + LoRA r=16", "bypass": ss.get("bypass_rate", 0.59375), "ci": ss.get("wilson_95ci", [0.471, 0.705]), "held_out": ss.get("held_out_rate", 0.5625), "highlight": True, "caveat": False, }) def _row(d: dict) -> str: is_caveat = d.get("caveat", False) if d["highlight"]: bg = "background:#381932;color:#fff;" elif is_caveat: bg = "background:#FFF3E6;color:rgba(0,0,0,0.55);font-style:italic;" else: bg = "color:#000;" name_style = "font-weight:800;" if d["highlight"] else "font-weight:600;" ci_lo, ci_hi = d["ci"] return ( f"" f"{d['name']}" f"{d['params']}" f"{d['bypass'] * 100:.1f}%" f"" f"[{ci_lo * 100:.1f}%, {ci_hi * 100:.1f}%]" f"{d['held_out'] * 100:.1f}%" "" ) rows_html = "\n".join(_row(d) for d in rows_data) return f"""

Frontier LLMs as Scammers — who evades the scripted defense best?

{rows_html}

Scammer model	Params	Bypass rate	95% CI	Held-out

All frontier models used the same 16 attack-category prompts (8 train + 8 held-out). * gpt-oss-120b "bypasses" at 87.5% mostly via safety refusals — the model refuses to generate scam text, which the analyzer scores as benign. The trained 0.5B Scammer generates actual scam text that evades keyword rules — a fundamentally different (and harder) capability.

""" def render_significance_panel() -> str: sig = _load_significance() if not sig: return "" t1 = sig.get("test_1_train_vs_held_out", {}) t2 = sig.get("test_2_single_shot_vs_best_of_8", {}) ss = t1.get("single_shot", {}) bo8 = t1.get("best_of_8", {}) return f"""

Statistical evidence — the Scammer generalizes, and best-of-8 is real

OOD Generalization (Fisher's exact)

Train vs held-out bypass rates are not significantly different:
Single-shot: p = {ss.get('fisher_two_sided_p', 0.80):.3f}
Best-of-8: p = {bo8.get('fisher_two_sided_p', 0.11):.3f}
Large p = the Scammer generalizes to unseen categories.

Best-of-8 Lift (McNemar exact)

Best-of-8 strictly dominates single-shot:
p ≈ {t2.get('exact_two_sided_p', 4.77e-7):.1e}
Discordant: {t2.get('discordant_ss_miss_bo8_hit', 22)} cases where best-of-8 won but single-shot lost; 0 in reverse.
Small p = the lift is real, not cherry-picking.

""" def render_coevolution_panel() -> str: data = _load_coevolution() if not data: return "" overall = data.get("aggregate", {}).get("overall", {}) n = overall.get("n", 64) scripted_bypass = overall.get("scripted_bypass_rate", 0.9375) v2_bypass = overall.get("v2_bypass_rate", 0.328125) gap = overall.get("gap_pp", (scripted_bypass - v2_bypass) * 100) return f"""

Co-evolution in one number — the same scam outputs vs two defenders

vs Scripted rules

        {scripted_bypass * 100:.1f}%

bypass rate

→ {gap:.0f} pp →

defensive lift

vs v2 LoRA defender

        {v2_bypass * 100:.1f}%

bypass rate

The same {n} Scammer-generated outputs were scored by both defenders. The trained v2 Analyzer LoRA catches {gap:.0f} percentage points more than the rule-based baseline — that gap IS the multi-agent co-evolution, measured on identical inputs. Browse individual samples in the Adversary Lab tab.

""" def _model_params(short_name: str) -> str: mapping = { "Llama-3.3-70B-Instruct": "70B", "Qwen2.5-72B-Instruct": "72B", "DeepSeek-V3-0324": "671B MoE", "Qwen2.5-7B-Instruct": "7B", "gpt-oss-120b": "120B", "gemma-3-27b-it": "27B", } for key, val in mapping.items(): if key.lower() in short_name.lower(): return val return "—"