"""Adversary Lab — browse the 64 trained-Scammer outputs vs both defenders.

Renders the B.2 Phase-1 head-to-head data from
`logs/b2_phase1_scammer_vs_v2_lora.json` as a Gradio-friendly HTML panel.
Each sample shows:

- The seed prompt that triggered the Scammer
- The actual generated scam text
- Scripted ScriptedAnalyzer's verdict (bypassed / caught)
- v2 Analyzer LoRA's verdict (score, signals, explanation)
- The asymmetry — when scripted misses but v2 catches, that IS the
  co-evolution gap made visible

This file ships zero new model dependencies — the data is pre-computed and
committed to the repo. The Adversary Lab tab is the *visible* Theme #1
demonstration: trained adversary vs trained defender, on real generated scam
text, no hand-waving.
"""

from __future__ import annotations

import html
import json
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path

LOG_PATH = (
    Path(__file__).resolve().parent.parent
    / "logs"
    / "b2_phase1_scammer_vs_v2_lora.json"
)

# Maximum number of raw characters shown when the v2 response is not parseable.
_RAW_FALLBACK_CHARS = 240


@dataclass(frozen=True)
class AdversarySample:
    """One Scammer generation together with both defenders' verdicts."""

    index: int  # position of the sample in the log's "samples" list
    seed: str  # seed prompt given to the Scammer
    split: str  # split label as stored in the log (e.g. "train", "held_out")
    completion: str  # generated scam text
    length_chars: int
    is_refusal: bool
    scripted_score: float
    scripted_caught: bool  # True when the log's "bypass" flag is falsy
    v2_score: float
    v2_caught: bool
    v2_signals: tuple[str, ...]
    v2_explanation: str


def _parse_v2_response(raw: str) -> tuple[tuple[str, ...], str]:
    """Best-effort parse of the v2 JSON response.

    Returns ``(signals, explanation)``. An empty ``raw`` yields ``((), "")``.
    When ``raw`` is not parseable as a JSON object, the signals are empty and
    the explanation is the first 240 characters of ``raw``.
    """
    if not raw:
        return ((), "")

    fallback = ((), raw[:_RAW_FALLBACK_CHARS])

    # The v2 response is JSON-formatted; sometimes wrapped in a code fence.
    text = raw.strip()
    if text.startswith("```"):
        text = text.split("```", 2)[1]
    if text.startswith("json"):
        text = text[4:]

    try:
        decoded = json.loads(text)
    except json.JSONDecodeError:
        return fallback

    # Valid JSON that is not an object (list, string, number) has no .get();
    # treat it like an unparseable response instead of raising.
    if not isinstance(decoded, dict):
        return fallback

    signals = decoded.get("signals", []) or []
    if not isinstance(signals, list):
        signals = [str(signals)]
    explanation = str(decoded.get("explanation", "") or "")
    return (tuple(str(sig) for sig in signals), explanation)


@lru_cache(maxsize=1)
def load_samples() -> tuple[AdversarySample, ...]:
    """Load and cache every sample from ``LOG_PATH``.

    Returns an empty tuple when the log file does not exist or its top-level
    JSON value is not an object. The result is cached; call
    ``load_samples.cache_clear()`` to force a re-read.
    """
    if not LOG_PATH.exists():
        return ()

    payload = json.loads(LOG_PATH.read_text(encoding="utf-8"))
    if not isinstance(payload, dict):
        return ()

    out: list[AdversarySample] = []
    for i, s in enumerate(payload.get("samples") or []):
        # Scripted bypass = True means scripted *missed*; caught = NOT bypass.
        scripted_caught = not bool(s.get("bypass", False))
        if "scripted_score" in s:
            scripted_score = float(s["scripted_score"])
        else:
            # No explicit score logged: derive it from the reward.
            scripted_score = round(1.0 - float(s.get("reward", 0.5)), 3)

        v2_signals, v2_explanation = _parse_v2_response(s.get("v2_raw_response", ""))
        out.append(
            AdversarySample(
                index=i,
                seed=str(s.get("seed", "")),
                split=str(s.get("split", "")),
                completion=str(s.get("completion", "")),
                length_chars=int(s.get("length_chars", len(s.get("completion", "")))),
                is_refusal=bool(s.get("is_refusal", False)),
                scripted_score=scripted_score,
                scripted_caught=scripted_caught,
                v2_score=float(s.get("v2_score", 0.0)),
                v2_caught=bool(s.get("v2_caught", False)),
                v2_signals=v2_signals,
                v2_explanation=v2_explanation,
            )
        )
    return tuple(out)


def aggregate_stats() -> dict[str, object]:
    """Summarise the loaded samples.

    Returns ``{"n": 0}`` when no samples are loaded. Otherwise returns
    ``{"n", "by_split", "cells"}`` where ``by_split`` maps each split label to
    its ``n`` / ``scripted_caught`` / ``v2_caught`` counts and ``cells`` is the
    2x2 cross-tab of the two defenders' verdicts.
    """
    samples = load_samples()
    if not samples:
        return {"n": 0}

    by_split: dict[str, dict[str, int]] = {}
    cells = {
        "both_caught": 0,
        "only_scripted_missed": 0,
        "only_v2_missed": 0,
        "both_missed": 0,
    }
    for s in samples:
        slot = by_split.setdefault(s.split, {"n": 0, "scripted_caught": 0, "v2_caught": 0})
        slot["n"] += 1
        slot["scripted_caught"] += int(s.scripted_caught)
        slot["v2_caught"] += int(s.v2_caught)

        if s.scripted_caught and s.v2_caught:
            cells["both_caught"] += 1
        elif (not s.scripted_caught) and s.v2_caught:
            cells["only_scripted_missed"] += 1
        elif s.scripted_caught and (not s.v2_caught):
            cells["only_v2_missed"] += 1
        else:
            cells["both_missed"] += 1
    return {"n": len(samples), "by_split": by_split, "cells": cells}


def sample_choice_labels() -> list[tuple[str, int]]:
    """Return ``(label, index)`` pairs, one per loaded sample.

    Each label shows the index, split, both verdicts as a tick or cross, and
    the first 55 characters of the seed (newlines flattened to spaces).
    """
    out: list[tuple[str, int]] = []
    for s in load_samples():
        scripted_tag = "✓" if s.scripted_caught else "✗"
        v2_tag = "✓" if s.v2_caught else "✗"
        seed_short = s.seed[:55].replace("\n", " ")
        if len(s.seed) > 55:
            seed_short += "…"
        label = (
            f"#{s.index:02d} [{s.split:8s}] scripted {scripted_tag} · "
            f"v2 {v2_tag} — {seed_short}"
        )
        out.append((label, s.index))
    return out


def _escape(text: str, *, breaks: bool = False) -> str:
    """HTML-escape ``&``, ``<`` and ``>``; optionally turn newlines into ``<br>``."""
    escaped = html.escape(text, quote=False)
    return escaped.replace("\n", "<br>") if breaks else escaped


def _verdict_pill(caught: bool) -> str:
    """Return the CAUGHT / BYPASSED pill for one defender's verdict."""
    if caught:
        return "<span>CAUGHT</span>"
    return "<span>BYPASSED</span>"


def _bypass_row(name: str, n: int, sc: int, v2c: int) -> str:
    """Render one table row of bypass rates from caught counts.

    ``sc`` and ``v2c`` are the numbers of samples *caught* by the scripted and
    v2 defenders; the row shows the complementary bypass percentages and their
    difference in percentage points.
    """
    sc_pct = 100 * (1 - sc / n) if n else 0.0
    v2_pct = 100 * (1 - v2c / n) if n else 0.0
    gap = sc_pct - v2_pct
    return (
        "<tr>"
        f"<td>{_escape(name)}</td>"
        f"<td>{n}</td>"
        f"<td>{sc_pct:.1f}%</td>"
        f"<td>{v2_pct:.1f}%</td>"
        # The '+' format flag prints the real sign; a hard-coded '+' prefix
        # would render a negative gap as '+-x.x pp'.
        f"<td>{gap:+.1f} pp</td>"
        "</tr>"
    )


def render_aggregate_banner() -> str:
    """Render the head-to-head summary table and cross-tab as HTML.

    Returns a short notice instead when no samples are loaded.
    """
    stats = aggregate_stats()
    if stats["n"] == 0:
        return (
            "<div>Adversary Lab data not loaded — "
            "`logs/b2_phase1_scammer_vs_v2_lora.json` missing.</div>"
        )

    cells = stats["cells"]
    by_split = stats["by_split"]
    total = stats["n"]
    empty = {"n": 0, "scripted_caught": 0, "v2_caught": 0}
    train = by_split.get("train", empty)
    held = by_split.get("held_out", empty)

    # Sum over every split so the overall counts match `total`, which includes
    # samples whose split is neither "train" nor "held_out".
    overall_sc = sum(slot["scripted_caught"] for slot in by_split.values())
    overall_v2 = sum(slot["v2_caught"] for slot in by_split.values())

    rows = "".join(
        (
            _bypass_row("Train", train["n"], train["scripted_caught"], train["v2_caught"]),
            _bypass_row(
                "Held-out (novel)", held["n"], held["scripted_caught"], held["v2_caught"]
            ),
            _bypass_row("Overall", total, overall_sc, overall_v2),
        )
    )
    return (
        "<div>"
        "<div>B.2 Phase-1 head-to-head — same Scammer outputs vs both defenders "
        f"(n={total})</div>"
        "<table>"
        "<thead><tr>"
        "<th>Split</th><th>n</th><th>Scripted bypass</th>"
        "<th>v2 LoRA bypass</th><th>Gap</th>"
        "</tr></thead>"
        f"<tbody>{rows}</tbody>"
        "</table>"
        "<div>"
        f"Cross-tab: {cells['both_caught']} both caught · "
        f"{cells['only_scripted_missed']} only-scripted-missed (the co-evolution wins) · "
        f"{cells['only_v2_missed']} only-v2-missed (v3 targets) · "
        f"{cells['both_missed']} both missed"
        "</div>"
        "</div>"
    )


def render_sample(index: int) -> str:
    """Render one sample (seed, completion, both verdicts) as HTML.

    Returns an out-of-range notice when ``index`` is not a valid position in
    the loaded samples. All log-derived text is HTML-escaped before it is
    interpolated into the markup.
    """
    samples = load_samples()
    if not isinstance(index, int) or not 0 <= index < len(samples):
        return "<div>Sample index out of range.</div>"

    s = samples[index]
    seed_html = _escape(s.seed, breaks=True)
    completion_html = _escape(s.completion, breaks=True)
    explanation_html = _escape(s.v2_explanation)
    signals_html = " ".join(f"<span>{_escape(sig)}</span>" for sig in s.v2_signals)
    if not signals_html:
        signals_html = "(no signals declared)"

    asymmetry_note = ""
    if (not s.scripted_caught) and s.v2_caught:
        asymmetry_note = (
            "<div>"
            "This is the co-evolution win. The trained Scammer evaded the "
            "rule-based detector — but the v2 Analyzer LoRA (trained on the env's 8-rubric "
            "reward) catches it. This is the kind of pair that the +60 pp head-to-head gap "
            "is built from."
            "</div>"
        )
    elif s.scripted_caught and (not s.v2_caught):
        asymmetry_note = (
            "<div>"
            "v3 target. Scripted's keyword rules caught this, but our "
            "v2 LoRA missed it — typically a non-bank category outside v2's training "
            "distribution. Phase-2 LoRA-vs-LoRA retrain (queued for onsite GPU) closes "
            "exactly these cases."
            "</div>"
        )

    refusal_tag = " · REFUSAL" if s.is_refusal else ""
    return (
        "<div>"
        f"<div>Sample #{s.index} · split = {_escape(s.split)} · "
        f"{s.length_chars} chars {refusal_tag}</div>"
        "<div>Seed prompt (what the trained Scammer was asked to write)</div>"
        f"<div>{seed_html}</div>"
        "<div>Generated scam text (Qwen2.5-0.5B + LoRA, after 200 GRPO episodes)</div>"
        f"<div>{completion_html}</div>"
        "<div>Defender 1 — rule-based ScriptedAnalyzer</div>"
        f"<div>Score: {s.scripted_score:.2f} · {_verdict_pill(s.scripted_caught)}</div>"
        "<div>Defender 2 — chakravyuh-analyzer-lora-v2 (trained)</div>"
        f"<div>Score: {s.v2_score:.2f} · {_verdict_pill(s.v2_caught)}</div>"
        "<div>Signals declared:</div>"
        f"<div>{signals_html}</div>"
        "<div>Explanation:</div>"
        f"<div>{explanation_html or '(no parseable explanation)'}</div>"
        f"{asymmetry_note}"
        "</div>"
    )