"""Adversary Lab — browse the 64 trained-Scammer outputs vs both defenders.

Renders the B.2 Phase-1 head-to-head data from
`logs/b2_phase1_scammer_vs_v2_lora.json` as a Gradio-friendly HTML panel.
Each sample shows:

  - The seed prompt that triggered the Scammer
  - The actual generated scam text
  - ScriptedAnalyzer's verdict (bypassed / caught)
  - v2 Analyzer LoRA's verdict (score, signals, explanation)
  - The asymmetry — when scripted misses but v2 catches, that IS the
    co-evolution gap made visible

This file ships zero new model dependencies — the data is pre-computed and
committed to the repo. The Adversary Lab tab is the *visible* Theme #1
demonstration: trained adversary vs trained defender, on real generated
scam text, no hand-waving.
"""

from __future__ import annotations

import json
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path

LOG_PATH = Path(__file__).resolve().parent.parent / "logs" / "b2_phase1_scammer_vs_v2_lora.json"

# Maximum number of characters of an unparseable v2 response kept as the
# fallback explanation.
_RAW_FALLBACK_CHARS = 240

# Maximum number of seed characters shown in a dropdown label.
_SEED_LABEL_CHARS = 55

# Maps (scripted_caught, v2_caught) to the confusion-cell name it counts under.
_CELL_BY_OUTCOME = {
    (True, True): "both_caught",
    (False, True): "only_scripted_missed",
    (True, False): "only_v2_missed",
    (False, False): "both_missed",
}


@dataclass(frozen=True)
class AdversarySample:
    """One Scammer generation together with both defenders' verdicts."""

    index: int  # position of the record in the log's "samples" list
    seed: str  # record's "seed" field
    split: str  # record's "split" field
    completion: str  # record's "completion" field
    length_chars: int  # "length_chars" if logged, else len(completion)
    is_refusal: bool
    scripted_score: float  # "scripted_score" if logged, else 1 - reward
    scripted_caught: bool  # negation of the record's "bypass" flag
    v2_score: float
    v2_caught: bool
    v2_signals: tuple[str, ...]  # parsed from "v2_raw_response"
    v2_explanation: str  # parsed from "v2_raw_response"


def _parse_v2_response(raw: str) -> tuple[tuple[str, ...], str]:
    """Best-effort parse of the v2 JSON response.

    Args:
        raw: The response text, optionally wrapped in a Markdown code fence.

    Returns:
        ``(signals, explanation)``. An empty ``raw`` yields ``((), "")``.
        If the text is not valid JSON, or decodes to something other than a
        JSON object, the result is ``((), raw[:240])`` so the caller can
        still display the beginning of the raw text.
    """
    if not raw:
        return ((), "")

    fallback = ((), raw[:_RAW_FALLBACK_CHARS])

    text = raw.strip()
    if text.startswith("```"):
        # Keep what sits between the opening fence and the next fence.
        text = text.split("```", 2)[1]
    # Drop a leading "json" language tag; valid JSON can never start with it.
    if text[:4].lower() == "json":
        text = text[4:]

    try:
        decoded = json.loads(text)
    except json.JSONDecodeError:
        return fallback
    if not isinstance(decoded, dict):
        # Valid JSON but not an object (list, number, string, ...): there
        # are no "signals"/"explanation" keys to read.
        return fallback

    signals = decoded.get("signals", []) or []
    if not isinstance(signals, list):
        signals = [str(signals)]
    explanation = str(decoded.get("explanation", "") or "")
    return (tuple(str(item) for item in signals), explanation)


def _build_sample(index: int, record: dict) -> AdversarySample:
    """Convert one raw log record into an ``AdversarySample``."""
    # "bypass" = True means the scripted analyzer *missed*; caught = NOT bypass.
    scripted_caught = not bool(record.get("bypass", False))

    # Only derive the score from the reward when no score was logged, so an
    # unusable "reward" cannot break a record that carries its own score.
    if "scripted_score" in record:
        scripted_score = float(record["scripted_score"])
    else:
        scripted_score = round(1.0 - float(record.get("reward", 0.5)), 3)

    completion = str(record.get("completion", ""))
    if "length_chars" in record:
        length_chars = int(record["length_chars"])
    else:
        length_chars = len(completion)

    v2_signals, v2_explanation = _parse_v2_response(record.get("v2_raw_response", ""))

    return AdversarySample(
        index=index,
        seed=str(record.get("seed", "")),
        split=str(record.get("split", "")),
        completion=completion,
        length_chars=length_chars,
        is_refusal=bool(record.get("is_refusal", False)),
        scripted_score=scripted_score,
        scripted_caught=scripted_caught,
        v2_score=float(record.get("v2_score", 0.0)),
        v2_caught=bool(record.get("v2_caught", False)),
        v2_signals=v2_signals,
        v2_explanation=v2_explanation,
    )


@lru_cache(maxsize=1)
def load_samples() -> tuple[AdversarySample, ...]:
    """Load every sample from ``LOG_PATH``; the result is cached.

    Returns:
        One ``AdversarySample`` per entry of the log's ``"samples"`` list, in
        file order. An empty tuple is returned when the log file is missing,
        when its top-level value is not a JSON object, or when ``"samples"``
        is absent, empty, or not a list.

    Raises:
        json.JSONDecodeError: If the log file exists but is not valid JSON.
    """
    try:
        payload = json.loads(LOG_PATH.read_text(encoding="utf-8"))
    except FileNotFoundError:
        return ()
    if not isinstance(payload, dict):
        return ()

    records = payload.get("samples") or []
    if not isinstance(records, list):
        return ()
    return tuple(_build_sample(i, record) for i, record in enumerate(records))


def aggregate_stats() -> dict[str, object]:
    """Summarise caught/missed counts over all loaded samples.

    Returns:
        ``{"n": 0}`` when there are no samples. Otherwise a dict with:

        - ``"n"``: the number of samples,
        - ``"by_split"``: per-split ``n`` / ``scripted_caught`` /
          ``v2_caught`` counters,
        - ``"cells"``: the four-way breakdown ``both_caught``,
          ``only_scripted_missed``, ``only_v2_missed``, ``both_missed``.
    """
    samples = load_samples()
    if not samples:
        return {"n": 0}

    by_split: dict[str, dict[str, int]] = {}
    cells = {"both_caught": 0, "only_scripted_missed": 0, "only_v2_missed": 0, "both_missed": 0}
    for sample in samples:
        slot = by_split.setdefault(sample.split, {"n": 0, "scripted_caught": 0, "v2_caught": 0})
        slot["n"] += 1
        slot["scripted_caught"] += int(sample.scripted_caught)
        slot["v2_caught"] += int(sample.v2_caught)
        cells[_CELL_BY_OUTCOME[(bool(sample.scripted_caught), bool(sample.v2_caught))]] += 1
    return {"n": len(samples), "by_split": by_split, "cells": cells}


def sample_choice_labels() -> list[tuple[str, int]]:
    """Build ``(label, index)`` pairs, one per loaded sample.

    Each label carries the sample index, its split, a check/cross mark per
    defender, and the first 55 characters of the seed (newlines replaced by
    spaces, with an ellipsis appended when the seed was truncated).
    """
    out: list[tuple[str, int]] = []
    for s in load_samples():
        scripted_tag = "✓" if s.scripted_caught else "✗"
        v2_tag = "✓" if s.v2_caught else "✗"
        seed_short = s.seed[:_SEED_LABEL_CHARS].replace("\n", " ")
        if len(s.seed) > _SEED_LABEL_CHARS:
            seed_short += "…"
        label = f"#{s.index:02d} [{s.split:8s}] scripted {scripted_tag} · v2 {v2_tag} — {seed_short}"
        out.append((label, s.index))
    return out


def _verdict_pill(caught: bool) -> str:
    """Return the verdict text for a defender: CAUGHT or BYPASSED."""
    # NOTE(review): the literals look like they lost surrounding markup in
    # this view of the file — confirm against the repository before relying
    # on the exact strings.
    if caught:
        return (
            'CAUGHT'
        )
    return (
        'BYPASSED'
    )


# NOTE(review): the definition that follows is cut off in this view (its
# string literal is missing), so it is preserved verbatim below instead of
# being rewritten. Restore it from the repository copy.
#
# def render_aggregate_banner() -> str:
#     stats = aggregate_stats()
#     if stats["n"] == 0:
#         return '
| Split | n | Scripted bypass | v2 LoRA bypass | Gap |
|---|
ScriptedAnalyzer
chakravyuh-analyzer-lora-v2 (trained)