Spaces:

ujjwalpardeshi
/

chakravyuh

Running

UjjwalPardeshi

deploy: latest main to HF Space

03815d6 13 days ago

12.9 kB

	"""Adversary Lab — browse the 64 trained-Scammer outputs vs both defenders.

	Renders the B.2 Phase-1 head-to-head data from
	`logs/b2_phase1_scammer_vs_v2_lora.json` as a Gradio-friendly HTML
	panel. Each sample shows:

	- The seed prompt that triggered the Scammer
	- The actual generated scam text
	- The rule-based ScriptedAnalyzer's verdict (bypassed / caught)
	- v2 Analyzer LoRA's verdict (score, signals, explanation)
	- The asymmetry — when scripted misses but v2 catches, that IS the
	co-evolution gap made visible

	This file ships zero new model dependencies — the data is pre-computed
	and committed to the repo. The Adversary Lab tab is the visible
	Theme #1 demonstration: trained adversary vs trained defender, on real
	generated scam text, no hand-waving.
	"""

	from __future__ import annotations

	import json
	from dataclasses import dataclass
	from functools import lru_cache
	from pathlib import Path

	# Location of the pre-computed head-to-head log, resolved relative to this
	# file: the parent of this file's directory, then `logs/<name>.json`.
	LOG_PATH = Path(__file__).resolve().parent.parent / "logs" / "b2_phase1_scammer_vs_v2_lora.json"


	@dataclass(frozen=True)
	class AdversarySample:
	    """Immutable record of one Scammer generation and both defenders' verdicts.

	    Field order is part of the interface (positional construction), so the
	    fields are only grouped by comments, never reordered.
	    """

	    # --- the generation itself ---
	    index: int  # position of the sample within the loaded sequence
	    seed: str  # seed prompt given to the Scammer
	    split: str  # dataset split label
	    completion: str  # generated scam text
	    length_chars: int  # length of the completion in characters
	    is_refusal: bool  # True when the generation was flagged as a refusal

	    # --- rule-based (scripted) defender ---
	    scripted_score: float
	    scripted_caught: bool

	    # --- trained v2 LoRA defender ---
	    v2_score: float
	    v2_caught: bool
	    v2_signals: tuple[str, ...]  # signal names declared by the v2 response
	    v2_explanation: str  # free-text explanation from the v2 response


	def _parse_v2_response(raw: str) -> tuple[tuple[str, ...], str]:
	"""Best-effort parse of the v2 JSON response. Returns (signals, explanation)."""
	if not raw:
	return ((), "")
	try:
	# The v2 response is JSON-formatted; sometimes wrapped in code-fence
	text = raw.strip()
	if text.startswith("```"):
	text = text.split("```", 2)[1]
	if text.startswith("json"):
	text = text[4:]
	decoded = json.loads(text)
	signals = decoded.get("signals", []) or []
	if not isinstance(signals, list):
	signals = [str(signals)]
	explanation = str(decoded.get("explanation", "") or "")
	return (tuple(str(s) for s in signals), explanation)
	except (json.JSONDecodeError, IndexError, KeyError):
	return ((), raw[:240])


	@lru_cache(maxsize=1)
	def load_samples() -> tuple[AdversarySample, ...]:
	    """Load and normalise every sample from the log file at ``LOG_PATH``.

	    Returns:
	        A tuple of ``AdversarySample`` in file order, or an empty tuple when
	        the log file does not exist. The result is memoised by ``lru_cache``,
	        so the file is read at most once per process (the empty result is
	        cached too).
	    """
	    if not LOG_PATH.exists():
	        return ()
	    raw = json.loads(LOG_PATH.read_text(encoding="utf-8"))
	    samples_raw = raw.get("samples") or []
	    out: list[AdversarySample] = []
	    for i, s in enumerate(samples_raw):
	        # Scripted bypass = True means scripted missed; caught = NOT bypass.
	        scripted_caught = not bool(s.get("bypass", False))

	        # Prefer the recorded score; otherwise derive it from the reward.
	        # The reward is only parsed when it is actually needed.
	        if "scripted_score" in s:
	            scripted_score = float(s["scripted_score"])
	        else:
	            scripted_score = round(1.0 - float(s.get("reward", 0.5)), 3)

	        completion = str(s.get("completion", ""))
	        # Fall back to the measured length only when none was recorded.
	        if "length_chars" in s:
	            length_chars = int(s["length_chars"])
	        else:
	            length_chars = len(completion)

	        v2_signals, v2_explanation = _parse_v2_response(s.get("v2_raw_response", ""))
	        out.append(
	            AdversarySample(
	                index=i,
	                seed=str(s.get("seed", "")),
	                split=str(s.get("split", "")),
	                completion=completion,
	                length_chars=length_chars,
	                is_refusal=bool(s.get("is_refusal", False)),
	                scripted_score=scripted_score,
	                scripted_caught=scripted_caught,
	                v2_score=float(s.get("v2_score", 0.0)),
	                v2_caught=bool(s.get("v2_caught", False)),
	                v2_signals=v2_signals,
	                v2_explanation=v2_explanation,
	            )
	        )
	    return tuple(out)


	def aggregate_stats() -> dict[str, object]:
	    """Summarise the loaded samples per split and as a 2x2 verdict cross-tab.

	    Returns:
	        ``{"n": 0}`` when no samples are loaded. Otherwise a dict with
	        ``"n"`` (sample count), ``"by_split"`` (per-split ``n``,
	        ``scripted_caught`` and ``v2_caught`` counters) and ``"cells"``
	        (counts for the four scripted/v2 verdict combinations).
	    """
	    loaded = load_samples()
	    if not loaded:
	        return {"n": 0}

	    # (scripted caught?, v2 caught?) -> name of the cross-tab cell.
	    cell_name = {
	        (True, True): "both_caught",
	        (False, True): "only_scripted_missed",
	        (True, False): "only_v2_missed",
	        (False, False): "both_missed",
	    }
	    cells = {"both_caught": 0, "only_scripted_missed": 0, "only_v2_missed": 0, "both_missed": 0}
	    by_split: dict[str, dict[str, int]] = {}

	    for sample in loaded:
	        if sample.split not in by_split:
	            by_split[sample.split] = {"n": 0, "scripted_caught": 0, "v2_caught": 0}
	        counters = by_split[sample.split]
	        counters["n"] += 1
	        counters["scripted_caught"] += int(sample.scripted_caught)
	        counters["v2_caught"] += int(sample.v2_caught)

	        verdicts = (bool(sample.scripted_caught), bool(sample.v2_caught))
	        cells[cell_name[verdicts]] += 1

	    return {"n": len(loaded), "by_split": by_split, "cells": cells}


	def sample_choice_labels() -> list[tuple[str, int]]:
	    """Build ``(label, index)`` pairs, one per loaded sample.

	    Each label shows the zero-padded index, the split, a check/cross mark
	    per defender, and the first 55 characters of the seed (newlines
	    flattened to spaces, with an ellipsis when the seed is longer).
	    """

	    def _mark(caught: bool) -> str:
	        return "✓" if caught else "✗"

	    labels: list[tuple[str, int]] = []
	    for sample in load_samples():
	        preview = sample.seed[:55].replace("\n", " ")
	        if len(sample.seed) > 55:
	            preview = preview + "…"
	        scripted_tag = _mark(sample.scripted_caught)
	        v2_tag = _mark(sample.v2_caught)
	        text = f"#{sample.index:02d} [{sample.split:8s}] scripted {scripted_tag} · v2 {v2_tag} — {preview}"
	        labels.append((text, sample.index))
	    return labels


	def _verdict_pill(caught: bool) -> str:
	if caught:
	return (
	'<span style="display:inline-block;padding:3px 10px;border-radius:999px;'
	'background:#e8f5e9;color:#1b5e20;font-weight:700;font-size:12px;">CAUGHT</span>'
	)
	return (
	'<span style="display:inline-block;padding:3px 10px;border-radius:999px;'
	'background:#ffebee;color:#b71c1c;font-weight:700;font-size:12px;">BYPASSED</span>'
	)


	def render_aggregate_banner() -> str:
	    """Render the head-to-head summary (bypass-rate table + cross-tab) as HTML.

	    Returns:
	        An HTML fragment. When ``aggregate_stats()`` reports zero samples, a
	        short "data not loaded" notice is returned instead of the table.

	    The table has one row for the ``train`` split, one for ``held_out`` and
	    an "Overall" row. The overall row sums the caught counts over every
	    split present in the data, so its numerator always matches the total
	    ``n`` it is divided by.
	    """
	    stats = aggregate_stats()
	    if stats["n"] == 0:
	        return '<div style="color:#b71c1c;">Adversary Lab data not loaded — `logs/b2_phase1_scammer_vs_v2_lora.json` missing.</div>'
	    cells = stats["cells"]
	    by_split = stats["by_split"]
	    total = stats["n"]
	    train = by_split.get("train", {"n": 0, "scripted_caught": 0, "v2_caught": 0})
	    held = by_split.get("held_out", {"n": 0, "scripted_caught": 0, "v2_caught": 0})
	    # Sum over all splits (not just train + held_out) to stay consistent
	    # with `total`, which counts every sample regardless of split.
	    overall_scripted_caught = sum(slot["scripted_caught"] for slot in by_split.values())
	    overall_v2_caught = sum(slot["v2_caught"] for slot in by_split.values())

	    def _row(name: str, n: int, sc: int, v2c: int) -> str:
	        """Format one table row; `sc` / `v2c` are caught counts out of `n`."""
	        # Bypass rate is the complement of the caught rate.
	        sc_pct = 100 * (1 - sc / n) if n else 0
	        v2_pct = 100 * (1 - v2c / n) if n else 0
	        gap = sc_pct - v2_pct
	        # The `+` format flag emits the sign itself, so a negative gap is
	        # shown as "-x.x" rather than "+-x.x".
	        return (
	            f"<tr><td style='padding:4px 12px;'>{name}</td>"
	            f"<td style='padding:4px 12px;text-align:right;'>{n}</td>"
	            f"<td style='padding:4px 12px;text-align:right;color:#b71c1c;font-weight:700;'>{sc_pct:.1f}%</td>"
	            f"<td style='padding:4px 12px;text-align:right;color:#1b5e20;font-weight:700;'>{v2_pct:.1f}%</td>"
	            f"<td style='padding:4px 12px;text-align:right;font-weight:700;'>{gap:+.1f} pp</td></tr>"
	        )

	    train_row = _row("Train", train['n'], train['scripted_caught'], train['v2_caught'])
	    held_row = _row("Held-out (novel)", held['n'], held['scripted_caught'], held['v2_caught'])
	    overall_row = _row("Overall", total, overall_scripted_caught, overall_v2_caught)

	    return f"""
	<div style="background:#FFF3E6;border:1px solid #381932;border-radius:8px;padding:14px 18px;margin:6px 0 14px;">
	<div style="font-weight:700;font-size:15px;margin-bottom:8px;color:#381932;">
	B.2 Phase-1 head-to-head — same Scammer outputs vs both defenders (n={total})
	</div>
	<table style="border-collapse:collapse;font-size:13px;width:100%;">
	<thead>
	<tr style="border-bottom:1px solid #381932;color:#381932;">
	<th style="padding:4px 12px;text-align:left;">Split</th>
	<th style="padding:4px 12px;text-align:right;">n</th>
	<th style="padding:4px 12px;text-align:right;">Scripted bypass</th>
	<th style="padding:4px 12px;text-align:right;">v2 LoRA bypass</th>
	<th style="padding:4px 12px;text-align:right;">Gap</th>
	</tr>
	</thead>
	<tbody>
	{train_row}
	{held_row}
	{overall_row}
	</tbody>
	</table>
	<div style="margin-top:10px;font-size:12px;color:#000;">
	<strong>Cross-tab:</strong>
	{cells['both_caught']} both caught ·
	<strong style="color:#1b5e20;">{cells['only_scripted_missed']} only-scripted-missed</strong>
	(the co-evolution wins) ·
	{cells['only_v2_missed']} only-v2-missed (v3 targets) ·
	{cells['both_missed']} both missed
	</div>
	</div>
	"""


	def render_sample(index: int) -> str:
	    """Render one sample's detail card as an HTML string.

	    Args:
	        index: Position of the sample in the sequence returned by
	            ``load_samples()``.

	    Returns:
	        An HTML fragment showing the seed prompt, the generated text, both
	        defenders' scores and verdicts, the v2 signals and explanation, and
	        an asymmetry note when exactly one defender caught the sample. An
	        out-of-range ``index`` yields a short error ``<div>`` instead.

	    All text taken from the sample (seed, completion, split, signals,
	    explanation) is HTML-escaped before interpolation, because it is
	    model-generated or data-file content and must not be interpreted as
	    markup.
	    """
	    # Function-scope import: keeps this block self-contained without
	    # touching the module's import section.
	    import html

	    samples = load_samples()
	    if index < 0 or index >= len(samples):
	        return '<div style="color:#b71c1c;">Sample index out of range.</div>'
	    s = samples[index]

	    def _esc(text: str) -> str:
	        """Escape &, < and > so the text is safe as element content."""
	        return html.escape(text, quote=False)

	    # Newlines become <br> only after escaping, so the inserted tags survive.
	    seed_html = _esc(s.seed).replace("\n", "<br>")
	    completion_html = _esc(s.completion).replace("\n", "<br>")
	    explanation_html = _esc(s.v2_explanation)
	    split_html = _esc(s.split)

	    signals_html = " ".join(
	        f'<span style="display:inline-block;padding:2px 8px;border-radius:999px;'
	        f'background:#FFF3E6;border:1px solid #381932;color:#381932;'
	        f'margin:2px 4px 2px 0;font-size:11px;">{_esc(sig)}</span>'
	        for sig in s.v2_signals
	    )
	    if not signals_html:
	        signals_html = '<span style="color:rgba(0,0,0,0.55);font-size:12px;">(no signals declared)</span>'

	    # Placeholder shown when the v2 response produced no explanation text.
	    explanation_block = (
	        explanation_html
	        or '<em style="color:rgba(0,0,0,0.55);">(no parseable explanation)</em>'
	    )
	    refusal_html = ' · <strong style="color:#b71c1c;">REFUSAL</strong>' if s.is_refusal else ''

	    asymmetry_note = ""
	    if (not s.scripted_caught) and s.v2_caught:
	        asymmetry_note = (
	            '<div style="background:#e8f5e9;border-left:4px solid #1b5e20;padding:8px 12px;'
	            'margin-top:10px;font-size:13px;color:#1b5e20;">'
	            "<strong>This is the co-evolution win.</strong> The trained Scammer evaded the "
	            "rule-based detector — but the v2 Analyzer LoRA (trained on the env's 8-rubric "
	            "reward) catches it. This is the kind of pair that the +60 pp head-to-head gap "
	            "is built from."
	            "</div>"
	        )
	    elif s.scripted_caught and (not s.v2_caught):
	        asymmetry_note = (
	            '<div style="background:#fff3e0;border-left:4px solid #e65100;padding:8px 12px;'
	            'margin-top:10px;font-size:13px;color:#bf360c;">'
	            "<strong>v3 target.</strong> Scripted's keyword rules caught this, but our "
	            "v2 LoRA missed it — typically a non-bank category outside v2's training "
	            "distribution. Phase-2 LoRA-vs-LoRA retrain (queued for onsite GPU) closes "
	            "exactly these cases."
	            "</div>"
	        )

	    return f"""
	<div style="background:#fff;border:1px solid #381932;border-radius:8px;padding:16px 18px;">
	<div style="font-size:11px;color:rgba(0,0,0,0.55);margin-bottom:6px;">
	Sample #{s.index} · split = <strong>{split_html}</strong> · {s.length_chars} chars
	{refusal_html}
	</div>

	<div style="font-weight:700;font-size:13px;color:#381932;margin-bottom:4px;">
	Seed prompt (what the trained Scammer was asked to write)
	</div>
	<div style="background:#FFF3E6;padding:10px 12px;border-radius:6px;margin-bottom:14px;
	font-size:13px;line-height:1.5;color:#000;">{seed_html}</div>

	<div style="font-weight:700;font-size:13px;color:#381932;margin-bottom:4px;">
	Generated scam text (Qwen2.5-0.5B + LoRA, after 200 GRPO episodes)
	</div>
	<div style="background:#FFF;border:1px dashed #381932;padding:10px 12px;border-radius:6px;
	margin-bottom:14px;font-size:13px;line-height:1.5;color:#000;
	font-family:'SF Mono','Menlo',monospace;">{completion_html}</div>

	<div style="display:grid;grid-template-columns:1fr 1fr;gap:12px;">
	<div style="background:#FFF;border:2px solid #b71c1c;border-radius:6px;padding:12px;">
	<div style="font-weight:700;font-size:13px;color:#381932;margin-bottom:6px;">
	Defender 1 — rule-based <code>ScriptedAnalyzer</code>
	</div>
	<div style="font-size:13px;color:#000;">
	Score: <strong>{s.scripted_score:.2f}</strong> · {_verdict_pill(s.scripted_caught)}
	</div>
	</div>
	<div style="background:#FFF;border:2px solid #1b5e20;border-radius:6px;padding:12px;">
	<div style="font-weight:700;font-size:13px;color:#381932;margin-bottom:6px;">
	Defender 2 — <code>chakravyuh-analyzer-lora-v2</code> (trained)
	</div>
	<div style="font-size:13px;color:#000;margin-bottom:6px;">
	Score: <strong>{s.v2_score:.2f}</strong> · {_verdict_pill(s.v2_caught)}
	</div>
	<div style="font-size:11px;color:rgba(0,0,0,0.55);margin-bottom:4px;">Signals declared:</div>
	<div style="margin-bottom:8px;">{signals_html}</div>
	<div style="font-size:11px;color:rgba(0,0,0,0.55);margin-bottom:2px;">Explanation:</div>
	<div style="font-size:12px;line-height:1.5;color:#000;">{explanation_block}</div>
	</div>
	</div>

	{asymmetry_note}
	</div>
	"""