Spaces:

ujjwalpardeshi
/

chakravyuh

Running

chakravyuh / server /redteam_handler.py

UjjwalPardeshi

deploy: latest main to HF Space

03815d6 14 days ago

9.48 kB

	"""Live red-team handler — same analyzer, two reward profiles.

	Demonstrates the v1→v2 reward-hacking story interactively without requiring
	GPU. The user types any message; we score it once with the rule-based
	ScriptedAnalyzer (CPU-only) and then evaluate the same prediction against
	two reward profiles:

	- v1 profile: ``AnalyzerRubric`` with ``DEFAULT_WEIGHTS`` (5 leaves; the
	reward-hacked profile that produced 100% detection / 36% FPR).
	- v2 profile: ``AnalyzerRubricV2`` with ``V2_WEIGHTS`` (8 leaves; the
	principled retrain that holds 99.3% detection at 6.7% FPR).

	The asymmetry between the two reward totals is the diagnostic. The model
	itself is held fixed — we are showing what would have been rewarded
	under each profile, which is the actual mechanism by which v1 learned to
	over-flag.
	"""

	from __future__ import annotations

	from types import SimpleNamespace
	from typing import Any

	from chakravyuh_env import AnalyzerRubric, AnalyzerRubricV2, V2_WEIGHTS
	from chakravyuh_env.rubrics import DEFAULT_WEIGHTS
	from chakravyuh_env.agents.analyzer import ScriptedAnalyzer
	from chakravyuh_env.openenv_models import ChakravyuhAction
	from chakravyuh_env.schemas import AnalyzerScore, ChatMessage, Observation


	def _score_message(message: str) -> tuple[float, list[str], str]:
	    """Score ``message`` once with a fresh ``ScriptedAnalyzer``.

	    The text is presented to the analyzer as a single turn-1 chat message
	    sent by "scammer", inside an observation for the "analyzer" role.

	    Returns:
	        ``(score, signals, explanation)``. When the analyzer's action is not
	        an ``AnalyzerScore``, the fallback ``(0.0, [], "(no score returned)")``
	        is returned instead.
	    """
	    scorer = ScriptedAnalyzer()
	    opening_turn = ChatMessage(sender="scammer", turn=1, text=message)
	    observation = Observation(
	        agent_role="analyzer",
	        turn=1,
	        chat_history=[opening_turn],
	    )
	    verdict = scorer.act(observation)

	    # Guard clause: anything other than an AnalyzerScore carries no score.
	    if not isinstance(verdict, AnalyzerScore):
	        return 0.0, [], "(no score returned)"

	    # Signals may be enum-like (exposing ``.value``) or already plain values.
	    signal_names: list[str] = []
	    for raw_signal in verdict.signals or ():
	        signal_names.append(str(getattr(raw_signal, "value", raw_signal)))
	    return float(verdict.score), signal_names, verdict.explanation or ""


	def _profile_breakdown(
	    rubric_class: type,
	    weights: dict[str, float],
	    score: float,
	    signals: list[str],
	    explanation: str,
	    *,
	    is_benign_truth: bool | None,
	) -> tuple[dict[str, float], float]:
	    """Evaluate one analyzer prediction under a single reward profile.

	    Args:
	        rubric_class: Rubric type, instantiated as ``rubric_class(weights=...)``.
	        weights: Leaf weights handed to the rubric constructor.
	        score: Analyzer score; values ``>= 0.5`` count as flagged.
	        signals: Signal names attached to the action.
	        explanation: Analyzer explanation attached to the action.
	        is_benign_truth: Optional ground-truth label; ``None`` is treated as
	            "not benign" when building the outcome.

	    Returns:
	        ``(rubric.last_scores(), total)`` where ``total`` is the float value
	        of ``rubric.forward(action, obs)``.
	    """
	    profile = rubric_class(weights=weights)

	    was_flagged = score >= 0.5
	    benign = is_benign_truth is not None and bool(is_benign_truth)

	    # A detection is only credited when a non-benign message was flagged.
	    detected_turn = None
	    if was_flagged and not benign:
	        detected_turn = 1

	    episode_outcome: dict[str, Any] = {
	        "analyzer_flagged": was_flagged,
	        "detected_by_turn": detected_turn,
	        "is_benign": benign,
	        "false_positive": was_flagged and benign,
	        "money_extracted": False,
	    }
	    prediction = ChakravyuhAction(
	        score=float(score),
	        signals=list(signals),
	        explanation=explanation,
	    )
	    # The rubric is given a finished episode (done=True) carrying the outcome.
	    terminal_obs = SimpleNamespace(done=True, outcome=episode_outcome)
	    composite = float(profile.forward(prediction, terminal_obs))
	    return profile.last_scores(), composite


	def _classify_asymmetry(
	score: float,
	flagged: bool,
	v1_total: float,
	v2_total: float,
	is_benign_truth: bool \| None,
	) -> tuple[str, str]:
	"""Return (badge_class, badge_text) — the reward-hacking diagnostic."""
	delta = v1_total - v2_total
	# Honesty: when there's no ground-truth label, asymmetry is informational.
	if is_benign_truth is True and flagged:
	return (
	"redteam-asym-warning",
	f"v1 reward profile rewards this false-positive (Δ = {delta:+.2f}). "
	f"v2's −0.8 false_positive penalty corrects it. "
	f"This is the reward-hacking signature in one input.",
	)
	if is_benign_truth is False and not flagged:
	return (
	"redteam-asym-warning",
	f"v1 reward profile under-rewards this missed scam (Δ = {delta:+.2f}). "
	f"v2's stronger missed_scam penalty would have nudged the model harder.",
	)
	if abs(delta) < 0.05:
	return (
	"redteam-asym-agree",
	f"Both profiles agree on this verdict (Δ = {delta:+.2f}).",
	)
	direction = "v1 rewards more" if delta > 0 else "v2 rewards more"
	return (
	"redteam-asym-mild",
	f"Mild asymmetry — {direction} (Δ = {delta:+.2f}). "
	f"Tag a benign / scam ground-truth to see the reward-hacking signature fire.",
	)


	def _render_error_banner(label: str, exc: BaseException) -> str:
	    """Return the inline error banner for ``exc`` with its text HTML-escaped."""
	    from html import escape

	    return (
	        f'<div style="padding:12px 16px;background:#FFE8D2;border:1px solid #381932;'
	        f'border-radius:10px;color:#000000;font-size:13px;">{label}: {escape(str(exc))}</div>'
	    )


	def render_redteam_view(
	    message: str,
	    *,
	    is_benign_truth: bool | None = None,
	) -> tuple[str, str, str]:
	    """Public entry point — returns three HTML fragments (v1 card, v2 card, badge).

	    Args:
	        message: Text typed by the user. Empty or whitespace-only input
	            short-circuits to a placeholder prompt in both card slots.
	        is_benign_truth: Optional ground-truth label. When provided, the
	            asymmetry diagnostic can name the reward-hacking signature
	            explicitly. When ``None``, the raw v1−v2 delta is reported
	            without claiming it is a hack.

	    Returns:
	        ``(v1_html, v2_html, badge_html)``. On an analyzer or reward-profile
	        failure both card slots carry the same error banner and the badge is
	        the empty string; exceptions are not propagated to the caller.
	    """
	    if not message or not message.strip():
	        empty = (
	            '<div class="redteam-empty">'
	            "Type a scam attempt above and click <strong>Score</strong>. "
	            "Try borderline benigns to see v1 over-flag."
	            "</div>"
	        )
	        return empty, empty, ""

	    # Broad catches are deliberate: this is the UI boundary, and a failure is
	    # shown as a banner instead of raised. The exception text is escaped
	    # because it can echo the user-typed message back into the page.
	    try:
	        score, signals, explanation = _score_message(message)
	    except Exception as exc:  # noqa: BLE001
	        err = _render_error_banner("Analyzer error", exc)
	        return err, err, ""

	    # Same 0.5 cut-off that _profile_breakdown applies when building outcomes.
	    flagged = score >= 0.5

	    try:
	        v1_breakdown, v1_total = _profile_breakdown(
	            AnalyzerRubric, DEFAULT_WEIGHTS, score, signals, explanation,
	            is_benign_truth=is_benign_truth,
	        )
	        v2_breakdown, v2_total = _profile_breakdown(
	            AnalyzerRubricV2, V2_WEIGHTS, score, signals, explanation,
	            is_benign_truth=is_benign_truth,
	        )
	    except Exception as exc:  # noqa: BLE001
	        err = _render_error_banner("Reward profile error", exc)
	        return err, err, ""

	    # Both cards show the same prediction; only the reward breakdown differs.
	    v1_html = _render_card(
	        title="v1 reward profile",
	        subtitle="reward-hacked · 5 leaves · DEFAULT_WEIGHTS",
	        score=score,
	        flagged=flagged,
	        signals=signals,
	        explanation=explanation,
	        breakdown=v1_breakdown,
	        weights=DEFAULT_WEIGHTS,
	        total=v1_total,
	        accent="v1",
	    )
	    v2_html = _render_card(
	        title="v2 reward profile",
	        subtitle="principled retrain · 8 leaves · V2_WEIGHTS",
	        score=score,
	        flagged=flagged,
	        signals=signals,
	        explanation=explanation,
	        breakdown=v2_breakdown,
	        weights=V2_WEIGHTS,
	        total=v2_total,
	        accent="v2",
	    )
	    badge_class, badge_text = _classify_asymmetry(
	        score, flagged, v1_total, v2_total, is_benign_truth
	    )
	    badge_html = (
	        f'<div class="redteam-asym {badge_class}" role="status" aria-live="polite">'
	        f"<strong>Asymmetry diagnostic:</strong> {badge_text}"
	        "</div>"
	    )
	    return v1_html, v2_html, badge_html


	def _render_card(
	*,
	title: str,
	subtitle: str,
	score: float,
	flagged: bool,
	signals: list[str],
	explanation: str,
	breakdown: dict[str, float],
	weights: dict[str, float],
	total: float,
	accent: str,
	) -> str:
	"""Render one side of the side-by-side reward-profile comparison."""
	chip = (
	'<span class="redteam-flag flagged">FLAGGED</span>'
	if flagged
	else '<span class="redteam-flag clean">not flagged</span>'
	)
	sig_html = (
	" ".join(f'<span class="redteam-sig">{s}</span>' for s in signals)
	if signals
	else '<span class="redteam-sig redteam-sig-empty">no signals</span>'
	)
	rows: list[str] = []
	for name in weights:
	leaf_value = breakdown.get(name)
	weight = weights[name]
	if leaf_value is None:
	cell = '<td class="redteam-leaf-na">—</td>'
	else:
	contribution = float(leaf_value) * weight
	cell = f'<td class="redteam-leaf-val">{leaf_value:+.2f}</td>'
	cell += (
	f'<td class="redteam-leaf-weight">×{weight:+.2f}</td>'
	f'<td class="redteam-leaf-contrib">{contribution:+.2f}</td>'
	)
	rows.append(
	f'<tr><th class="redteam-leaf-name">{name}</th>'
	+ (cell if leaf_value is None else cell)
	+ "</tr>"
	)
	body = (
	f'<div class="redteam-card redteam-{accent}" role="article" '
	f'aria-label="{title}">'
	f'<div class="redteam-card-head">'
	f'<strong class="redteam-card-title">{title}</strong>'
	f'<span class="redteam-card-subtitle">{subtitle}</span>'
	f"</div>"
	f'<div class="redteam-card-score-row">'
	f'<div class="redteam-score">{score:.2f}</div>'
	f'<div class="redteam-flag-wrap">{chip}</div>'
	f"</div>"
	f'<div class="redteam-signals" aria-label="signals fired">{sig_html}</div>'
	f'<div class="redteam-explanation"><em>{explanation}</em></div>'
	f'<table class="redteam-breakdown"><thead>'
	f'<tr><th>leaf</th><th>score</th><th>weight</th><th>contribution</th></tr>'
	f'</thead><tbody>{"".join(rows)}</tbody>'
	f'<tfoot><tr><th colspan="3" class="redteam-total-label">Composite</th>'
	f'<th class="redteam-total-val">{total:+.2f}</th></tr></tfoot></table>'
	f"</div>"
	)
	return body