"""Live red-team handler — same analyzer, two reward profiles.
Demonstrates the v1→v2 reward-hacking story interactively without requiring
GPU. The user types any message; we score it once with the rule-based
ScriptedAnalyzer (CPU-only) and then evaluate the *same* prediction against
two reward profiles:
- v1 profile: ``AnalyzerRubric`` with ``DEFAULT_WEIGHTS`` (5 leaves; the
reward-hacked profile that produced 100% detection / 36% FPR).
- v2 profile: ``AnalyzerRubricV2`` with ``V2_WEIGHTS`` (8 leaves; the
principled retrain that holds 99.3% detection at 6.7% FPR).
The asymmetry between the two reward totals is the diagnostic. The model
itself is held fixed — we are showing what would have been *rewarded*
under each profile, which is the actual mechanism by which v1 learned to
over-flag.
"""
from __future__ import annotations
from types import SimpleNamespace
from typing import Any
from chakravyuh_env import AnalyzerRubric, AnalyzerRubricV2, V2_WEIGHTS
from chakravyuh_env.rubrics import DEFAULT_WEIGHTS
from chakravyuh_env.agents.analyzer import ScriptedAnalyzer
from chakravyuh_env.openenv_models import ChakravyuhAction
from chakravyuh_env.schemas import AnalyzerScore, ChatMessage, Observation
def _score_message(message: str) -> tuple[float, list[str], str]:
    """Score *message* once with the rule-based analyzer.

    The message is wrapped as a single turn-1 chat message from the
    "scammer" sender and handed to a freshly constructed
    ``ScriptedAnalyzer``.

    Returns:
        ``(score, signals, explanation)``. Each signal is rendered as a
        string, using its ``.value`` attribute when it has one. If the
        analyzer's action is not an ``AnalyzerScore``, the fallback
        ``(0.0, [], "(no score returned)")`` is returned instead.
    """
    scorer = ScriptedAnalyzer()
    observation = Observation(
        agent_role="analyzer",
        turn=1,
        chat_history=[ChatMessage(sender="scammer", turn=1, text=message)],
    )
    result = scorer.act(observation)

    # Guard clause: anything other than an AnalyzerScore carries no verdict.
    if not isinstance(result, AnalyzerScore):
        return 0.0, [], "(no score returned)"

    labels: list[str] = []
    for raw_signal in result.signals or ():
        # Enum-like signals expose ``.value``; plain values are used as-is.
        labels.append(str(getattr(raw_signal, "value", raw_signal)))

    return float(result.score), labels, result.explanation or ""
def _profile_breakdown(
    rubric_class: type,
    weights: dict[str, float],
    score: float,
    signals: list[str],
    explanation: str,
    *,
    is_benign_truth: bool | None,
) -> tuple[dict[str, float], float]:
    """Evaluate one reward profile against a fixed analyzer prediction.

    Args:
        rubric_class: Rubric type; instantiated as ``rubric_class(weights=weights)``.
        weights: Weights passed to the rubric constructor.
        score: Analyzer score; the message counts as flagged when ``score >= 0.5``.
        signals: Signal labels forwarded on the action.
        explanation: Analyzer explanation forwarded on the action.
        is_benign_truth: Optional ground-truth label. ``None`` is treated
            the same as ``False`` (not benign) when building the outcome.

    Returns:
        ``(rubric.last_scores(), total)`` where ``total`` is
        ``float(rubric.forward(action, obs))``.
    """
    rubric = rubric_class(weights=weights)

    threshold = 0.5
    was_flagged = score >= threshold
    # bool(None) is False, so an unlabeled message is scored as a non-benign one.
    benign = bool(is_benign_truth)

    # Synthetic single-turn episode outcome handed to the rubric.
    episode_outcome: dict[str, Any] = {
        "analyzer_flagged": was_flagged,
        "detected_by_turn": 1 if (was_flagged and not benign) else None,
        "is_benign": benign,
        "false_positive": was_flagged and benign,
        "money_extracted": False,
    }

    prediction = ChakravyuhAction(
        score=float(score),
        signals=list(signals),
        explanation=explanation,
    )
    terminal_obs = SimpleNamespace(done=True, outcome=episode_outcome)

    # forward() must run before last_scores() is read.
    composite = float(rubric.forward(prediction, terminal_obs))
    per_leaf = rubric.last_scores()
    return per_leaf, composite
def _classify_asymmetry(
score: float,
flagged: bool,
v1_total: float,
v2_total: float,
is_benign_truth: bool | None,
) -> tuple[str, str]:
"""Return (badge_class, badge_text) — the reward-hacking diagnostic."""
delta = v1_total - v2_total
# Honesty: when there's no ground-truth label, asymmetry is informational.
if is_benign_truth is True and flagged:
return (
"redteam-asym-warning",
f"v1 reward profile rewards this false-positive (Δ = {delta:+.2f}). "
f"v2's −0.8 false_positive penalty corrects it. "
f"This is the reward-hacking signature in one input.",
)
if is_benign_truth is False and not flagged:
return (
"redteam-asym-warning",
f"v1 reward profile under-rewards this missed scam (Δ = {delta:+.2f}). "
f"v2's stronger missed_scam penalty would have nudged the model harder.",
)
if abs(delta) < 0.05:
return (
"redteam-asym-agree",
f"Both profiles agree on this verdict (Δ = {delta:+.2f}).",
)
direction = "v1 rewards more" if delta > 0 else "v2 rewards more"
return (
"redteam-asym-mild",
f"Mild asymmetry — {direction} (Δ = {delta:+.2f}). "
f"Tag a benign / scam ground-truth to see the reward-hacking signature fire.",
)
def render_redteam_view(
message: str,
*,
is_benign_truth: bool | None = None,
) -> tuple[str, str, str]:
"""Public entry point — returns three HTML fragments (v1 card, v2 card, badge).
is_benign_truth is optional; when provided, the asymmetry diagnostic
can name the reward-hacking signature explicitly. When None, we report
the raw v1−v2 delta without claiming it's a hack.

The message is scored once via ``_score_message``; that single prediction
is then evaluated under both reward profiles with ``_profile_breakdown``
and each result is rendered with ``_render_card``.

Short-circuit paths (the badge fragment is "" in all of them):
- blank / whitespace-only message: the same placeholder fragment is
returned for both cards;
- the analyzer raises: the same "Analyzer error" fragment is returned
for both cards;
- either reward profile raises: the same "Reward profile error" fragment
is returned for both cards.
"""
# Nothing to score: show the same prompt in both card slots and no badge.
if not message or not message.strip():
empty = (
'
'
"Type a scam attempt above and click Score. "
"Try borderline benigns to see v1 over-flag."
"
"
)
return empty, empty, ""
# Deliberately broad catch: any analyzer failure is surfaced as an error
# fragment rather than propagated to the caller.
# NOTE(review): {exc!s} is interpolated into markup as-is — confirm escaping is not needed.
try:
score, signals, explanation = _score_message(message)
except Exception as exc: # noqa: BLE001
err = (
f'Analyzer error: {exc!s}
'
)
return err, err, ""
# Same 0.5 cut-off that _profile_breakdown applies internally.
flagged = score >= 0.5
# Evaluate the one fixed prediction under both profiles; a failure in either
# one replaces both cards with the same error fragment.
try:
v1_breakdown, v1_total = _profile_breakdown(
AnalyzerRubric, DEFAULT_WEIGHTS, score, signals, explanation,
is_benign_truth=is_benign_truth,
)
v2_breakdown, v2_total = _profile_breakdown(
AnalyzerRubricV2, V2_WEIGHTS, score, signals, explanation,
is_benign_truth=is_benign_truth,
)
except Exception as exc: # noqa: BLE001
err = (
f'Reward profile error: {exc!s}
'
)
return err, err, ""
# The two cards share score/flag/signals/explanation; only the per-leaf
# breakdown, weights, total and accent differ.
v1_html = _render_card(
title="v1 reward profile",
subtitle="reward-hacked · 5 leaves · DEFAULT_WEIGHTS",
score=score,
flagged=flagged,
signals=signals,
explanation=explanation,
breakdown=v1_breakdown,
weights=DEFAULT_WEIGHTS,
total=v1_total,
accent="v1",
)
v2_html = _render_card(
title="v2 reward profile",
subtitle="principled retrain · 8 leaves · V2_WEIGHTS",
score=score,
flagged=flagged,
signals=signals,
explanation=explanation,
breakdown=v2_breakdown,
weights=V2_WEIGHTS,
total=v2_total,
accent="v2",
)
# Turn the difference between the two totals into a badge class and text.
badge_class, badge_text = _classify_asymmetry(
score, flagged, v1_total, v2_total, is_benign_truth
)
badge_html = (
f''
f"Asymmetry diagnostic: {badge_text}"
"
"
)
return v1_html, v2_html, badge_html
def _render_card(
*,
title: str,
subtitle: str,
score: float,
flagged: bool,
signals: list[str],
explanation: str,
breakdown: dict[str, float],
weights: dict[str, float],
total: float,
accent: str,
) -> str:
"""Render one side of the side-by-side reward-profile comparison.

All arguments are keyword-only. The card shows the title and subtitle,
the analyzer score formatted to two decimals, a flagged / not-flagged
chip, the signal list (or a "no signals" placeholder), the explanation,
one table row per key in ``weights``, and the composite ``total``.

Rows follow the iteration order of ``weights``; a leaf present in
``breakdown`` but absent from ``weights`` is not shown. A leaf missing
from ``breakdown`` gets a placeholder cell instead of numeric values.

Returns the assembled fragment as a single string.
"""
# Status chip: one of two fixed labels depending on ``flagged``.
chip = (
'FLAGGED'
if flagged
else 'not flagged'
)
# Signals are space-joined; an empty list falls back to a placeholder.
# NOTE(review): signal text is interpolated as-is — confirm it never needs HTML escaping.
sig_html = (
" ".join(f'{s}' for s in signals)
if signals
else 'no signals'
)
rows: list[str] = []
for name in weights:
# .get() returns None when this profile produced no score for the leaf.
leaf_value = breakdown.get(name)
weight = weights[name]
if leaf_value is None:
cell = '— | '
else:
# Contribution of this leaf to the composite: leaf score × weight.
contribution = float(leaf_value) * weight
cell = f'{leaf_value:+.2f} | '
cell += (
f'×{weight:+.2f} | '
f'{contribution:+.2f} | '
)
# NOTE(review): both arms of the conditional below evaluate to ``cell``,
# so the conditional is redundant as written.
rows.append(
f'| {name} | '
+ (cell if leaf_value is None else cell)
+ "
"
)
# Assemble the card: header, score + chip, signals, explanation, leaf table.
# NOTE(review): ``explanation`` is interpolated as-is — confirm it is trusted or escaped upstream.
body = (
f''
f'
'
f'{title}'
f'{subtitle}'
f"
"
f'
'
f'
{score:.2f}
'
f'
{chip}
'
f"
"
f'
{sig_html}
'
f'
{explanation}
'
f'
'
f'| leaf | score | weight | contribution |
'
f'{"".join(rows)}'
f'| Composite | '
f'{total:+.2f} |
|---|
'
f"
"
)
return body