chakravyuh / server /redteam_handler.py
UjjwalPardeshi
deploy: latest main to HF Space
03815d6
"""Live red-team handler — same analyzer, two reward profiles.
Demonstrates the v1→v2 reward-hacking story interactively without requiring
GPU. The user types any message; we score it once with the rule-based
ScriptedAnalyzer (CPU-only) and then evaluate the *same* prediction against
two reward profiles:
- v1 profile: ``AnalyzerRubric`` with ``DEFAULT_WEIGHTS`` (5 leaves; the
reward-hacked profile that produced 100% detection / 36% FPR).
- v2 profile: ``AnalyzerRubricV2`` with ``V2_WEIGHTS`` (8 leaves; the
principled retrain that holds 99.3% detection at 6.7% FPR).
The asymmetry between the two reward totals is the diagnostic. The model
itself is held fixed — we are showing what would have been *rewarded*
under each profile, which is the actual mechanism by which v1 learned to
over-flag.
"""
from __future__ import annotations
from types import SimpleNamespace
from typing import Any
from chakravyuh_env import AnalyzerRubric, AnalyzerRubricV2, V2_WEIGHTS
from chakravyuh_env.rubrics import DEFAULT_WEIGHTS
from chakravyuh_env.agents.analyzer import ScriptedAnalyzer
from chakravyuh_env.openenv_models import ChakravyuhAction
from chakravyuh_env.schemas import AnalyzerScore, ChatMessage, Observation
def _score_message(message: str) -> tuple[float, list[str], str]:
"""Run the rule-based analyzer once. Returns (score, signals, explanation)."""
analyzer = ScriptedAnalyzer()
obs = Observation(
agent_role="analyzer",
turn=1,
chat_history=[ChatMessage(sender="scammer", turn=1, text=message)],
)
action = analyzer.act(obs)
if isinstance(action, AnalyzerScore):
signals = [str(s.value if hasattr(s, "value") else s) for s in (action.signals or ())]
return float(action.score), signals, action.explanation or ""
return 0.0, [], "(no score returned)"
def _profile_breakdown(
rubric_class: type,
weights: dict[str, float],
score: float,
signals: list[str],
explanation: str,
*,
is_benign_truth: bool | None,
) -> tuple[dict[str, float], float]:
"""Compute one reward profile's per-leaf scores + composite total."""
rubric = rubric_class(weights=weights)
flag_threshold = 0.5
flagged = score >= flag_threshold
is_benign = bool(is_benign_truth) if is_benign_truth is not None else False
outcome: dict[str, Any] = {
"analyzer_flagged": flagged,
"detected_by_turn": 1 if (flagged and not is_benign) else None,
"is_benign": is_benign,
"false_positive": flagged and is_benign,
"money_extracted": False,
}
action = ChakravyuhAction(
score=float(score),
signals=list(signals),
explanation=explanation,
)
obs = SimpleNamespace(done=True, outcome=outcome)
total = float(rubric.forward(action, obs))
return rubric.last_scores(), total
def _classify_asymmetry(
score: float,
flagged: bool,
v1_total: float,
v2_total: float,
is_benign_truth: bool | None,
) -> tuple[str, str]:
"""Return (badge_class, badge_text) — the reward-hacking diagnostic."""
delta = v1_total - v2_total
# Honesty: when there's no ground-truth label, asymmetry is informational.
if is_benign_truth is True and flagged:
return (
"redteam-asym-warning",
f"v1 reward profile rewards this false-positive (Δ = {delta:+.2f}). "
f"v2's −0.8 false_positive penalty corrects it. "
f"This is the reward-hacking signature in one input.",
)
if is_benign_truth is False and not flagged:
return (
"redteam-asym-warning",
f"v1 reward profile under-rewards this missed scam (Δ = {delta:+.2f}). "
f"v2's stronger missed_scam penalty would have nudged the model harder.",
)
if abs(delta) < 0.05:
return (
"redteam-asym-agree",
f"Both profiles agree on this verdict (Δ = {delta:+.2f}).",
)
direction = "v1 rewards more" if delta > 0 else "v2 rewards more"
return (
"redteam-asym-mild",
f"Mild asymmetry — {direction} (Δ = {delta:+.2f}). "
f"Tag a benign / scam ground-truth to see the reward-hacking signature fire.",
)
def render_redteam_view(
message: str,
*,
is_benign_truth: bool | None = None,
) -> tuple[str, str, str]:
"""Public entry point — returns three HTML fragments (v1 card, v2 card, badge).
is_benign_truth is optional; when provided, the asymmetry diagnostic
can name the reward-hacking signature explicitly. When None, we report
the raw v1−v2 delta without claiming it's a hack.
"""
if not message or not message.strip():
empty = (
'<div class="redteam-empty">'
"Type a scam attempt above and click <strong>Score</strong>. "
"Try borderline benigns to see v1 over-flag."
"</div>"
)
return empty, empty, ""
try:
score, signals, explanation = _score_message(message)
except Exception as exc: # noqa: BLE001
err = (
f'<div style="padding:12px 16px;background:#FFE8D2;border:1px solid #381932;'
f'border-radius:10px;color:#000000;font-size:13px;">Analyzer error: {exc!s}</div>'
)
return err, err, ""
flagged = score >= 0.5
try:
v1_breakdown, v1_total = _profile_breakdown(
AnalyzerRubric, DEFAULT_WEIGHTS, score, signals, explanation,
is_benign_truth=is_benign_truth,
)
v2_breakdown, v2_total = _profile_breakdown(
AnalyzerRubricV2, V2_WEIGHTS, score, signals, explanation,
is_benign_truth=is_benign_truth,
)
except Exception as exc: # noqa: BLE001
err = (
f'<div style="padding:12px 16px;background:#FFE8D2;border:1px solid #381932;'
f'border-radius:10px;color:#000000;font-size:13px;">Reward profile error: {exc!s}</div>'
)
return err, err, ""
v1_html = _render_card(
title="v1 reward profile",
subtitle="reward-hacked · 5 leaves · DEFAULT_WEIGHTS",
score=score,
flagged=flagged,
signals=signals,
explanation=explanation,
breakdown=v1_breakdown,
weights=DEFAULT_WEIGHTS,
total=v1_total,
accent="v1",
)
v2_html = _render_card(
title="v2 reward profile",
subtitle="principled retrain · 8 leaves · V2_WEIGHTS",
score=score,
flagged=flagged,
signals=signals,
explanation=explanation,
breakdown=v2_breakdown,
weights=V2_WEIGHTS,
total=v2_total,
accent="v2",
)
badge_class, badge_text = _classify_asymmetry(
score, flagged, v1_total, v2_total, is_benign_truth
)
badge_html = (
f'<div class="redteam-asym {badge_class}" role="status" aria-live="polite">'
f"<strong>Asymmetry diagnostic:</strong> {badge_text}"
"</div>"
)
return v1_html, v2_html, badge_html
def _render_card(
*,
title: str,
subtitle: str,
score: float,
flagged: bool,
signals: list[str],
explanation: str,
breakdown: dict[str, float],
weights: dict[str, float],
total: float,
accent: str,
) -> str:
"""Render one side of the side-by-side reward-profile comparison."""
chip = (
'<span class="redteam-flag flagged">FLAGGED</span>'
if flagged
else '<span class="redteam-flag clean">not flagged</span>'
)
sig_html = (
" ".join(f'<span class="redteam-sig">{s}</span>' for s in signals)
if signals
else '<span class="redteam-sig redteam-sig-empty">no signals</span>'
)
rows: list[str] = []
for name in weights:
leaf_value = breakdown.get(name)
weight = weights[name]
if leaf_value is None:
cell = '<td class="redteam-leaf-na">—</td>'
else:
contribution = float(leaf_value) * weight
cell = f'<td class="redteam-leaf-val">{leaf_value:+.2f}</td>'
cell += (
f'<td class="redteam-leaf-weight">×{weight:+.2f}</td>'
f'<td class="redteam-leaf-contrib">{contribution:+.2f}</td>'
)
rows.append(
f'<tr><th class="redteam-leaf-name">{name}</th>'
+ (cell if leaf_value is None else cell)
+ "</tr>"
)
body = (
f'<div class="redteam-card redteam-{accent}" role="article" '
f'aria-label="{title}">'
f'<div class="redteam-card-head">'
f'<strong class="redteam-card-title">{title}</strong>'
f'<span class="redteam-card-subtitle">{subtitle}</span>'
f"</div>"
f'<div class="redteam-card-score-row">'
f'<div class="redteam-score">{score:.2f}</div>'
f'<div class="redteam-flag-wrap">{chip}</div>'
f"</div>"
f'<div class="redteam-signals" aria-label="signals fired">{sig_html}</div>'
f'<div class="redteam-explanation"><em>{explanation}</em></div>'
f'<table class="redteam-breakdown"><thead>'
f'<tr><th>leaf</th><th>score</th><th>weight</th><th>contribution</th></tr>'
f'</thead><tbody>{"".join(rows)}</tbody>'
f'<tfoot><tr><th colspan="3" class="redteam-total-label">Composite</th>'
f'<th class="redteam-total-val">{total:+.2f}</th></tr></tfoot></table>'
f"</div>"
)
return body