Spaces:

ujjwalpardeshi
/

chakravyuh

Running

File size: 9,479 Bytes

03815d6

"""Live red-team handler — same analyzer, two reward profiles.

Demonstrates the v1→v2 reward-hacking story interactively without requiring
GPU. The user types any message; we score it once with the rule-based
ScriptedAnalyzer (CPU-only) and then evaluate the *same* prediction against
two reward profiles:

    - v1 profile: ``AnalyzerRubric`` with ``DEFAULT_WEIGHTS`` (5 leaves; the
      reward-hacked profile that produced 100% detection / 36% FPR).
    - v2 profile: ``AnalyzerRubricV2`` with ``V2_WEIGHTS`` (8 leaves; the
      principled retrain that holds 99.3% detection at 6.7% FPR).

The asymmetry between the two reward totals is the diagnostic. The model
itself is held fixed — we are showing what would have been *rewarded*
under each profile, which is the actual mechanism by which v1 learned to
over-flag.
"""

from __future__ import annotations

import html
from types import SimpleNamespace
from typing import Any

from chakravyuh_env import AnalyzerRubric, AnalyzerRubricV2, V2_WEIGHTS
from chakravyuh_env.agents.analyzer import ScriptedAnalyzer
from chakravyuh_env.openenv_models import ChakravyuhAction
from chakravyuh_env.rubrics import DEFAULT_WEIGHTS
from chakravyuh_env.schemas import AnalyzerScore, ChatMessage, Observation


def _score_message(message: str) -> tuple[float, list[str], str]:
    """Score *message* once with the rule-based analyzer.

    The message is wrapped in a single-turn observation whose only chat
    entry is attributed to the ``"scammer"`` sender, then handed to a fresh
    ``ScriptedAnalyzer``.

    Returns:
        ``(score, signals, explanation)``. If the analyzer's action is not
        an ``AnalyzerScore`` the fallback ``(0.0, [], "(no score returned)")``
        is returned instead.
    """
    chat = [ChatMessage(sender="scammer", turn=1, text=message)]
    observation = Observation(agent_role="analyzer", turn=1, chat_history=chat)
    result = ScriptedAnalyzer().act(observation)

    if not isinstance(result, AnalyzerScore):
        return 0.0, [], "(no score returned)"

    # Signals may be enum members (use their ``.value``) or plain values.
    fired: list[str] = []
    for sig in result.signals or ():
        fired.append(str(getattr(sig, "value", sig)))
    return float(result.score), fired, result.explanation or ""


def _profile_breakdown(
    rubric_class: type,
    weights: dict[str, float],
    score: float,
    signals: list[str],
    explanation: str,
    *,
    is_benign_truth: bool | None,
) -> tuple[dict[str, float], float]:
    """Evaluate a fixed prediction under one reward profile.

    Args:
        rubric_class: Rubric type; instantiated as ``rubric_class(weights=weights)``.
        weights: Leaf weights handed to the rubric constructor.
        score: Analyzer score for the message.
        signals: Signal names the analyzer fired.
        explanation: Analyzer explanation text.
        is_benign_truth: Optional ground-truth label; ``None`` is treated
            the same as "not benign".

    Returns:
        ``(per_leaf_scores, composite_total)`` where the per-leaf scores come
        from ``rubric.last_scores()`` after the forward pass.
    """
    rubric = rubric_class(weights=weights)

    # A score at or above 0.5 counts as "flagged".
    predicted_scam = score >= 0.5
    # ``None`` (no label supplied) collapses to False, i.e. "not benign".
    benign = bool(is_benign_truth)

    episode_outcome: dict[str, Any] = {
        "analyzer_flagged": predicted_scam,
        "detected_by_turn": 1 if (predicted_scam and not benign) else None,
        "is_benign": benign,
        "false_positive": predicted_scam and benign,
        "money_extracted": False,
    }
    prediction = ChakravyuhAction(
        score=float(score),
        signals=list(signals),
        explanation=explanation,
    )
    # The rubric only needs ``done`` and ``outcome`` on the observation here.
    terminal_obs = SimpleNamespace(done=True, outcome=episode_outcome)
    composite = float(rubric.forward(prediction, terminal_obs))
    return rubric.last_scores(), composite


def _classify_asymmetry(
    score: float,
    flagged: bool,
    v1_total: float,
    v2_total: float,
    is_benign_truth: bool | None,
) -> tuple[str, str]:
    """Return (badge_class, badge_text) — the reward-hacking diagnostic."""
    delta = v1_total - v2_total
    # Honesty: when there's no ground-truth label, asymmetry is informational.
    if is_benign_truth is True and flagged:
        return (
            "redteam-asym-warning",
            f"v1 reward profile rewards this false-positive (Δ = {delta:+.2f}). "
            f"v2's −0.8 false_positive penalty corrects it. "
            f"This is the reward-hacking signature in one input.",
        )
    if is_benign_truth is False and not flagged:
        return (
            "redteam-asym-warning",
            f"v1 reward profile under-rewards this missed scam (Δ = {delta:+.2f}). "
            f"v2's stronger missed_scam penalty would have nudged the model harder.",
        )
    if abs(delta) < 0.05:
        return (
            "redteam-asym-agree",
            f"Both profiles agree on this verdict (Δ = {delta:+.2f}).",
        )
    direction = "v1 rewards more" if delta > 0 else "v2 rewards more"
    return (
        "redteam-asym-mild",
        f"Mild asymmetry — {direction} (Δ = {delta:+.2f}). "
        f"Tag a benign / scam ground-truth to see the reward-hacking signature fire.",
    )


def _error_card(label: str, exc: BaseException) -> str:
    """Build the inline error box; the exception text is HTML-escaped."""
    return (
        '<div style="padding:12px 16px;background:#FFE8D2;border:1px solid #381932;'
        'border-radius:10px;color:#000000;font-size:13px;">'
        f"{label}: {html.escape(str(exc))}</div>"
    )


def render_redteam_view(
    message: str,
    *,
    is_benign_truth: bool | None = None,
) -> tuple[str, str, str]:
    """Public entry point — returns three HTML fragments (v1 card, v2 card, badge).

    Args:
        message: Raw text typed by the user. Empty or whitespace-only input
            yields a placeholder prompt in both card slots and an empty badge.
        is_benign_truth: Optional ground-truth label. When provided, the
            asymmetry diagnostic can name the reward-hacking signature
            explicitly. When ``None``, the raw v1−v2 delta is reported
            without claiming it is a hack.

    Returns:
        ``(v1_html, v2_html, badge_html)``. If scoring or reward evaluation
        raises, both card slots carry the same error box and the badge is
        the empty string; the exception is not propagated.
    """
    if not message or not message.strip():
        empty = (
            '<div class="redteam-empty">'
            "Type a scam attempt above and click <strong>Score</strong>. "
            "Try borderline benigns to see v1 over-flag."
            "</div>"
        )
        return empty, empty, ""

    # This is a UI boundary: any failure is rendered inline instead of raised.
    # Exception text can echo user input, so it is escaped before rendering.
    try:
        score, signals, explanation = _score_message(message)
    except Exception as exc:  # noqa: BLE001
        err = _error_card("Analyzer error", exc)
        return err, err, ""

    # Same 0.5 flag threshold that _profile_breakdown applies internally.
    flagged = score >= 0.5

    try:
        v1_breakdown, v1_total = _profile_breakdown(
            AnalyzerRubric, DEFAULT_WEIGHTS, score, signals, explanation,
            is_benign_truth=is_benign_truth,
        )
        v2_breakdown, v2_total = _profile_breakdown(
            AnalyzerRubricV2, V2_WEIGHTS, score, signals, explanation,
            is_benign_truth=is_benign_truth,
        )
    except Exception as exc:  # noqa: BLE001
        err = _error_card("Reward profile error", exc)
        return err, err, ""

    # Both cards show the same prediction; only the reward breakdown differs.
    v1_html = _render_card(
        title="v1 reward profile",
        subtitle="reward-hacked · 5 leaves · DEFAULT_WEIGHTS",
        score=score,
        flagged=flagged,
        signals=signals,
        explanation=explanation,
        breakdown=v1_breakdown,
        weights=DEFAULT_WEIGHTS,
        total=v1_total,
        accent="v1",
    )
    v2_html = _render_card(
        title="v2 reward profile",
        subtitle="principled retrain · 8 leaves · V2_WEIGHTS",
        score=score,
        flagged=flagged,
        signals=signals,
        explanation=explanation,
        breakdown=v2_breakdown,
        weights=V2_WEIGHTS,
        total=v2_total,
        accent="v2",
    )
    badge_class, badge_text = _classify_asymmetry(
        score, flagged, v1_total, v2_total, is_benign_truth
    )
    badge_html = (
        f'<div class="redteam-asym {badge_class}" role="status" aria-live="polite">'
        f"<strong>Asymmetry diagnostic:</strong> {badge_text}"
        "</div>"
    )
    return v1_html, v2_html, badge_html


def _render_card(
    *,
    title: str,
    subtitle: str,
    score: float,
    flagged: bool,
    signals: list[str],
    explanation: str,
    breakdown: dict[str, float],
    weights: dict[str, float],
    total: float,
    accent: str,
) -> str:
    """Render one side of the side-by-side reward-profile comparison."""
    chip = (
        '<span class="redteam-flag flagged">FLAGGED</span>'
        if flagged
        else '<span class="redteam-flag clean">not flagged</span>'
    )
    sig_html = (
        " ".join(f'<span class="redteam-sig">{s}</span>' for s in signals)
        if signals
        else '<span class="redteam-sig redteam-sig-empty">no signals</span>'
    )
    rows: list[str] = []
    for name in weights:
        leaf_value = breakdown.get(name)
        weight = weights[name]
        if leaf_value is None:
            cell = '<td class="redteam-leaf-na">—</td>'
        else:
            contribution = float(leaf_value) * weight
            cell = f'<td class="redteam-leaf-val">{leaf_value:+.2f}</td>'
            cell += (
                f'<td class="redteam-leaf-weight">×{weight:+.2f}</td>'
                f'<td class="redteam-leaf-contrib">{contribution:+.2f}</td>'
            )
        rows.append(
            f'<tr><th class="redteam-leaf-name">{name}</th>'
            + (cell if leaf_value is None else cell)
            + "</tr>"
        )
    body = (
        f'<div class="redteam-card redteam-{accent}" role="article" '
        f'aria-label="{title}">'
        f'<div class="redteam-card-head">'
        f'<strong class="redteam-card-title">{title}</strong>'
        f'<span class="redteam-card-subtitle">{subtitle}</span>'
        f"</div>"
        f'<div class="redteam-card-score-row">'
        f'<div class="redteam-score">{score:.2f}</div>'
        f'<div class="redteam-flag-wrap">{chip}</div>'
        f"</div>"
        f'<div class="redteam-signals" aria-label="signals fired">{sig_html}</div>'
        f'<div class="redteam-explanation"><em>{explanation}</em></div>'
        f'<table class="redteam-breakdown"><thead>'
        f'<tr><th>leaf</th><th>score</th><th>weight</th><th>contribution</th></tr>'
        f'</thead><tbody>{"".join(rows)}</tbody>'
        f'<tfoot><tr><th colspan="3" class="redteam-total-label">Composite</th>'
        f'<th class="redteam-total-val">{total:+.2f}</th></tr></tfoot></table>'
        f"</div>"
    )
    return body