chakravyuh / tests /test_redteam_tab.py
UjjwalPardeshi
deploy: latest main to HF Space
03815d6
"""Live red-team tab β€” same analyzer, two reward profiles."""
from __future__ import annotations
import pytest
from server.redteam_handler import (
_classify_asymmetry,
_profile_breakdown,
_score_message,
render_redteam_view,
)
from chakravyuh_env import AnalyzerRubric, AnalyzerRubricV2, V2_WEIGHTS
from chakravyuh_env.rubrics import DEFAULT_WEIGHTS
@pytest.mark.unit
def test_render_returns_three_html_strings_on_empty() -> None:
    """Empty input: both cards contain the prompt text and the badge is empty."""
    v1_html, v2_html, badge_html = render_redteam_view("")

    # Both profile cards must carry the same prompt text.
    for card_html in (v1_html, v2_html):
        assert "Type a scam attempt" in card_html

    # No diagnostic badge is produced when there is nothing to score.
    assert badge_html == ""
@pytest.mark.unit
def test_render_returns_three_html_strings_on_clear_scam() -> None:
    """A clear scam message renders both cards as flagged and produces a badge."""
    message = "Share your OTP urgently to claim your prize"
    v1_html, v2_html, badge_html = render_redteam_view(message)
    cards = (v1_html, v2_html)

    # Both cards render the shared card structure ...
    for card_html in cards:
        assert 'redteam-card' in card_html
    # ... and each carries its own profile-specific class.
    assert 'redteam-v1' in v1_html
    assert 'redteam-v2' in v2_html

    # Score should be high -> both cards show the FLAGGED chip.
    for card_html in cards:
        assert 'flagged' in card_html.lower()

    # The asymmetry badge is present.
    assert "Asymmetry diagnostic" in badge_html
@pytest.mark.unit
def test_score_message_returns_tuple() -> None:
    """`_score_message` unpacks to (score in [0, 1], list of signals, str explanation)."""
    risk, fired_signals, rationale = _score_message("Share your OTP urgently")

    # The score is bounded on both sides.
    assert 0.0 <= risk
    assert risk <= 1.0

    assert isinstance(fired_signals, list)
    assert isinstance(rationale, str)
@pytest.mark.unit
def test_v2_profile_has_more_leaves_than_v1() -> None:
    """The v2 weight table has more entries than v1 and adds v2-only leaves.

    Only the weight tables are asserted on: ``V2_WEIGHTS`` must be larger
    than ``DEFAULT_WEIGHTS`` and must contain ``"format"`` and ``"length"``,
    while ``"format"`` must be absent from ``DEFAULT_WEIGHTS``.
    """
    score, signals, expl = _score_message("Share your OTP urgently to claim prize")

    # Build the breakdown for each profile. The results are not inspected, so
    # these calls only verify that both profiles can be evaluated and that
    # each call returns a 2-item result.
    # NOTE(review): consider asserting on the breakdowns themselves once their
    # structure is confirmed.
    profiles = (
        (AnalyzerRubric, DEFAULT_WEIGHTS),
        (AnalyzerRubricV2, V2_WEIGHTS),
    )
    for rubric_cls, weights in profiles:
        _breakdown, _ = _profile_breakdown(
            rubric_cls, weights, score, signals, expl,
            is_benign_truth=False,
        )

    assert len(V2_WEIGHTS) > len(DEFAULT_WEIGHTS)
    # Leaves expected in the v2 table.
    for leaf in ("format", "length"):
        assert leaf in V2_WEIGHTS
    assert "format" not in DEFAULT_WEIGHTS
@pytest.mark.unit
def test_classify_asymmetry_warns_on_false_positive() -> None:
    """Benign ground truth + flagged input -> warning class with a false-positive message."""
    scenario = {
        "score": 0.8,
        "flagged": True,
        "v1_total": 1.0,
        "v2_total": -0.5,
        "is_benign_truth": True,
    }
    css_class, message = _classify_asymmetry(**scenario)

    assert css_class == "redteam-asym-warning"
    assert "false-positive" in message.lower()
@pytest.mark.unit
def test_classify_asymmetry_warns_on_missed_scam() -> None:
    """Scam ground truth + unflagged input -> warning class with a "missed" message."""
    scenario = {
        "score": 0.3,
        "flagged": False,
        "v1_total": 0.0,
        "v2_total": -0.5,
        "is_benign_truth": False,
    }
    css_class, message = _classify_asymmetry(**scenario)

    assert css_class == "redteam-asym-warning"
    assert "missed" in message.lower()
@pytest.mark.unit
def test_classify_asymmetry_neutral_on_agreement() -> None:
    """Near-equal v1/v2 totals with no ground-truth label -> the "agree" class."""
    scenario = {
        "score": 0.5,
        "flagged": True,
        "v1_total": 0.5,
        "v2_total": 0.51,
        "is_benign_truth": None,
    }
    # Only the returned class is checked; the accompanying text is ignored.
    css_class, _ = _classify_asymmetry(**scenario)

    assert css_class == "redteam-asym-agree"
@pytest.mark.unit
def test_render_with_benign_ground_truth_surfaces_diagnostic() -> None:
    """Marking a scam-like input as benign ground truth should surface the warning badge."""
    rendered = render_redteam_view(
        "Share your OTP urgently",
        is_benign_truth=True,
    )
    _, _, badge_html = rendered

    # Either signal is accepted: the warning CSS class or a false-positive mention.
    if "redteam-asym-warning" in badge_html:
        warned = True
    else:
        warned = "false-positive" in badge_html.lower()
    assert warned