Spaces:
Running
Running
| """Live red-team tab — same analyzer, two reward profiles.""" | |
| from __future__ import annotations | |
| import pytest | |
| from server.redteam_handler import ( | |
| _classify_asymmetry, | |
| _profile_breakdown, | |
| _score_message, | |
| render_redteam_view, | |
| ) | |
| from chakravyuh_env import AnalyzerRubric, AnalyzerRubricV2, V2_WEIGHTS | |
| from chakravyuh_env.rubrics import DEFAULT_WEIGHTS | |
def test_render_returns_three_html_strings_on_empty() -> None:
    """Empty input: both cards contain the prompt text and the badge is ''."""
    card_v1, card_v2, asym_badge = render_redteam_view("")

    prompt_text = "Type a scam attempt"
    for card in (card_v1, card_v2):
        assert prompt_text in card

    # With nothing typed, the third returned string must be exactly empty.
    assert asym_badge == ""
def test_render_returns_three_html_strings_on_clear_scam() -> None:
    """A clear scam message renders both cards as flagged, plus a badge."""
    message = "Share your OTP urgently to claim your prize"
    card_v1, card_v2, asym_badge = render_redteam_view(message)

    cards_with_profile_class = (
        (card_v1, "redteam-v1"),
        (card_v2, "redteam-v2"),
    )
    for card, profile_class in cards_with_profile_class:
        # Full structure: the shared card class and the per-profile class.
        assert "redteam-card" in card
        assert profile_class in card
        # The score should be high, so the FLAGGED chip shows (any casing).
        assert "flagged" in card.lower()

    # The badge is present.
    assert "Asymmetry diagnostic" in asym_badge
def test_score_message_returns_tuple() -> None:
    """``_score_message`` unpacks into (score in [0, 1], list, str)."""
    scored = _score_message("Share your OTP urgently")
    score, signals, explanation = scored

    assert 0.0 <= score <= 1.0
    # Check the container types of the two non-numeric members.
    for value, expected_type in ((signals, list), (explanation, str)):
        assert isinstance(value, expected_type)
def test_v2_profile_has_more_leaves_than_v1() -> None:
    """The v2 weight table has more entries than v1 and adds v2-only leaves.

    Both reward profiles are also run through ``_profile_breakdown`` for the
    same scored message; their results are only unpacked, not inspected.
    """
    message = "Share your OTP urgently to claim prize"
    score, signals, explanation = _score_message(message)

    # Exercise v1 first, then v2, with identical analyzer output.
    profiles = (
        (AnalyzerRubric, DEFAULT_WEIGHTS),
        (AnalyzerRubricV2, V2_WEIGHTS),
    )
    for rubric_cls, weights in profiles:
        _breakdown, _ = _profile_breakdown(
            rubric_cls,
            weights,
            score,
            signals,
            explanation,
            is_benign_truth=False,
        )

    assert len(DEFAULT_WEIGHTS) < len(V2_WEIGHTS)
    # Leaves that only the v2 profile defines.
    for v2_only_leaf in ("format", "length"):
        assert v2_only_leaf in V2_WEIGHTS
    # NOTE: only "format" is checked for absence from the v1 weights.
    assert "format" not in DEFAULT_WEIGHTS
def test_classify_asymmetry_warns_on_false_positive() -> None:
    """Benign ground truth plus a flagged verdict is the reward-hacking signature.

    The classifier must return the warning class and text that mentions
    "false-positive" (case-insensitive).
    """
    scenario = {
        "score": 0.8,
        "flagged": True,
        "v1_total": 1.0,
        "v2_total": -0.5,
        "is_benign_truth": True,
    }
    css_class, message = _classify_asymmetry(**scenario)

    assert css_class == "redteam-asym-warning"
    assert "false-positive" in message.lower()
def test_classify_asymmetry_warns_on_missed_scam() -> None:
    """Scam ground truth plus an unflagged verdict is the missed-scam signature.

    The classifier must return the warning class and text that mentions
    "missed" (case-insensitive).
    """
    scenario = {
        "score": 0.3,
        "flagged": False,
        "v1_total": 0.0,
        "v2_total": -0.5,
        "is_benign_truth": False,
    }
    css_class, message = _classify_asymmetry(**scenario)

    assert css_class == "redteam-asym-warning"
    assert "missed" in message.lower()
def test_classify_asymmetry_neutral_on_agreement() -> None:
    """Near-equal v1 and v2 totals with no ground truth give the agree class."""
    scenario = {
        "score": 0.5,
        "flagged": True,
        "v1_total": 0.5,
        "v2_total": 0.51,
        "is_benign_truth": None,
    }
    # Only the returned class matters here; the text is not inspected.
    css_class, _message = _classify_asymmetry(**scenario)

    assert css_class == "redteam-asym-agree"
def test_render_with_benign_ground_truth_surfaces_diagnostic() -> None:
    """Marking a flagged input as benign-truth should surface the warning."""
    _card_v1, _card_v2, asym_badge = render_redteam_view(
        "Share your OTP urgently",
        is_benign_truth=True,
    )

    # Either marker is enough: the warning class, or (failing that) a
    # case-insensitive mention of "false-positive" in the badge text.
    if "redteam-asym-warning" not in asym_badge:
        assert "false-positive" in asym_badge.lower()