"""Tests for demo curator + state logic (Gradio-free).""" from __future__ import annotations import pytest from chakravyuh_env.schemas import ChatMessage from server.episode_curator import ( CURATED_EPISODES, format_bank_panel, format_chat_html, format_suspicion_timeline, max_turn, outcome_badge, replay, replay_all, suspicion_score_for_turn, ) @pytest.mark.unit def test_curated_episodes_are_five_with_distinct_seeds(): assert len(CURATED_EPISODES) == 5 seeds = [e.seed for e in CURATED_EPISODES] assert len(set(seeds)) == 5 @pytest.mark.unit def test_curated_episodes_cover_all_outcome_kinds(): kinds = {e.outcome_kind for e in CURATED_EPISODES} assert kinds == { "detection_too_late", "scripted_missed", "victim_refused", "multi_agent_save", "victim_verified", } @pytest.mark.integration def test_replay_is_deterministic(): """Calling replay() twice with the same episode must produce identical output.""" ep = CURATED_EPISODES[0] a = replay(ep) b = replay(ep) assert a.seed == b.seed assert a.outcome == b.outcome assert len(a.chat_history) == len(b.chat_history) for msg_a, msg_b in zip(a.chat_history, b.chat_history): assert msg_a.text == msg_b.text assert msg_a.sender == msg_b.sender @pytest.mark.integration def test_replay_all_produces_five_episodes_without_crash(): results = replay_all() assert len(results) == 5 for r in results: assert len(r.chat_history) >= 1 assert r.outcome.turns_used >= 1 @pytest.mark.integration def test_multi_agent_save_actually_saves(): """Multi-agent defense episode: analyzer flags + bank freezes = victim saved.""" saved = next(e for e in CURATED_EPISODES if e.outcome_kind == "multi_agent_save") result = replay(saved) assert result.outcome.analyzer_flagged is True assert result.outcome.bank_froze is True assert result.outcome.money_extracted is False @pytest.mark.integration def test_victim_refused_episode_actually_refuses(): refused = next(e for e in CURATED_EPISODES if e.outcome_kind == "victim_refused") result = replay(refused) assert result.outcome.victim_refused is True assert result.outcome.money_extracted is False @pytest.mark.integration def test_scripted_missed_demonstrates_analyzer_blind_spot(): missed = next(e for e in CURATED_EPISODES if e.outcome_kind == "scripted_missed") result = replay(missed) # The whole point: analyzer did NOT flag at the threshold assert result.outcome.analyzer_flagged is False assert result.outcome.money_extracted is True @pytest.mark.unit def test_format_chat_html_escapes_safely(): msgs = [ChatMessage(sender="scammer", turn=1, text="")] html = format_chat_html(msgs) # Must neutralize the raw