# tests/test_grader_variance.py
# Phase 2 of judging runs a variance check. If all graders return the same score
# for different quality answers, the submission is DISQUALIFIED.
# Run: python -m pytest tests/test_grader_variance.py -v
import sys
from pathlib import Path

# Make the `server` package importable regardless of the working directory.
# This file lives in tests/, so the repository root is one level above it.
# The original '.' entry is kept (and stays first on sys.path) so the documented
# "run from the repo root" invocation behaves exactly as before; the
# __file__-derived root is only a fallback for runs started elsewhere.
_REPO_ROOT = str(Path(__file__).resolve().parent.parent)
if _REPO_ROOT not in sys.path:
    sys.path.insert(0, _REPO_ROOT)
sys.path.insert(0, '.')

from server.graders.base_grader import safe_score
from server.graders.security_grader import compute_correctness as sec_cc
from server.graders.dependency_grader import compute_correctness as dep_cc
from server.graders.clinical_grader import compute_correctness as cli_cc

# ── Security Case for Testing ──
SEC_CASE = {
    'expected_vuln_type': 'sql_injection',
    'cvss_range': [7.5, 9.8],
    'expected_severity': 'critical',
    'required_fix_tokens': ['?', 'parameterized'],
    'current_feedback_keywords': ['sql', 'injection'],
    'original_vuln_pattern': 'query+',
}


def test_sec_identify_variance():
    """Security grader must return 3+ different scores for different quality answers.

    Three `identify_vulnerability` actions are scored against SEC_CASE: one that
    matches the case on every field, one that only gets the vuln_type wrong, and
    one that is wrong on vuln_type, CVSS and severity. The scores must be
    pairwise distinct (at 2 decimal places) and strictly ordered.
    """
    perfect = {
        'action_type': 'identify_vulnerability',
        'vuln_type': 'sql_injection',
        'cvss_score': 8.5,
        'severity': 'critical',
        'affected_line': 1,
    }
    partial = {
        'action_type': 'identify_vulnerability',
        'vuln_type': 'xss',       # wrong vuln_type
        'cvss_score': 8.5,        # but correct CVSS
        'severity': 'critical',   # and correct severity
        'affected_line': 1,
    }
    wrong = {
        'action_type': 'identify_vulnerability',
        'vuln_type': 'xss',       # wrong everything
        'cvss_score': 2.0,
        'severity': 'low',
        'affected_line': 1,
    }
    s1 = safe_score(sec_cc(perfect, SEC_CASE))
    s2 = safe_score(sec_cc(partial, SEC_CASE))
    s3 = safe_score(sec_cc(wrong, SEC_CASE))
    # Rounding first means scores that differ only by float noise do not count
    # as "variance".
    assert len({round(s, 2) for s in [s1, s2, s3]}) >= 3, f'No variance: {s1},{s2},{s3}'
    assert s1 > s2 > s3, f'Wrong ordering: {s1},{s2},{s3}'
    print(f' Security identify variance: {s1:.4f} > {s2:.4f} > {s3:.4f} PASS')


def test_dep_resolve_variance():
    """Dependency grader must score a full resolution above a partial one.

    The partial resolution only has to score at least as well as an empty one
    (`s2 >= s3`); a tie between those two is accepted by this test.
    """
    case = {
        'conflict_packages': ['torch', 'numpy'],
        'compatibility_matrix': {
            'torch': {'2.1.0': {'numpy': '>=1.24'}, '1.9.0': {}},
            'numpy': {'1.24.0': {}, '1.16.0': {}},
        },
        'requirements': {'torch': '1.9.0', 'numpy': '1.16.0'},
    }
    full = {
        'action_type': 'resolve_conflict',
        'packages': {'torch': '2.1.0', 'numpy': '1.24.0'},
        'reasoning': 'ok',
    }
    # numpy 1.16.0 does not satisfy the '>=1.24' constraint listed for torch 2.1.0.
    part = {
        'action_type': 'resolve_conflict',
        'packages': {'torch': '2.1.0', 'numpy': '1.16.0'},
        'reasoning': 'ok',
    }
    empty = {'action_type': 'resolve_conflict', 'packages': {}, 'reasoning': 'ok'}
    s1 = safe_score(dep_cc(full, case))
    s2 = safe_score(dep_cc(part, case))
    s3 = safe_score(dep_cc(empty, case))
    assert s1 > s2 >= s3, f'No variance: {s1},{s2},{s3}'
    print(f' Dependency resolve variance: {s1:.4f} > {s2:.4f} >= {s3:.4f} PASS')


def test_cli_order_variance():
    """Clinical grader must return different scores for correct vs violated dependency order.

    The correct ordering must outscore both a fully reversed ordering and an
    ordering that omits the final required step. No relation between the
    violated and the partial score is asserted.
    """
    case = {
        'dependency_graph': {
            'schedule_surgery': ['resolve_insurance', 'complete_pre_op'],
            'complete_pre_op': ['resolve_insurance'],
            'resolve_insurance': [],
        },
        'required_steps': ['resolve_insurance', 'complete_pre_op', 'schedule_surgery'],
    }
    correct = {
        'action_type': 'order_steps',
        'recovery_steps': ['resolve_insurance', 'complete_pre_op', 'schedule_surgery'],
    }
    # Same steps in reverse: every step precedes the steps it depends on.
    violated = {
        'action_type': 'order_steps',
        'recovery_steps': ['schedule_surgery', 'complete_pre_op', 'resolve_insurance'],
    }
    # Valid order, but 'schedule_surgery' from required_steps is missing.
    partial = {
        'action_type': 'order_steps',
        'recovery_steps': ['resolve_insurance', 'complete_pre_op'],
    }
    s1 = safe_score(cli_cc(correct, case))
    s2 = safe_score(cli_cc(violated, case))
    s3 = safe_score(cli_cc(partial, case))
    assert s1 > s2, f'Violation not penalised: correct={s1}, violated={s2}'
    assert s1 > s3, f'Completeness not rewarded: correct={s1}, partial={s3}'
    print(f' Clinical order variance: {s1:.4f} > violated:{s2:.4f}, partial:{s3:.4f} PASS')


def test_safe_score_clamp():
    """
    safe_score clamps to [0.01, 0.99] — strictly between 0 and 1.

    WHY 0.01 not 0.0: The official spec says scores must be strictly > 0.
    A score of 0.0 from a crashed run looks indistinguishable from a broken environment.
    0.01 signals "ran but failed".

    WHY 0.99 not 1.0: A score of exactly 1.0 means the grader is trivially solved or broken.
    0.99 signals "excellent but not perfect".
    """
    # Floor: None, negative, bad types → 0.01
    assert safe_score(None) == 0.01, f"Expected 0.01, got {safe_score(None)}"
    assert safe_score(-0.5) == 0.01, f"Expected 0.01, got {safe_score(-0.5)}"
    assert safe_score(-999) == 0.01, f"Expected 0.01, got {safe_score(-999)}"
    assert safe_score('bad') == 0.01, f"Expected 0.01, got {safe_score('bad')}"
    assert safe_score([]) == 0.01, f"Expected 0.01, got {safe_score([])}"

    # Ceiling: values > 1 → 0.99
    assert safe_score(1.5) == 0.99, f"Expected 0.99, got {safe_score(1.5)}"
    assert safe_score(2.0) == 0.99, f"Expected 0.99, got {safe_score(2.0)}"
    assert safe_score(100) == 0.99, f"Expected 0.99, got {safe_score(100)}"

    # Exact boundary values
    assert safe_score(0.01) == 0.01, f"Expected 0.01, got {safe_score(0.01)}"
    assert safe_score(0.99) == 0.99, f"Expected 0.99, got {safe_score(0.99)}"

    # Pass-through: normal values in range stay unchanged
    assert safe_score(0.5) == 0.5, f"Expected 0.5, got {safe_score(0.5)}"
    assert safe_score(0.85) == 0.85, f"Expected 0.85, got {safe_score(0.85)}"

    # Just inside (0, 1) but outside [0.01, 0.99]: still clamped to the bounds
    assert safe_score(0.0001) == 0.01, f"Expected 0.01 (below floor), got {safe_score(0.0001)}"
    assert safe_score(0.9999) == 0.99, f"Expected 0.99 (above ceiling), got {safe_score(0.9999)}"
    print(' safe_score clamp [0.01, 0.99]: PASS')


def test_clinical_valid_actions():
    """Bug 2 fix: propose_recovery must NOT be in clinical VALID_ACTIONS."""
    # Imported here rather than at module top, so a problem importing
    # VALID_ACTIONS fails this test only, not collection of the whole module.
    from server.graders.clinical_grader import VALID_ACTIONS
    assert 'propose_recovery' not in VALID_ACTIONS, 'Bug 2 still present!'
    # Exact-set check: also catches any other unexpected action being added.
    assert set(VALID_ACTIONS) == {'detect_gap', 'rank_issues', 'order_steps'}
    print(' Clinical VALID_ACTIONS (Bug 2): PASS')


if __name__ == '__main__':
    # Allow running the checks directly, without pytest.
    test_safe_score_clamp()
    test_clinical_valid_actions()
    test_sec_identify_variance()
    test_dep_resolve_variance()
    test_cli_order_variance()
    print('\nALL VARIANCE TESTS PASSED ✅')