| """ | |
| tests/test_calibration.py | |
| Tests for the calibration grader — run with: pytest tests/test_calibration.py -v | |
| ALL tests must pass before pushing to GitHub. | |
| """ | |
| import pytest | |
| from server.calibration_grader import ( | |
| calibration_reward, | |
| detect_confidence_gaming, | |
| training_reward, | |
| eval_reward, | |
| CALIBRATION_MATRIX, | |
| ) | |
| class TestCalibrationMatrix: | |
| """Test the core 3×2 calibration matrix values.""" | |
| def test_high_correct_returns_1_point_0(self): | |
| result = calibration_reward("approve_claim", "HIGH", "approve_claim") | |
| assert result == 1.0 | |
| def test_high_wrong_returns_minus_0_point_8(self): | |
| result = calibration_reward("approve_claim", "HIGH", "deny_claim") | |
| assert result == -0.8 | |
| def test_med_correct_returns_0_point_6(self): | |
| result = calibration_reward("deny_claim", "MED", "deny_claim") | |
| assert result == 0.6 | |

    @pytest.mark.parametrize(
        "confidence, correct, expected",
        [("HIGH", True, 1.0), ("HIGH", False, -0.8), ("MED", True, 0.6)],
    )
    def test_all_outputs_in_valid_range(self, confidence, correct, expected):
        # Cases cover the matrix cells pinned by the individual tests above.
        decision = "approve_claim"
        ground_truth = "approve_claim" if correct else "deny_claim"
        result = calibration_reward(decision, confidence, ground_truth)
        assert result == expected
        assert -1.0 <= result <= 1.0


class TestAntiGaming:
    def test_systematic_low_triggers_gaming_penalty(self):
        history = [{"confidence": "LOW"}] * 15
        penalty = detect_confidence_gaming(history)
        assert penalty > 0

    def test_systematic_high_triggers_gaming_penalty(self):
        history = [{"confidence": "HIGH"}] * 15
        penalty = detect_confidence_gaming(history)
        assert penalty > 0

    def test_gaming_detector_needs_10_episodes_minimum(self):
        history = [{"confidence": "LOW"}] * 9
        penalty = detect_confidence_gaming(history)
        assert penalty == 0.0


class TestTrainingReward:
    def test_training_reward_step_penalty_applied(self):
        result = training_reward("approve_claim", "HIGH", "approve_claim", 0, 1, False)
        assert result == pytest.approx(-0.05)
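
For reference, below is a minimal sketch of the server/calibration_grader.py interface these tests assume. Only the values asserted above are pinned by the tests (HIGH-correct 1.0, HIGH-wrong -0.8, MED-correct 0.6, the 10-episode minimum, and the 0.05 step penalty at a non-final step); the LOW row, the MED-wrong cell, the gaming-penalty size, the eval_reward behaviour, and the meaning of training_reward's last three positional arguments (taken here as step, max_steps, done) are illustrative assumptions, not the project's actual implementation.

# Sketch only: anything not pinned by the tests above is an assumption.

# 3×2 matrix: confidence level × correctness. HIGH row and MED-correct match
# the tests; the remaining cells are placeholder values.
CALIBRATION_MATRIX = {
    "HIGH": {True: 1.0, False: -0.8},
    "MED": {True: 0.6, False: -0.4},   # MED-wrong value assumed
    "LOW": {True: 0.3, False: -0.1},   # LOW row assumed
}

GAMING_MIN_EPISODES = 10  # detector returns 0.0 below this history length
STEP_PENALTY = 0.05       # per-step cost seen in TestTrainingReward


def calibration_reward(decision: str, confidence: str, ground_truth: str) -> float:
    """Look up the reward for a (confidence, correctness) pair."""
    return CALIBRATION_MATRIX[confidence][decision == ground_truth]


def detect_confidence_gaming(history: list) -> float:
    """Penalize an agent that reports the same confidence on every episode."""
    if len(history) < GAMING_MIN_EPISODES:
        return 0.0
    levels = {episode["confidence"] for episode in history}
    return 0.5 if len(levels) == 1 else 0.0  # penalty size assumed


def training_reward(decision, confidence, ground_truth, step, max_steps, done) -> float:
    """Dense signal: a step penalty every turn, plus the calibration reward when done."""
    reward = -STEP_PENALTY
    if done:
        reward += calibration_reward(decision, confidence, ground_truth)
    return reward


def eval_reward(decision, confidence, ground_truth) -> float:
    """Sparse evaluation signal: the calibration reward alone, no step penalty (assumed)."""
    return calibration_reward(decision, confidence, ground_truth)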