Spaces:
Running
Running
| { | |
| "n_scenarios": 135, | |
| "threshold": 0.5, | |
| "default_weights": { | |
| "detection": 1.0, | |
| "missed_scam": -0.5, | |
| "false_positive": -0.3, | |
| "calibration": 0.2, | |
| "explanation": 0.4 | |
| }, | |
| "rubric_class": "AnalyzerRubric", | |
| "full_summary": { | |
| "n": 135, | |
| "tp": 83, | |
| "fp": 6, | |
| "fn": 32, | |
| "tn": 14, | |
| "detection": 0.7217, | |
| "fpr": 0.3, | |
| "precision": 0.9326, | |
| "f1": 0.8137 | |
| }, | |
| "full_avg_reward": 0.7336, | |
| "ablations": [ | |
| { | |
| "rubric_zeroed": "detection", | |
| "default_weight": 1.0, | |
| "avg_reward_full": 0.7336, | |
| "avg_reward_zeroed": 0.1188, | |
| "delta_reward": -0.6148, | |
| "interpretation": "rubric matters (reward dropped without it)" | |
| }, | |
| { | |
| "rubric_zeroed": "missed_scam", | |
| "default_weight": -0.5, | |
| "avg_reward_full": 0.7336, | |
| "avg_reward_zeroed": 0.7336, | |
| "delta_reward": 0.0, | |
| "interpretation": "no effect" | |
| }, | |
| { | |
| "rubric_zeroed": "false_positive", | |
| "default_weight": -0.3, | |
| "avg_reward_full": 0.7336, | |
| "avg_reward_zeroed": 0.7469, | |
| "delta_reward": 0.0133, | |
| "interpretation": "rubric helps (reward rose without it)" | |
| }, | |
| { | |
| "rubric_zeroed": "calibration", | |
| "default_weight": 0.2, | |
| "avg_reward_full": 0.7336, | |
| "avg_reward_zeroed": 0.6015, | |
| "delta_reward": -0.1321, | |
| "interpretation": "rubric matters (reward dropped without it)" | |
| }, | |
| { | |
| "rubric_zeroed": "explanation", | |
| "default_weight": 0.4, | |
| "avg_reward_full": 0.7336, | |
| "avg_reward_zeroed": 0.7336, | |
| "delta_reward": 0.0, | |
| "interpretation": "no effect" | |
| } | |
| ], | |
| "notes": "Post-hoc, eval-time ablation on scripted-baseline scenarios. Each rubric is zeroed in turn; we report \u0394 in average composite reward across the bench. This is a sensitivity probe, not a retrain-ablation \u2014 true 'rubric contribution to learning' requires retraining v2 with each rubric removed (GPU; v3 work).", | |
| "source_mode": "bench" | |
| } | |