Spaces:

ujjwalpardeshi
/

chakravyuh

Running

chakravyuh / logs /ablation_study.json

UjjwalPardeshi

deploy: latest main to HF Space

03815d6 13 days ago

1.98 kB

	{
	"n_scenarios": 135,
	"threshold": 0.5,
	"default_weights": {
	"detection": 1.0,
	"missed_scam": -0.5,
	"false_positive": -0.3,
	"calibration": 0.2,
	"explanation": 0.4
	},
	"rubric_class": "AnalyzerRubric",
	"full_summary": {
	"n": 135,
	"tp": 83,
	"fp": 6,
	"fn": 32,
	"tn": 14,
	"detection": 0.7217,
	"fpr": 0.3,
	"precision": 0.9326,
	"f1": 0.8137
	},
	"full_avg_reward": 0.7336,
	"ablations": [
	{
	"rubric_zeroed": "detection",
	"default_weight": 1.0,
	"avg_reward_full": 0.7336,
	"avg_reward_zeroed": 0.1188,
	"delta_reward": -0.6148,
	"interpretation": "rubric matters (reward dropped without it)"
	},
	{
	"rubric_zeroed": "missed_scam",
	"default_weight": -0.5,
	"avg_reward_full": 0.7336,
	"avg_reward_zeroed": 0.7336,
	"delta_reward": 0.0,
	"interpretation": "no effect"
	},
	{
	"rubric_zeroed": "false_positive",
	"default_weight": -0.3,
	"avg_reward_full": 0.7336,
	"avg_reward_zeroed": 0.7469,
	"delta_reward": 0.0133,
	"interpretation": "rubric helps (reward rose without it)"
	},
	{
	"rubric_zeroed": "calibration",
	"default_weight": 0.2,
	"avg_reward_full": 0.7336,
	"avg_reward_zeroed": 0.6015,
	"delta_reward": -0.1321,
	"interpretation": "rubric matters (reward dropped without it)"
	},
	{
	"rubric_zeroed": "explanation",
	"default_weight": 0.4,
	"avg_reward_full": 0.7336,
	"avg_reward_zeroed": 0.7336,
	"delta_reward": 0.0,
	"interpretation": "no effect"
	}
	],
	"notes": "Post-hoc, eval-time ablation on scripted-baseline scenarios. Each rubric is zeroed in turn; we report \u0394 in average composite reward across the bench. This is a sensitivity probe, not a retrain-ablation \u2014 true 'rubric contribution to learning' requires retraining v2 with each rubric removed (GPU; v3 work).",
	"source_mode": "bench"
	}