{ "generated_at": "2026-04-25T18:12:09.069260+00:00", "base_url": "http://localhost:7860", "rows": [ { "task_id": "clean_claim", "seed": 7, "done": true, "reward": 0.8725, "variant_id": 2, "evidence_quality": 1.0, "exploit_penalty": 0.0, "steps": 4 }, { "task_id": "clean_claim", "seed": 11, "done": true, "reward": 0.8725, "variant_id": 1, "evidence_quality": 1.0, "exploit_penalty": 0.0, "steps": 4 }, { "task_id": "clean_claim", "seed": 13, "done": true, "reward": 0.8725, "variant_id": 3, "evidence_quality": 1.0, "exploit_penalty": 0.0, "steps": 4 }, { "task_id": "clean_claim", "seed": 19, "done": true, "reward": 0.8725, "variant_id": 4, "evidence_quality": 1.0, "exploit_penalty": 0.0, "steps": 4 }, { "task_id": "clean_claim", "seed": 25, "done": true, "reward": 0.8725, "variant_id": 0, "evidence_quality": 1.0, "exploit_penalty": 0.0, "steps": 4 }, { "task_id": "contradictory_claim", "seed": 7, "done": true, "reward": 0.7497, "variant_id": 2, "evidence_quality": 1.0, "exploit_penalty": 0.0, "steps": 8 }, { "task_id": "contradictory_claim", "seed": 11, "done": true, "reward": 0.7497, "variant_id": 1, "evidence_quality": 1.0, "exploit_penalty": 0.0, "steps": 8 }, { "task_id": "contradictory_claim", "seed": 13, "done": true, "reward": 0.7497, "variant_id": 3, "evidence_quality": 1.0, "exploit_penalty": 0.0, "steps": 8 }, { "task_id": "contradictory_claim", "seed": 19, "done": true, "reward": 0.7497, "variant_id": 4, "evidence_quality": 1.0, "exploit_penalty": 0.0, "steps": 8 }, { "task_id": "contradictory_claim", "seed": 25, "done": true, "reward": 0.7497, "variant_id": 0, "evidence_quality": 1.0, "exploit_penalty": 0.0, "steps": 8 }, { "task_id": "distribution_shift_claim", "seed": 7, "done": true, "reward": 0.7827, "variant_id": 2, "evidence_quality": 1.0, "exploit_penalty": 0.0, "steps": 12 }, { "task_id": "distribution_shift_claim", "seed": 11, "done": true, "reward": 0.7827, "variant_id": 1, "evidence_quality": 1.0, "exploit_penalty": 0.0, "steps": 12 }, { "task_id": "distribution_shift_claim", "seed": 13, "done": true, "reward": 0.7827, "variant_id": 3, "evidence_quality": 1.0, "exploit_penalty": 0.0, "steps": 12 }, { "task_id": "distribution_shift_claim", "seed": 19, "done": true, "reward": 0.7827, "variant_id": 4, "evidence_quality": 1.0, "exploit_penalty": 0.0, "steps": 12 }, { "task_id": "distribution_shift_claim", "seed": 25, "done": true, "reward": 0.7827, "variant_id": 0, "evidence_quality": 1.0, "exploit_penalty": 0.0, "steps": 12 }, { "task_id": "coordinated_fraud", "seed": 7, "done": true, "reward": 0.823, "variant_id": 2, "evidence_quality": 1.0, "exploit_penalty": 0.0, "steps": 12 }, { "task_id": "coordinated_fraud", "seed": 11, "done": true, "reward": 0.823, "variant_id": 1, "evidence_quality": 1.0, "exploit_penalty": 0.0, "steps": 12 }, { "task_id": "coordinated_fraud", "seed": 13, "done": true, "reward": 0.823, "variant_id": 3, "evidence_quality": 1.0, "exploit_penalty": 0.0, "steps": 12 }, { "task_id": "coordinated_fraud", "seed": 19, "done": true, "reward": 0.823, "variant_id": 4, "evidence_quality": 1.0, "exploit_penalty": 0.0, "steps": 12 }, { "task_id": "coordinated_fraud", "seed": 25, "done": true, "reward": 0.823, "variant_id": 0, "evidence_quality": 1.0, "exploit_penalty": 0.0, "steps": 12 }, { "task_id": "identity_fraud", "seed": 7, "done": true, "reward": 0.818, "variant_id": 2, "evidence_quality": 1.0, "exploit_penalty": 0.0, "steps": 10 }, { "task_id": "identity_fraud", "seed": 11, "done": true, "reward": 0.818, "variant_id": 1, "evidence_quality": 1.0, "exploit_penalty": 0.0, "steps": 10 }, { "task_id": "identity_fraud", "seed": 13, "done": true, "reward": 0.818, "variant_id": 3, "evidence_quality": 1.0, "exploit_penalty": 0.0, "steps": 10 }, { "task_id": "identity_fraud", "seed": 19, "done": true, "reward": 0.818, "variant_id": 4, "evidence_quality": 1.0, "exploit_penalty": 0.0, "steps": 10 }, { "task_id": "identity_fraud", "seed": 25, "done": true, "reward": 0.818, "variant_id": 0, "evidence_quality": 1.0, "exploit_penalty": 0.0, "steps": 10 } ], "average_reward": 0.8092, "completion_rate": 1.0, "cf4_notes": { "variant_awareness": "All strategies read observation.documents to extract variant-specific values (declared_cost_inr, incident_date, admission_date, claimed_cost_inr, standard_rate_inr, distance_km, template_similarity, days_since_purchase, days_to_claim). Evidence text cites per-variant numbers.", "reward_variance_explanation": "Rewards differ across tasks (5 unique values: 0.8725, 0.7497, 0.7827, 0.823, 0.818) but remain identical within each task across seeds. This is an env design property: variants change document values (costs, dates, distances) but the reward function scores signal flag_ids not values. The only variant-sensitive reward component is payout_accuracy (clean_claim only), which the agent now satisfies by reading estimate_inr from docs.", "previous_clean_claim_reward": 0.7625, "current_clean_claim_reward": 0.8725, "improvement_source": "Reading variant payout_band values instead of hardcoded amount_inr=150000" }, "known_limitations": { "model_capacity": "0.5B parameter model (Qwen2.5-0.5B-Instruct) — limited reasoning and instruction-following capacity compared to larger models", "scripted_baseline": "Eval uses scripted strategies (not LLM inference) to isolate env reward mechanics from model quality" } }