Spaces:
Running
Running
| { | |
| "generated_at": "2026-04-25T18:12:09.069260+00:00", | |
| "base_url": "http://localhost:7860", | |
| "rows": [ | |
| { | |
| "task_id": "clean_claim", | |
| "seed": 7, | |
| "done": true, | |
| "reward": 0.8725, | |
| "variant_id": 2, | |
| "evidence_quality": 1.0, | |
| "exploit_penalty": 0.0, | |
| "steps": 4 | |
| }, | |
| { | |
| "task_id": "clean_claim", | |
| "seed": 11, | |
| "done": true, | |
| "reward": 0.8725, | |
| "variant_id": 1, | |
| "evidence_quality": 1.0, | |
| "exploit_penalty": 0.0, | |
| "steps": 4 | |
| }, | |
| { | |
| "task_id": "clean_claim", | |
| "seed": 13, | |
| "done": true, | |
| "reward": 0.8725, | |
| "variant_id": 3, | |
| "evidence_quality": 1.0, | |
| "exploit_penalty": 0.0, | |
| "steps": 4 | |
| }, | |
| { | |
| "task_id": "clean_claim", | |
| "seed": 19, | |
| "done": true, | |
| "reward": 0.8725, | |
| "variant_id": 4, | |
| "evidence_quality": 1.0, | |
| "exploit_penalty": 0.0, | |
| "steps": 4 | |
| }, | |
| { | |
| "task_id": "clean_claim", | |
| "seed": 25, | |
| "done": true, | |
| "reward": 0.8725, | |
| "variant_id": 0, | |
| "evidence_quality": 1.0, | |
| "exploit_penalty": 0.0, | |
| "steps": 4 | |
| }, | |
| { | |
| "task_id": "contradictory_claim", | |
| "seed": 7, | |
| "done": true, | |
| "reward": 0.7497, | |
| "variant_id": 2, | |
| "evidence_quality": 1.0, | |
| "exploit_penalty": 0.0, | |
| "steps": 8 | |
| }, | |
| { | |
| "task_id": "contradictory_claim", | |
| "seed": 11, | |
| "done": true, | |
| "reward": 0.7497, | |
| "variant_id": 1, | |
| "evidence_quality": 1.0, | |
| "exploit_penalty": 0.0, | |
| "steps": 8 | |
| }, | |
| { | |
| "task_id": "contradictory_claim", | |
| "seed": 13, | |
| "done": true, | |
| "reward": 0.7497, | |
| "variant_id": 3, | |
| "evidence_quality": 1.0, | |
| "exploit_penalty": 0.0, | |
| "steps": 8 | |
| }, | |
| { | |
| "task_id": "contradictory_claim", | |
| "seed": 19, | |
| "done": true, | |
| "reward": 0.7497, | |
| "variant_id": 4, | |
| "evidence_quality": 1.0, | |
| "exploit_penalty": 0.0, | |
| "steps": 8 | |
| }, | |
| { | |
| "task_id": "contradictory_claim", | |
| "seed": 25, | |
| "done": true, | |
| "reward": 0.7497, | |
| "variant_id": 0, | |
| "evidence_quality": 1.0, | |
| "exploit_penalty": 0.0, | |
| "steps": 8 | |
| }, | |
| { | |
| "task_id": "distribution_shift_claim", | |
| "seed": 7, | |
| "done": true, | |
| "reward": 0.7827, | |
| "variant_id": 2, | |
| "evidence_quality": 1.0, | |
| "exploit_penalty": 0.0, | |
| "steps": 12 | |
| }, | |
| { | |
| "task_id": "distribution_shift_claim", | |
| "seed": 11, | |
| "done": true, | |
| "reward": 0.7827, | |
| "variant_id": 1, | |
| "evidence_quality": 1.0, | |
| "exploit_penalty": 0.0, | |
| "steps": 12 | |
| }, | |
| { | |
| "task_id": "distribution_shift_claim", | |
| "seed": 13, | |
| "done": true, | |
| "reward": 0.7827, | |
| "variant_id": 3, | |
| "evidence_quality": 1.0, | |
| "exploit_penalty": 0.0, | |
| "steps": 12 | |
| }, | |
| { | |
| "task_id": "distribution_shift_claim", | |
| "seed": 19, | |
| "done": true, | |
| "reward": 0.7827, | |
| "variant_id": 4, | |
| "evidence_quality": 1.0, | |
| "exploit_penalty": 0.0, | |
| "steps": 12 | |
| }, | |
| { | |
| "task_id": "distribution_shift_claim", | |
| "seed": 25, | |
| "done": true, | |
| "reward": 0.7827, | |
| "variant_id": 0, | |
| "evidence_quality": 1.0, | |
| "exploit_penalty": 0.0, | |
| "steps": 12 | |
| }, | |
| { | |
| "task_id": "coordinated_fraud", | |
| "seed": 7, | |
| "done": true, | |
| "reward": 0.823, | |
| "variant_id": 2, | |
| "evidence_quality": 1.0, | |
| "exploit_penalty": 0.0, | |
| "steps": 12 | |
| }, | |
| { | |
| "task_id": "coordinated_fraud", | |
| "seed": 11, | |
| "done": true, | |
| "reward": 0.823, | |
| "variant_id": 1, | |
| "evidence_quality": 1.0, | |
| "exploit_penalty": 0.0, | |
| "steps": 12 | |
| }, | |
| { | |
| "task_id": "coordinated_fraud", | |
| "seed": 13, | |
| "done": true, | |
| "reward": 0.823, | |
| "variant_id": 3, | |
| "evidence_quality": 1.0, | |
| "exploit_penalty": 0.0, | |
| "steps": 12 | |
| }, | |
| { | |
| "task_id": "coordinated_fraud", | |
| "seed": 19, | |
| "done": true, | |
| "reward": 0.823, | |
| "variant_id": 4, | |
| "evidence_quality": 1.0, | |
| "exploit_penalty": 0.0, | |
| "steps": 12 | |
| }, | |
| { | |
| "task_id": "coordinated_fraud", | |
| "seed": 25, | |
| "done": true, | |
| "reward": 0.823, | |
| "variant_id": 0, | |
| "evidence_quality": 1.0, | |
| "exploit_penalty": 0.0, | |
| "steps": 12 | |
| }, | |
| { | |
| "task_id": "identity_fraud", | |
| "seed": 7, | |
| "done": true, | |
| "reward": 0.818, | |
| "variant_id": 2, | |
| "evidence_quality": 1.0, | |
| "exploit_penalty": 0.0, | |
| "steps": 10 | |
| }, | |
| { | |
| "task_id": "identity_fraud", | |
| "seed": 11, | |
| "done": true, | |
| "reward": 0.818, | |
| "variant_id": 1, | |
| "evidence_quality": 1.0, | |
| "exploit_penalty": 0.0, | |
| "steps": 10 | |
| }, | |
| { | |
| "task_id": "identity_fraud", | |
| "seed": 13, | |
| "done": true, | |
| "reward": 0.818, | |
| "variant_id": 3, | |
| "evidence_quality": 1.0, | |
| "exploit_penalty": 0.0, | |
| "steps": 10 | |
| }, | |
| { | |
| "task_id": "identity_fraud", | |
| "seed": 19, | |
| "done": true, | |
| "reward": 0.818, | |
| "variant_id": 4, | |
| "evidence_quality": 1.0, | |
| "exploit_penalty": 0.0, | |
| "steps": 10 | |
| }, | |
| { | |
| "task_id": "identity_fraud", | |
| "seed": 25, | |
| "done": true, | |
| "reward": 0.818, | |
| "variant_id": 0, | |
| "evidence_quality": 1.0, | |
| "exploit_penalty": 0.0, | |
| "steps": 10 | |
| } | |
| ], | |
| "average_reward": 0.8092, | |
| "completion_rate": 1.0, | |
| "cf4_notes": { | |
| "variant_awareness": "All strategies read observation.documents to extract variant-specific values (declared_cost_inr, incident_date, admission_date, claimed_cost_inr, standard_rate_inr, distance_km, template_similarity, days_since_purchase, days_to_claim). Evidence text cites per-variant numbers.", | |
| "reward_variance_explanation": "Rewards differ across tasks (5 unique values: 0.8725, 0.7497, 0.7827, 0.823, 0.818) but remain identical within each task across seeds. This is an env design property: variants change document values (costs, dates, distances) but the reward function scores signal flag_ids not values. The only variant-sensitive reward component is payout_accuracy (clean_claim only), which the agent now satisfies by reading estimate_inr from docs.", | |
| "previous_clean_claim_reward": 0.7625, | |
| "current_clean_claim_reward": 0.8725, | |
| "improvement_source": "Reading variant payout_band values instead of hardcoded amount_inr=150000" | |
| }, | |
| "known_limitations": { | |
| "model_capacity": "0.5B parameter model (Qwen2.5-0.5B-Instruct) — limited reasoning and instruction-following capacity compared to larger models", | |
| "scripted_baseline": "Eval uses scripted strategies (not LLM inference) to isolate env reward mechanics from model quality" | |
| } | |
| } |