Spaces:
Running
Running
| { | |
| "base_url": "https://aniketasla-debatefloor.hf.space", | |
| "rows": [ | |
| { | |
| "policy": "naive_high_no_investigation", | |
| "task_id": "clean_claim", | |
| "seed": 42, | |
| "steps": 1, | |
| "done": true, | |
| "reward": 0.7998, | |
| "final_decision": "approve_claim", | |
| "agent_confidence": "HIGH", | |
| "calibration_score": 1.0, | |
| "decision_accuracy": 1.0, | |
| "fraud_detection_score": 1.0, | |
| "evidence_quality_score": 1.0 | |
| }, | |
| { | |
| "policy": "naive_high_no_investigation", | |
| "task_id": "contradictory_claim", | |
| "seed": 42, | |
| "steps": 1, | |
| "done": true, | |
| "reward": 0.0, | |
| "final_decision": "approve_claim", | |
| "agent_confidence": "HIGH", | |
| "calibration_score": -0.8, | |
| "decision_accuracy": 0.0, | |
| "fraud_detection_score": 0.0, | |
| "evidence_quality_score": 0.0 | |
| }, | |
| { | |
| "policy": "naive_high_no_investigation", | |
| "task_id": "distribution_shift_claim", | |
| "seed": 42, | |
| "steps": 1, | |
| "done": true, | |
| "reward": 0.0, | |
| "final_decision": "approve_claim", | |
| "agent_confidence": "HIGH", | |
| "calibration_score": -0.8, | |
| "decision_accuracy": 0.0, | |
| "fraud_detection_score": 0.0, | |
| "evidence_quality_score": 0.0 | |
| }, | |
| { | |
| "policy": "calibrated_scripted_investigator", | |
| "task_id": "clean_claim", | |
| "seed": 42, | |
| "steps": 4, | |
| "done": true, | |
| "reward": 0.7623, | |
| "final_decision": "approve_claim", | |
| "agent_confidence": "HIGH", | |
| "calibration_score": 1.0, | |
| "decision_accuracy": 1.0, | |
| "fraud_detection_score": 1.0, | |
| "evidence_quality_score": 1.0 | |
| }, | |
| { | |
| "policy": "calibrated_scripted_investigator", | |
| "task_id": "contradictory_claim", | |
| "seed": 42, | |
| "steps": 7, | |
| "done": true, | |
| "reward": 0.5468, | |
| "final_decision": "deny_claim", | |
| "agent_confidence": "MED", | |
| "calibration_score": 0.6, | |
| "decision_accuracy": 1.0, | |
| "fraud_detection_score": 0.75, | |
| "evidence_quality_score": 0.0 | |
| }, | |
| { | |
| "policy": "calibrated_scripted_investigator", | |
| "task_id": "distribution_shift_claim", | |
| "seed": 42, | |
| "steps": 8, | |
| "done": true, | |
| "reward": 0.3522, | |
| "final_decision": "escalate_to_human", | |
| "agent_confidence": "LOW", | |
| "calibration_score": 0.1, | |
| "decision_accuracy": 1.0, | |
| "fraud_detection_score": 0.0, | |
| "evidence_quality_score": 0.0 | |
| } | |
| ] | |
| } |