Spaces:
Running
Running
| """ | |
| inference_debatefloor.py | |
| DebateFloor — Baseline Agent | |
| Runs all 3 tasks against the DebateFloor environment over HTTP. | |
| Declares calibrated confidence (HIGH/MED/LOW) on every terminal action. | |
| MANDATORY STDOUT FORMAT — do not change: | |
| [START] task=<task_id> env=debatefloor model=<model> confidence_required=true | |
| [STEP] step=<n> action=<action_type> reward=<r> confidence=<conf|null> done=<bool> error=<msg|None> | |
| [END] success=<bool> steps=<n> total_reward=<r> calibration_score=<s> decision=<correct|wrong|none> | |
| Usage: | |
| python inference_debatefloor.py --task contradictory_claim --model gpt-4o | |
| python inference_debatefloor.py --all-tasks --seed 42 --base-url http://localhost:7860 | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import os | |
| import sys | |
| import time | |
| from typing import Any, Dict, List, Optional | |
| import requests | |
| # ───────────────────────────────────────────────────────────── | |
| # CONFIGURATION | |
| # ───────────────────────────────────────────────────────────── | |
| DEFAULT_BASE_URL = "http://localhost:7860" | |
| DEFAULT_MODEL = os.getenv("MODEL_NAME", "gpt-4o") | |
| HF_TOKEN = os.getenv("HF_TOKEN", "") | |
| API_BASE_URL = os.getenv("API_BASE_URL", "https://api.openai.com/v1") | |
| # Task configuration | |
| TASK_CONFIG = { | |
| "clean_claim": { | |
| "terminal_confidence": "HIGH", # obvious approval → HIGH confidence | |
| "strategy": "approve", | |
| }, | |
| "contradictory_claim": { | |
| "terminal_confidence": "MED", # fraud detected but some uncertainty → MED | |
| "strategy": "deny", | |
| }, | |
| "distribution_shift_claim": { | |
| "terminal_confidence": "MED", # NEW-7: 4 grounded signals + ground_truth_confidence=0.70 → MED | |
| "strategy": "escalate", # canonical decision must be escalate_to_human (env normalises to request_investigation) | |
| }, | |
| "coordinated_fraud": { | |
| "terminal_confidence": "MED", # ground_truth_confidence=0.90, ring scope partly unknown → MED | |
| "strategy": "escalate", # canonical decision must be escalate_to_human (env normalises to request_investigation) | |
| }, | |
| "identity_fraud": { | |
| "terminal_confidence": "MED", # 4 grounded signals + ground_truth_confidence=0.90 → MED (ID forgery never 100% certain) | |
| "strategy": "deny", | |
| }, | |
| } | |
| ALL_TASKS = list(TASK_CONFIG.keys()) | |
| # ───────────────────────────────────────────────────────────── | |
| # HTTP CLIENT | |
| # ───────────────────────────────────────────────────────────── | |
| class DebateFloorClient: | |
| def __init__(self, base_url: str = DEFAULT_BASE_URL): | |
| self.base_url = base_url.rstrip("/") | |
| self.session_id: Optional[str] = None | |
| def health(self) -> Dict: | |
| return requests.get(f"{self.base_url}/health", timeout=10).json() | |
| def reset(self, task_id: str, seed: int = 42) -> Dict: | |
| r = requests.post( | |
| f"{self.base_url}/reset", | |
| json={"task_id": task_id, "seed": seed}, | |
| timeout=15, | |
| ) | |
| r.raise_for_status() | |
| data = r.json() | |
| self.session_id = data.get("session_id") | |
| return data | |
| def step(self, action: Dict[str, Any]) -> Dict: | |
| if not self.session_id: | |
| raise RuntimeError("No active session. Call reset() first.") | |
| r = requests.post( | |
| f"{self.base_url}/step", | |
| json={"action": action, "session_id": self.session_id}, | |
| timeout=15, | |
| ) | |
| r.raise_for_status() | |
| return r.json() | |
| # ───────────────────────────────────────────────────────────── | |
| # DETERMINISTIC AGENT STRATEGIES | |
| # Each strategy is a scripted sequence of actions. In production | |
| # you'd replace this with LLM completions. This baseline | |
| # demonstrates the confidence declaration mechanic clearly. | |
| # ───────────────────────────────────────────────────────────── | |
| def _strategy_clean_claim(client: DebateFloorClient, obs: Dict) -> List[Dict]: | |
| """Validate key documents, estimate payout (variant-aware), approve with HIGH. | |
| CF-4 fix: read declared_cost_inr / estimate_inr from the observation so the | |
| payout estimate falls inside the per-variant payout_band. With the previous | |
| hardcoded amount=150000, payout_accuracy was 0 for every variant; reading | |
| the variant value pushes payout_accuracy to 1.0 AND lets the per-variant | |
| band drift be reflected in evidence/reasoning text. See PLAN.md > CF-4. | |
| """ | |
| observation = obs.get("observation", obs) | |
| docs = observation.get("documents", []) | |
| actions = [] | |
| for doc in docs[:2]: | |
| actions.append({ | |
| "action_type": "validate_document", | |
| "parameters": {"doc_id": doc["doc_id"]}, | |
| "reasoning": ( | |
| f"Verify document {doc.get('doc_id', '?')} " | |
| f"({doc.get('doc_type', 'unknown')}) before approving." | |
| ), | |
| }) | |
| declared_cost = None | |
| estimate = None | |
| for doc in docs: | |
| meta = doc.get("metadata", {}) or {} | |
| if declared_cost is None and "declared_cost_inr" in meta: | |
| declared_cost = float(meta["declared_cost_inr"]) | |
| if estimate is None and "estimate_inr" in meta: | |
| estimate = float(meta["estimate_inr"]) | |
| payout_amount = estimate if estimate is not None else (declared_cost if declared_cost is not None else 50000.0) | |
| actions.append({ | |
| "action_type": "estimate_payout", | |
| "parameters": {"amount_inr": payout_amount}, | |
| "reasoning": ( | |
| f"Use estimate INR {payout_amount:,.0f} read from doc metadata " | |
| f"(declared INR {declared_cost:,.0f})." | |
| if declared_cost is not None | |
| else f"Use estimate INR {payout_amount:,.0f} (no declared cost in docs)." | |
| ), | |
| }) | |
| approve_reason_parts = ["All documents verified", "no fraud signals"] | |
| if declared_cost is not None: | |
| approve_reason_parts.append(f"declared cost INR {declared_cost:,.0f}") | |
| if estimate is not None and estimate != declared_cost: | |
| approve_reason_parts.append(f"garage estimate INR {estimate:,.0f}") | |
| approve_reason = ". ".join(approve_reason_parts) + ". Clean claim approved." | |
| actions.append({ | |
| "action_type": "approve_claim", | |
| "confidence": "HIGH", | |
| "parameters": {"reason": approve_reason}, | |
| "reasoning": "Clean claim with consistent variant-specific values — HIGH confidence justified.", | |
| }) | |
| return actions | |
| def _strategy_contradictory_claim(client: DebateFloorClient, obs: Dict) -> List[Dict]: | |
| """Investigate document contradictions, flag signals, deny with MED. | |
| CF-4 fix: cite per-variant incident_date / admission_date / claimed_cost / | |
| standard_rate values from the observation so evidence text reflects what | |
| the variant actually shipped. The flag_id keywords still trigger | |
| get_evidence_keyword_hints(), so signal scoring is preserved. | |
| """ | |
| observation = obs.get("observation", obs) | |
| docs = observation.get("documents", []) | |
| actions = [] | |
| for doc in docs[:3]: | |
| actions.append({ | |
| "action_type": "validate_document", | |
| "parameters": {"doc_id": doc["doc_id"]}, | |
| "reasoning": ( | |
| f"Validate {doc.get('doc_id', '?')} " | |
| f"({doc.get('doc_type', 'unknown')}) — looking for cross-doc contradictions." | |
| ), | |
| }) | |
| actions.append({ | |
| "action_type": "query_historical_data", | |
| "parameters": {}, | |
| "reasoning": "Check for prior similar claims that could indicate pattern fraud.", | |
| }) | |
| incident_date = None | |
| admission_date = None | |
| claimed_cost = None | |
| standard_rate = None | |
| for doc in docs: | |
| meta = doc.get("metadata", {}) or {} | |
| if incident_date is None and "incident_date" in meta: | |
| incident_date = meta["incident_date"] | |
| if admission_date is None and "admission_date" in meta: | |
| admission_date = meta["admission_date"] | |
| if claimed_cost is None and "claimed_cost_inr" in meta: | |
| claimed_cost = meta["claimed_cost_inr"] | |
| if standard_rate is None and "standard_rate_inr" in meta: | |
| standard_rate = meta["standard_rate_inr"] | |
| if incident_date and admission_date: | |
| date_evidence = ( | |
| f"Claim form records incident date {incident_date} but hospital " | |
| f"admission documented on {admission_date} — date mismatch confirmed " | |
| "across documents." | |
| ) | |
| else: | |
| date_evidence = ( | |
| "Claim form incident date does not match hospital admission record — " | |
| "date mismatch confirmed across documents." | |
| ) | |
| if claimed_cost is not None and standard_rate is not None and standard_rate: | |
| ratio = float(claimed_cost) / float(standard_rate) | |
| cost_evidence = ( | |
| f"Hospital bill INR {claimed_cost:,} is {ratio:.2f}x the regional " | |
| f"standard cost of INR {standard_rate:,} — cost inflation pattern " | |
| "indicating overbilled charges." | |
| ) | |
| else: | |
| cost_evidence = ( | |
| "Hospital bill rate is approximately 2.4 times the regional standard " | |
| "cost — cost inflation pattern indicating overbilled charges." | |
| ) | |
| actions.append({ | |
| "action_type": "flag_fraud_signal", | |
| "parameters": {"flag_id": "date_mismatch", "evidence": date_evidence}, | |
| "reasoning": "Date inconsistency between claim form and admission record is a grounded fraud indicator.", | |
| }) | |
| actions.append({ | |
| "action_type": "flag_fraud_signal", | |
| "parameters": {"flag_id": "cost_inflation", "evidence": cost_evidence}, | |
| "reasoning": "Inflated cost versus benchmark suggests billing fraud.", | |
| }) | |
| # Convene debate panel — adversarial review before terminal decision | |
| actions.append({ | |
| "action_type": "convene_debate_panel", | |
| "parameters": {}, | |
| "reasoning": "Contradictory evidence warrants adversarial review. Panel will pressure-test fraud signals.", | |
| }) | |
| # Terminal: deny with MED confidence (evidence found but some uncertainty remains) | |
| actions.append({ | |
| "action_type": "deny_claim", | |
| "confidence": "MED", | |
| "parameters": {"reason": "Date mismatch and cost inflation confirmed across documents. Fraud signals grounded in evidence."}, | |
| "reasoning": "Sufficient evidence to deny, but complex case warrants MED not HIGH confidence.", | |
| }) | |
| return actions | |
| def _strategy_distribution_shift_claim(client: DebateFloorClient, obs: Dict) -> List[Dict]: | |
| """Distribution-shift ring — uses the NEW-7 discovery hooks added to the | |
| environment so this task can finally earn evidence credit. | |
| Env discovery contract (post NEW-7 fix; see app/environment.py and | |
| app/tasks.py:get_evidence_keyword_hints): | |
| validate_document(DOC-41) → records recent_policy_cluster | |
| validate_document(DOC-42) → records shared_repair_shop_far | |
| query_linked_claim(CLM-DIST-602), then (CLM-DIST-603) → CLM-DIST-604 | |
| surfaces; on the 2nd query the shared emergency_contact is detected | |
| across queried claims → records shared_emergency_contact; the broker | |
| check fires for any CLM-DIST-* once 2+ claims have been queried → | |
| records clustered_policy_broker. | |
| near_identical_descriptions has no doc-level discovery hook for this | |
| task (the task's primary docs do not contain the cross-claim | |
| narrative), so we skip flagging it — symmetric to coordinated_fraud | |
| which skips shared_emergency_contact for the same reason. | |
| Result: 4 of 5 expected_signals discovered + flagged with grounded | |
| evidence. evidence_quality = evidence_hits / evidence_total = 4/4 = 1.0. | |
| """ | |
| actions: List[Dict] = [] | |
| # 1. Validate the two documents whose signals are auto-recorded | |
| actions.append({ | |
| "action_type": "validate_document", | |
| "parameters": {"doc_id": "DOC-41"}, | |
| "reasoning": "Validate claim form — surfaces recent_policy_cluster from claim_date metadata.", | |
| }) | |
| actions.append({ | |
| "action_type": "validate_document", | |
| "parameters": {"doc_id": "DOC-42"}, | |
| "reasoning": "Validate garage estimate — exposes FastRepair Hub Whitefield (shared shop).", | |
| }) | |
| # 2. Query historical data — confirms the policy purchase cluster context. | |
| actions.append({ | |
| "action_type": "query_historical_data", | |
| "parameters": {}, | |
| "reasoning": "Pull policy history — corroborates 24-day policy age inside the cluster window.", | |
| }) | |
| # 3. Query the two visible linked claims. After the 2nd query the env | |
| # auto-records shared_emergency_contact + clustered_policy_broker | |
| # (NEW-7 hooks) and surfaces the hidden CLM-DIST-604. | |
| for cid in ("CLM-DIST-602", "CLM-DIST-603"): | |
| actions.append({ | |
| "action_type": "query_linked_claim", | |
| "parameters": {"claim_id": cid}, | |
| "reasoning": f"Query {cid} to expose the cross-claim contact/broker overlap.", | |
| }) | |
| # 4. Query the now-surfaced 4th claim — strengthens the broker cluster | |
| # and confirms the shared shop / contact pattern. | |
| actions.append({ | |
| "action_type": "query_linked_claim", | |
| "parameters": {"claim_id": "CLM-DIST-604"}, | |
| "reasoning": "Query the newly-surfaced fourth claim — confirms full ring scope.", | |
| }) | |
| # 5. Flag four of five expected_signals with evidence containing the | |
| # keywords required by app.tasks.get_evidence_keyword_hints | |
| # ("distribution_shift_claim", ...). | |
| actions.append({ | |
| "action_type": "flag_fraud_signal", | |
| "parameters": { | |
| "flag_id": "shared_repair_shop_far", | |
| "evidence": "All linked claims used the same repair shop FastRepair Hub Whitefield — geographic ring indicator.", | |
| }, | |
| "reasoning": "Shared distant repair shop is a grounded geographic ring indicator.", | |
| }) | |
| actions.append({ | |
| "action_type": "flag_fraud_signal", | |
| "parameters": { | |
| "flag_id": "shared_emergency_contact", | |
| "evidence": "All queried claims share the same emergency contact phone +91-9000005555 — coordinated contact ring.", | |
| }, | |
| "reasoning": "Shared emergency contact across 3 supposedly unrelated claims is a strong ring indicator.", | |
| }) | |
| actions.append({ | |
| "action_type": "flag_fraud_signal", | |
| "parameters": { | |
| "flag_id": "recent_policy_cluster", | |
| "evidence": "All four related policies were purchased within a 30 day cluster window before the incident — policy purchase cluster.", | |
| }, | |
| "reasoning": "Tight policy purchase cluster is a temporal ring indicator.", | |
| }) | |
| actions.append({ | |
| "action_type": "flag_fraud_signal", | |
| "parameters": { | |
| "flag_id": "clustered_policy_broker", | |
| "evidence": "All queried claims share the same broker BRK-882 — policy broker cluster confirmed across 4 claims.", | |
| }, | |
| "reasoning": "Same broker across 4 supposedly unrelated policies = coordinated issuance.", | |
| }) | |
| # 6. Adversarial review before terminal action | |
| actions.append({ | |
| "action_type": "convene_debate_panel", | |
| "parameters": {}, | |
| "reasoning": "Cross-claim ring of 4 demands adversarial review before recommending investigation.", | |
| }) | |
| # 7. Terminal: escalate_to_human MED. ground_truth_confidence=0.70 + | |
| # 4 grounded signals → MED is the calibrated answer (LOW would | |
| # underclaim given the strength of the evidence; HIGH would | |
| # overclaim given the residual uncertainty about the full ring scope). | |
| actions.append({ | |
| "action_type": "escalate_to_human", | |
| "confidence": "MED", | |
| "parameters": {"reason": "Ring of 4 linked claims with shared shop/broker/contact/policy cluster. Investigator should confirm full scope."}, | |
| "reasoning": "Strong multi-signal evidence; ring may extend beyond 4 claims, so MED not HIGH.", | |
| }) | |
| return actions | |
| def _strategy_coordinated_fraud(client: DebateFloorClient, obs: Dict) -> List[Dict]: | |
| """Coordinated ring — validate primary docs (records 3 signals), query 3 linked | |
| claims (surfaces hidden CLM-GROUP-304, records clustered_policy_broker), flag | |
| 4 of 5 expected_signals with grounded evidence, then escalate_to_human MED. | |
| Env discovery contract (see app/environment.py:600-636 and 361-417): | |
| validate_document(DOC-21) → records shared_repair_shop_far | |
| validate_document(DOC-22) → records near_identical_descriptions | |
| validate_document(DOC-23) → records recent_policy_cluster | |
| query_linked_claim(CLM-GROUP-302), then (CLM-GROUP-303) → CLM-GROUP-304 surfaces | |
| query_linked_claim(CLM-GROUP-304) → records clustered_policy_broker | |
| shared_emergency_contact has NO discovery path that auto-records the signal | |
| (only a hint string is returned), so flagging it would trigger the | |
| "raised before discovered" penalty (+0.08 penalty_total). We skip it. | |
| CF-4 fix: read variant-specific distance, template_similarity and | |
| days_since_purchase from doc metadata so flagged evidence cites the actual | |
| per-variant numbers. | |
| """ | |
| observation_cf = obs.get("observation", obs) | |
| docs_cf = observation_cf.get("documents", []) or [] | |
| distance_km = None | |
| template_similarity = None | |
| purchase_days = None | |
| for doc in docs_cf: | |
| meta = doc.get("metadata", {}) or {} | |
| if distance_km is None and "distance_km" in meta: | |
| distance_km = meta["distance_km"] | |
| if template_similarity is None and "template_similarity" in meta: | |
| template_similarity = meta["template_similarity"] | |
| if purchase_days is None and "days_since_purchase" in meta: | |
| purchase_days = meta["days_since_purchase"] | |
| actions: List[Dict] = [] | |
| # 1. Validate the three primary documents (each reveals one expected signal) | |
| for doc_id in ("DOC-21", "DOC-22", "DOC-23"): | |
| actions.append({ | |
| "action_type": "validate_document", | |
| "parameters": {"doc_id": doc_id}, | |
| "reasoning": f"Validate {doc_id} to surface the embedded ring indicator.", | |
| }) | |
| # 2. Query two known linked claims (surfaces the hidden CLM-GROUP-304) | |
| for cid in ("CLM-GROUP-302", "CLM-GROUP-303"): | |
| actions.append({ | |
| "action_type": "query_linked_claim", | |
| "parameters": {"claim_id": cid}, | |
| "reasoning": f"Query {cid} to expose cross-claim contact/broker overlap.", | |
| }) | |
| # 3. Query the now-surfaced 4th claim — this records clustered_policy_broker | |
| actions.append({ | |
| "action_type": "query_linked_claim", | |
| "parameters": {"claim_id": "CLM-GROUP-304"}, | |
| "reasoning": "Query the newly-surfaced fourth claim — confirms shared broker BRK-441.", | |
| }) | |
| # 4. Flag four of five expected_signals with evidence containing required keywords | |
| # (keywords from app.tasks.get_evidence_keyword_hints("coordinated_fraud", ...)) | |
| distance_text = f"{distance_km} km" if distance_km is not None else "340 km" | |
| sim_text = f"{template_similarity:.2f}" if isinstance(template_similarity, (int, float)) else "0.93" | |
| if isinstance(purchase_days, list) and purchase_days: | |
| cluster_text = ( | |
| f"All four related policies were purchased within a 30 day cluster " | |
| f"window before the incident (days since purchase: {purchase_days})." | |
| ) | |
| else: | |
| cluster_text = ( | |
| "All four related policies were purchased within a 30 day cluster window before the incident." | |
| ) | |
| actions.append({ | |
| "action_type": "flag_fraud_signal", | |
| "parameters": { | |
| "flag_id": "shared_repair_shop_far", | |
| "evidence": f"Repair shop RapidFix Motors in Kota is {distance_text} from incident site — implausible distance.", | |
| }, | |
| "reasoning": "Shared distant repair shop is a geographic ring indicator.", | |
| }) | |
| actions.append({ | |
| "action_type": "flag_fraud_signal", | |
| "parameters": { | |
| "flag_id": "near_identical_descriptions", | |
| "evidence": f"All linked claims use a near-identical narrative description template (similarity ~{sim_text}).", | |
| }, | |
| "reasoning": "Identical narrative templates indicate copy-pasted fraud.", | |
| }) | |
| actions.append({ | |
| "action_type": "flag_fraud_signal", | |
| "parameters": { | |
| "flag_id": "recent_policy_cluster", | |
| "evidence": cluster_text, | |
| }, | |
| "reasoning": "Tight policy purchase cluster is a temporal ring indicator.", | |
| }) | |
| actions.append({ | |
| "action_type": "flag_fraud_signal", | |
| "parameters": { | |
| "flag_id": "clustered_policy_broker", | |
| "evidence": "All queried claims share the same broker BRK-441 — policy broker cluster confirmed.", | |
| }, | |
| "reasoning": "Same broker across 4 supposedly unrelated policies = coordinated issuance.", | |
| }) | |
| # 5. Adversarial review before terminal action | |
| actions.append({ | |
| "action_type": "convene_debate_panel", | |
| "parameters": {}, | |
| "reasoning": "Cross-claim ring of 4 demands adversarial review before recommending investigation.", | |
| }) | |
| # 6. Terminal: escalate_to_human MED. Env normalises to request_investigation | |
| # (allowed_final_decisions=['request_investigation']) and the calibration | |
| # grader compares the raw escalate_to_human against ground truth | |
| # escalate_to_human (see app/environment.py:34-41, 441-446). | |
| actions.append({ | |
| "action_type": "escalate_to_human", | |
| "confidence": "MED", | |
| "parameters": {"reason": "Ring of 4 linked claims with shared shop/broker/policy cluster. Investigator should confirm full scope."}, | |
| "reasoning": "Strong evidence but ring may extend beyond 4 claims — MED is the calibrated answer.", | |
| }) | |
| return actions | |
| def _strategy_identity_fraud(client: DebateFloorClient, obs: Dict) -> List[Dict]: | |
| """Identity fraud — validate documents (records 2 signals), compare DOC-31 vs | |
| DOC-34 (records dob_inconsistency), lookup_policy_history (records | |
| recent_policy_purchase since policy_age_days=5 ≤ 30), flag all 4 | |
| expected_signals with grounded evidence, then deny_claim MED. | |
| Env discovery contract (see app/environment.py:228-264, 600-636, app/tasks.py:680-683): | |
| validate_document(DOC-31) → records identity_mismatch | |
| validate_document(DOC-32) → records hospital_no_record | |
| compare_documents(DOC-31, DOC-34) → records dob_inconsistency | |
| lookup_policy_history → records recent_policy_purchase (policy_age_days=5) | |
| CF-4 fix: pull per-variant `days_to_claim` from doc metadata so the | |
| recent_policy_purchase evidence reflects the actual variant value | |
| (5/7/3/8/6 days across the 5 variants). | |
| """ | |
| observation_id = obs.get("observation", obs) | |
| docs_id = observation_id.get("documents", []) or [] | |
| actions: List[Dict] = [] | |
| # 1. Validate the two documents whose signals are auto-recorded | |
| actions.append({ | |
| "action_type": "validate_document", | |
| "parameters": {"doc_id": "DOC-31"}, | |
| "reasoning": "Validate primary claim form — exposes ID/registry mismatch.", | |
| }) | |
| actions.append({ | |
| "action_type": "validate_document", | |
| "parameters": {"doc_id": "DOC-32"}, | |
| "reasoning": "Validate hospital record — confirms no patient match.", | |
| }) | |
| # 2. Compare DOC-31 vs DOC-34 — env's COMPARE_DOCUMENT_SIGNALS records dob_inconsistency | |
| actions.append({ | |
| "action_type": "compare_documents", | |
| "parameters": {"doc_id_a": "DOC-31", "doc_id_b": "DOC-34"}, | |
| "reasoning": "Compare claim form vs ID proof — reveals DOB inconsistency.", | |
| }) | |
| # 3. Policy history lookup — records recent_policy_purchase (policy_age_days=5 ≤ 30) | |
| actions.append({ | |
| "action_type": "lookup_policy_history", | |
| "parameters": {}, | |
| "reasoning": "Pull policy history — exposes recent inception inside the 30 day exclusion window.", | |
| }) | |
| # 4. Flag all four expected_signals with evidence containing required keywords | |
| # (keywords from app.tasks.get_evidence_keyword_hints("identity_fraud", ...)) | |
| actions.append({ | |
| "action_type": "flag_fraud_signal", | |
| "parameters": { | |
| "flag_id": "identity_mismatch", | |
| "evidence": "National identity registry returns no record matching policy holder ID suffix 7821 — registry mismatch.", | |
| }, | |
| "reasoning": "Identity registry mismatch is a grounded fraud indicator.", | |
| }) | |
| actions.append({ | |
| "action_type": "flag_fraud_signal", | |
| "parameters": { | |
| "flag_id": "hospital_no_record", | |
| "evidence": "Hospital admission record has no patient name found for the claimant on file.", | |
| }, | |
| "reasoning": "Hospital lookup confirms ghost claimant.", | |
| }) | |
| actions.append({ | |
| "action_type": "flag_fraud_signal", | |
| "parameters": { | |
| "flag_id": "dob_inconsistency", | |
| "evidence": "Date of birth on submitted ID (1988-04-15) does not match policy DOB (1986-11-22) — inconsistency mismatch.", | |
| }, | |
| "reasoning": "DOB drift across documents is a grounded identity-fraud signal.", | |
| }) | |
| days_to_claim = None | |
| for doc in docs_id: | |
| meta = doc.get("metadata", {}) or {} | |
| if "days_to_claim" in meta: | |
| days_to_claim = meta["days_to_claim"] | |
| break | |
| days_text = f"{days_to_claim} days" if days_to_claim is not None else "5 days" | |
| actions.append({ | |
| "action_type": "flag_fraud_signal", | |
| "parameters": { | |
| "flag_id": "recent_policy_purchase", | |
| "evidence": ( | |
| f"Policy inception was only {days_text} before incident date — " | |
| "well inside the 30 day exclusion window — recent policy purchase." | |
| ), | |
| }, | |
| "reasoning": "Suspiciously recent policy purchase is a grounded indicator.", | |
| }) | |
| # 5. Adversarial review before denial | |
| actions.append({ | |
| "action_type": "convene_debate_panel", | |
| "parameters": {}, | |
| "reasoning": "Four grounded signals warrant adversarial review before denial.", | |
| }) | |
| # 6. Terminal: deny_claim MED. Ground truth is deny_claim | |
| # (see app/environment.py:34-41) and allowed_final_decisions | |
| # includes deny_claim (app/tasks.py:488). | |
| actions.append({ | |
| "action_type": "deny_claim", | |
| "confidence": "MED", | |
| "parameters": {"reason": "Identity registry mismatch, hospital no-record, DOB drift, and recent policy inside exclusion window — claim cannot stand."}, | |
| "reasoning": "Strong multi-signal evidence; ID forgery is rarely provable to 100%, so MED not HIGH.", | |
| }) | |
| return actions | |
| STRATEGIES = { | |
| "clean_claim": _strategy_clean_claim, | |
| "contradictory_claim": _strategy_contradictory_claim, | |
| "distribution_shift_claim": _strategy_distribution_shift_claim, | |
| "coordinated_fraud": _strategy_coordinated_fraud, | |
| "identity_fraud": _strategy_identity_fraud, | |
| } | |
| # ───────────────────────────────────────────────────────────── | |
| # EPISODE RUNNER | |
| # ───────────────────────────────────────────────────────────── | |
| def run_episode(task_id: str, model: str, base_url: str, seed: int) -> Dict[str, Any]: | |
| client = DebateFloorClient(base_url) | |
| # Print mandatory [START] line | |
| print(f"[START] task={task_id} env=debatefloor model={model} confidence_required=true") | |
| # Reset environment | |
| reset_resp = client.reset(task_id=task_id, seed=seed) | |
| obs = reset_resp | |
| # Get scripted actions for this task | |
| strategy_fn = STRATEGIES.get(task_id) | |
| if not strategy_fn: | |
| print(f"[ERROR] No strategy for task '{task_id}'") | |
| return {} | |
| actions = strategy_fn(client, obs) | |
| total_reward = 0.0 | |
| calibration_score = None | |
| step_num = 0 | |
| last_done = False | |
| final_decision_correct = "none" | |
| for action in actions: | |
| if last_done: | |
| break | |
| step_num += 1 | |
| confidence = action.get("confidence", None) | |
| try: | |
| step_resp = client.step(action) | |
| except Exception as e: | |
| print(f"[STEP] step={step_num} action={action['action_type']} reward=0.0 confidence={confidence or 'null'} done=False error={e}") | |
| continue | |
| obs = step_resp | |
| reward = step_resp.get("reward", 0.0) | |
| done = step_resp.get("done", False) | |
| observation = step_resp.get("observation", {}) | |
| metadata = observation.get("metadata", {}) | |
| error = observation.get("metadata", {}).get("last_action_error") | |
| last_done = done | |
| # Extract calibration score on terminal actions | |
| if done and metadata.get("calibration_score") is not None: | |
| calibration_score = metadata["calibration_score"] | |
| total_reward = reward | |
| # Print mandatory [STEP] line | |
| print( | |
| f"[STEP] step={step_num} action={action['action_type']} " | |
| f"reward={reward:.2f} confidence={confidence or 'null'} " | |
| f"done={done} error={error}" | |
| ) | |
| # Determine if decision was correct | |
| if calibration_score is not None: | |
| final_decision_correct = "correct" if calibration_score >= 0.0 else "wrong" | |
| success = last_done and (calibration_score is not None) and (calibration_score >= 0.0) | |
| # Print mandatory [END] line | |
| print( | |
| f"[END] success={success} steps={step_num} total_reward={total_reward:.2f} " | |
| f"calibration_score={calibration_score if calibration_score is not None else 'N/A'} " | |
| f"decision={final_decision_correct}" | |
| ) | |
| return { | |
| "task_id": task_id, | |
| "success": success, | |
| "steps": step_num, | |
| "total_reward": total_reward, | |
| "calibration_score": calibration_score, | |
| "decision": final_decision_correct, | |
| } | |
| # ───────────────────────────────────────────────────────────── | |
| # CLI | |
| # ───────────────────────────────────────────────────────────── | |
| def main(): | |
| parser = argparse.ArgumentParser(description="DebateFloor baseline agent") | |
| parser.add_argument("--task", choices=ALL_TASKS + ["all"], default="contradictory_claim") | |
| parser.add_argument("--model", default=DEFAULT_MODEL) | |
| parser.add_argument("--base-url", default=DEFAULT_BASE_URL) | |
| parser.add_argument("--seed", type=int, default=42) | |
| parser.add_argument("--all-tasks", action="store_true") | |
| args = parser.parse_args() | |
| # Verify server is up | |
| client = DebateFloorClient(args.base_url) | |
| try: | |
| health = client.health() | |
| assert health.get("status") == "healthy" | |
| except Exception as e: | |
| print(f"[ERROR] Server not reachable at {args.base_url}: {e}", file=sys.stderr) | |
| sys.exit(1) | |
| tasks_to_run = ALL_TASKS if (args.all_tasks or args.task == "all") else [args.task] | |
| results = [] | |
| for task_id in tasks_to_run: | |
| result = run_episode(task_id, args.model, args.base_url, args.seed) | |
| results.append(result) | |
| if len(tasks_to_run) > 1: | |
| print() # blank line between tasks | |
| if len(results) > 1: | |
| print("\n-- Summary --") | |
| for r in results: | |
| cs = r.get("calibration_score") | |
| print( | |
| f" {r['task_id']}: reward={r['total_reward']:.2f} " | |
| f"calibration={cs if cs is not None else 'N/A'} " | |
| f"decision={r['decision']}" | |
| ) | |
| if __name__ == "__main__": | |
| main() | |