File size: 4,945 Bytes
d174311
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4770478
 
 
 
 
d174311
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# Manifest header for the ClaimCourt / DebateFloor environment: identity of
# the environment plus the settings describing how the app is served.
spec_version: 1
name: debatefloor_insurance_calibration_env
type: space
runtime: fastapi
# Presumably a "<module path>:<attribute>" entry point (confirm against the
# consumer); quoted so the embedded colon can never be read as a separator.
app: 'app.main:app'
port: 7860

# ClaimCourt (codename: DebateFloor) additions — required for GRPO parallel rollouts
supports_concurrent_sessions: true
max_concurrent_envs: 64
confidence_required: true
procedural_generation: true
episode_pool_size: 500

# Descriptive metadata for the environment: display title, long description,
# search tags, hackathon theme mapping and the paper citation.
metadata:
  title: ClaimCourt — Insurance Calibration RL Environment (codename DebateFloor)
  # Folded scalar (>-): the line breaks below are joined with single spaces and
  # the trailing newline is stripped, so the value is one paragraph of text.
  description: >-
    An OpenEnv training environment where agents must make insurance claim decisions
    AND declare calibrated confidence (HIGH/MED/LOW) simultaneously.
    Based on CoCA framework (arXiv:2603.05881). Rewards agents that know when they
    don't know — penalising overconfidence harder than wrong answers.
    Procedurally generates 500+ unique episodes via seed variation across
    5 fraud types x 4 coverage types x 3 jurisdictions.
  tags:
    - openenv
    - insurance
    - fraud-detection
    - calibration
    - uncertainty-quantification
    - grpo
    - rl-training
    # hackathon theme #3.1 (primary)
    - world-modeling-professional
    # hackathon theme #1 (secondary, via Court Panel)
    - multi-agent
  # Quoted because the values contain an em-dash and parentheses; the citation
  # also contains ": ", which would otherwise be parsed as a nested mapping.
  hackathon_themes:
    primary: "Theme 3.1 — World Modeling (Professional Tasks)"
    secondary: "Theme 1 — Multi-Agent Interactions (Court Panel)"
  citation: "CoCA: arXiv:2603.05881 — Co-optimising Confidence and Accuracy via GRPO"

# Task catalogue. Each entry carries an id, a difficulty label, a step budget
# (max_steps) and a prose objective that states the expected terminal decision
# and confidence level. Objectives are folded scalars (>-), so comments must
# stay outside their bodies or they would become part of the text.
tasks:
  # Easy: legitimate claim, expected approve_claim with HIGH confidence.
  - id: clean_claim
    difficulty: easy
    max_steps: 10
    objective: >-
      Validate a legitimate insurance claim. All documents are in order.
      Correct decision: approve_claim with HIGH confidence.
      HIGH confidence on a clean claim is rewarded at 1.0. LOW confidence penalised at 0.1.

  # Medium: contradictory documents, expected deny_claim with MED confidence.
  - id: contradictory_claim
    difficulty: medium
    max_steps: 18
    objective: >-
      Detect fraud signals in a claim with contradictory documents (medical_inflation,
      staged_accident, or identity_fraud type). Call validate_document and flag_fraud_signal
      before deciding. Correct decision: deny_claim. Appropriate confidence: MED.
      Overconfident denial (HIGH) without sufficient signals is penalised.

  # Hard: clean-looking claim, expected escalate_to_human with LOW confidence.
  - id: distribution_shift_claim
    difficulty: hard
    max_steps: 28
    objective: >-
      Investigate a coordinated fraud ring or phantom provider. Claim looks clean on surface.
      Agent must call query_historical_data or query_linked_claim to discover cross-claim signals.
      Correct decision: escalate_to_human with LOW confidence.
      HIGH confidence on this task is always wrong — the environment is designed to penalise it.

  # Hard: linked claims, expected escalation or investigation with LOW confidence.
  - id: coordinated_fraud
    difficulty: hard
    max_steps: 22
    objective: >-
      Investigate a coordinated fraud ring. Multiple linked claims share emergency contact
      and broker. Agent must call query_linked_claim to discover cross-claim signals.
      Correct decision: escalate_to_human or request_investigation with LOW confidence.

  # Medium: identity mismatch, expected deny_claim with MED confidence.
  - id: identity_fraud
    difficulty: medium
    max_steps: 20
    objective: >-
      Detect identity fraud. Claimant identity does not match policy records.
      Agent must call verify_identity to reveal the mismatch.
      Correct decision: deny_claim with MED confidence.

# Names of every action an agent may issue, grouped into investigative
# (non-terminal) actions followed by terminal actions.
action_space:
  # Investigative actions (non-terminal, confidence not required)
  - validate_document
  - flag_fraud_signal
  - request_information
  - lookup_policy_history
  - compare_documents
  - query_historical_data
  # coordinated_ring only
  # NOTE(review): "coordinated_ring" matches no task id in this file; the
  # objectives of distribution_shift_claim and coordinated_fraud both mention
  # this action — presumably a fraud-type name, confirm.
  - query_linked_claim
  # identity_fraud only
  - verify_identity
  # phantom_provider only
  # NOTE(review): "phantom_provider" is likewise not a task id here — confirm.
  - verify_provider_registration
  - estimate_payout
  - convene_debate_panel
  # Terminal actions (confidence REQUIRED: HIGH | MED | LOW)
  - approve_claim
  - deny_claim
  - request_investigation
  - escalate_to_human

# Field names listed for the observation; only the names are declared here,
# not their types or shapes.
observation_space:
  - claim_id
  - task_id
  - claimant
  - incident
  - documents
  - linked_claims
  - action_history
  - available_actions
  - step_number
  - max_steps
  - flags_raised
  - discovered_signals
  - status
  - message
  # always true in DebateFloor
  - confidence_required
  - rubric_reward
  - rubric_components
  - reward_breakdown

# Confidence-calibration scoring parameters.
calibration:
  # One value per <CONFIDENCE>_<correct|wrong> pair. The clean_claim objective
  # cites the 1.0 (HIGH) and 0.1 (LOW) figures for a correct decision.
  matrix:
    HIGH_correct: 1.0
    HIGH_wrong: -0.8
    MED_correct: 0.6
    MED_wrong: -0.2
    LOW_correct: 0.1
    LOW_wrong: 0.0
  # Thresholds for penalising a confidence distribution skewed to one level.
  anti_gaming:
    # >70% LOW across episodes triggers penalty
    low_threshold: 0.70
    # >80% HIGH across episodes triggers penalty
    high_threshold: 0.80
    # minimum episodes before gaming detection fires
    min_history: 10

# Selects which reward signal is used for training versus evaluation.
reward:
  # use for GRPO — stable gradients
  training_reward: simple_scalar
  # use for demo and reporting only
  evaluation_reward: six_component
  # CRITICAL — compound rewards break GRPO
  never_mix: true