---
# ClaimCourt (codename: DebateFloor) — OpenEnv Space manifest for an
# insurance-claim calibration RL environment. Reconstructed into valid
# block-style YAML (the source had been collapsed onto single lines).
spec_version: 1
name: debatefloor_insurance_calibration_env
type: space
runtime: fastapi
app: app.main:app
port: 7860

# ClaimCourt (codename: DebateFloor) additions — required for GRPO parallel rollouts
supports_concurrent_sessions: true
max_concurrent_envs: 64
confidence_required: true
procedural_generation: true
episode_pool_size: 500

metadata:
  title: ClaimCourt — Insurance Calibration RL Environment (codename DebateFloor)
  description: >-
    An OpenEnv training environment where agents must make insurance claim
    decisions AND declare calibrated confidence (HIGH/MED/LOW) simultaneously.
    Based on CoCA framework (arXiv:2603.05881). Rewards agents that know when
    they don't know — penalising overconfidence harder than wrong answers.
    Procedurally generates 500+ unique episodes via seed variation across
    5 fraud types x 4 coverage types x 3 jurisdictions.
  tags:
    - openenv
    - insurance
    - fraud-detection
    - calibration
    - uncertainty-quantification
    - grpo
    - rl-training
    - world-modeling-professional  # hackathon theme #3.1 (primary)
    - multi-agent  # hackathon theme #1 (secondary, via Court Panel)
  hackathon_themes:
    primary: "Theme 3.1 — World Modeling (Professional Tasks)"
    secondary: "Theme 1 — Multi-Agent Interactions (Court Panel)"
  citation: "CoCA: arXiv:2603.05881 — Co-optimising Confidence and Accuracy via GRPO"

tasks:
  - id: clean_claim
    difficulty: easy
    max_steps: 10
    objective: >-
      Validate a legitimate insurance claim. All documents are in order.
      Correct decision: approve_claim with HIGH confidence. HIGH confidence
      on a clean claim is rewarded at 1.0. LOW confidence penalised at 0.1.
  - id: contradictory_claim
    difficulty: medium
    max_steps: 18
    objective: >-
      Detect fraud signals in a claim with contradictory documents
      (medical_inflation, staged_accident, or identity_fraud type).
      Call validate_document and flag_fraud_signal before deciding.
      Correct decision: deny_claim. Appropriate confidence: MED.
      Overconfident denial (HIGH) without sufficient signals is penalised.
  - id: distribution_shift_claim
    difficulty: hard
    max_steps: 28
    objective: >-
      Investigate a coordinated fraud ring or phantom provider. Claim looks
      clean on surface. Agent must call query_historical_data or
      query_linked_claim to discover cross-claim signals. Correct decision:
      escalate_to_human with LOW confidence. HIGH confidence on this task is
      always wrong — the environment is designed to penalise it.
  - id: coordinated_fraud
    difficulty: hard
    max_steps: 22
    objective: >-
      Investigate a coordinated fraud ring. Multiple linked claims share
      emergency contact and broker. Agent must call query_linked_claim to
      discover cross-claim signals. Correct decision: escalate_to_human or
      request_investigation with LOW confidence.
  - id: identity_fraud
    difficulty: medium
    max_steps: 20
    objective: >-
      Detect identity fraud. Claimant identity does not match policy records.
      Agent must call verify_identity to reveal the mismatch.
      Correct decision: deny_claim with MED confidence.

action_space:
  # Investigative actions (non-terminal, confidence not required)
  - validate_document
  - flag_fraud_signal
  - request_information
  - lookup_policy_history
  - compare_documents
  - query_historical_data
  - query_linked_claim  # coordinated_ring only
  - verify_identity  # identity_fraud only
  - verify_provider_registration  # phantom_provider only
  - estimate_payout
  - convene_debate_panel
  # Terminal actions (confidence REQUIRED: HIGH | MED | LOW)
  - approve_claim
  - deny_claim
  - request_investigation
  - escalate_to_human

observation_space:
  - claim_id
  - task_id
  - claimant
  - incident
  - documents
  - linked_claims
  - action_history
  - available_actions
  - step_number
  - max_steps
  - flags_raised
  - discovered_signals
  - status
  - message
  - confidence_required  # always true in DebateFloor
  - rubric_reward
  - rubric_components
  - reward_breakdown

calibration:
  # Reward matrix keyed by declared confidence x decision correctness.
  matrix:
    HIGH_correct: 1.0
    HIGH_wrong: -0.8
    MED_correct: 0.6
    MED_wrong: -0.2
    LOW_correct: 0.1
    LOW_wrong: 0.0
  anti_gaming:
    low_threshold: 0.70  # >70% LOW across episodes triggers penalty
    high_threshold: 0.80  # >80% HIGH across episodes triggers penalty
    min_history: 10  # minimum episodes before gaming detection fires

reward:
  training_reward: simple_scalar  # use for GRPO — stable gradients
  evaluation_reward: six_component  # use for demo and reporting only
  never_mix: true  # CRITICAL — compound rewards break GRPO