# debatefloor / openenv.yaml
spec_version: 1
name: debatefloor_insurance_calibration_env
type: space
runtime: fastapi
app: app.main:app
port: 7860
# ClaimCourt (codename: DebateFloor) additions — required for GRPO parallel rollouts
supports_concurrent_sessions: true
max_concurrent_envs: 64
confidence_required: true
procedural_generation: true
episode_pool_size: 500
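# Illustrative seed arithmetic for the pool above (the variant scheme is an
# assumption, not taken from the environment code):
#   5 fraud types x 4 coverage types x 3 jurisdictions = 60 base templates;
#   ceil(500 / 60) = 9 seed variants per template fill the 500-episode pool.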
metadata:
  title: ClaimCourt Insurance Calibration RL Environment (codename DebateFloor)
  description: >-
    An OpenEnv training environment where agents must make insurance claim
    decisions AND declare calibrated confidence (HIGH/MED/LOW) simultaneously.
    Based on the CoCA framework (arXiv:2603.05881). Rewards agents that know
    when they don't know, penalising overconfidence harder than wrong answers.
    Procedurally generates 500+ unique episodes via seed variation across
    5 fraud types x 4 coverage types x 3 jurisdictions.
  tags:
    - openenv
    - insurance
    - fraud-detection
    - calibration
    - uncertainty-quantification
    - grpo
    - rl-training
    - world-modeling-professional # hackathon theme #3.1 (primary)
    - multi-agent # hackathon theme #1 (secondary, via Court Panel)
  hackathon_themes:
    primary: "Theme 3.1 — World Modeling (Professional Tasks)"
    secondary: "Theme 1 — Multi-Agent Interactions (Court Panel)"
  citation: "CoCA: arXiv:2603.05881 — Co-optimising Confidence and Accuracy via GRPO"
tasks:
  - id: clean_claim
    difficulty: easy
    max_steps: 10
    objective: >-
      Validate a legitimate insurance claim. All documents are in order.
      Correct decision: approve_claim with HIGH confidence.
      HIGH confidence on a clean claim is rewarded at 1.0; LOW confidence on
      the same correct approval earns only 0.1.
  - id: contradictory_claim
    difficulty: medium
    max_steps: 18
    objective: >-
      Detect fraud signals in a claim with contradictory documents
      (medical_inflation, staged_accident, or identity_fraud type). Call
      validate_document and flag_fraud_signal before deciding.
      Correct decision: deny_claim. Appropriate confidence: MED.
      Overconfident denial (HIGH) without sufficient signals is penalised.
  - id: distribution_shift_claim
    difficulty: hard
    max_steps: 28
    objective: >-
      Investigate a coordinated fraud ring or phantom provider. The claim looks
      clean on the surface. The agent must call query_historical_data or
      query_linked_claim to discover cross-claim signals.
      Correct decision: escalate_to_human with LOW confidence.
      HIGH confidence on this task is always wrong; the environment is
      designed to penalise it.
  - id: coordinated_fraud
    difficulty: hard
    max_steps: 22
    objective: >-
      Investigate a coordinated fraud ring. Multiple linked claims share an
      emergency contact and broker. The agent must call query_linked_claim to
      discover cross-claim signals.
      Correct decision: escalate_to_human or request_investigation with LOW
      confidence.
  - id: identity_fraud
    difficulty: medium
    max_steps: 20
    objective: >-
      Detect identity fraud. The claimant's identity does not match policy
      records. The agent must call verify_identity to reveal the mismatch.
      Correct decision: deny_claim with MED confidence.
action_space:
  # Investigative actions (non-terminal, confidence not required)
  - validate_document
  - flag_fraud_signal
  - request_information
  - lookup_policy_history
  - compare_documents
  - query_historical_data
  - query_linked_claim           # coordinated_ring only
  - verify_identity              # identity_fraud only
  - verify_provider_registration # phantom_provider only
  - estimate_payout
  - convene_debate_panel
  # Terminal actions (confidence REQUIRED: HIGH | MED | LOW)
  - approve_claim
  - deny_claim
  - request_investigation
  - escalate_to_human
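# Illustrative terminal-step payload (field names assumed, not a verified
# request schema):
#   action: deny_claim
#   confidence: MED
# Investigative actions take no confidence field; because confidence_required
# is true, terminal actions must carry one of HIGH | MED | LOW.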
observation_space:
  - claim_id
  - task_id
  - claimant
  - incident
  - documents
  - linked_claims
  - action_history
  - available_actions
  - step_number
  - max_steps
  - flags_raised
  - discovered_signals
  - status
  - message
  - confidence_required # always true in DebateFloor
  - rubric_reward
  - rubric_components
  - reward_breakdown
calibration:
  matrix:
    HIGH_correct: 1.0
    HIGH_wrong: -0.8
    MED_correct: 0.6
    MED_wrong: -0.2
    LOW_correct: 0.1
    LOW_wrong: 0.0
  anti_gaming:
    low_threshold: 0.70  # >70% LOW across episodes triggers penalty
    high_threshold: 0.80 # >80% HIGH across episodes triggers penalty
    min_history: 10      # minimum episodes before gaming detection fires
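# Worked reading of the matrix above: HIGH on a wrong decision scores -0.8,
# while LOW on the same wrong decision scores 0.0, so overconfidence costs
# more than the error itself. Blanket LOW is still caught: once at least
# min_history (10) episodes accumulate, a LOW rate above low_threshold (70%)
# triggers the anti-gaming penalty.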
reward:
  training_reward: simple_scalar    # use for GRPO — stable gradients
  evaluation_reward: six_component  # use for demo and reporting only
  never_mix: true                   # CRITICAL — compound rewards break GRPO
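# One reading of never_mix (interpretation, not quoted from the docs): GRPO
# advantages are computed from training_reward alone; rubric_reward and
# rubric_components in the observation feed evaluation dashboards, never the
# optimisation signal.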