Spaces:

Draken1606
/

undertrial-ai

Running

App Files Files Community

undertrial-ai / openenv.yaml

Draken1606

3-level

2c93c00 7 days ago

raw

history blame contribute delete

5.74 kB

	# Package metadata for the undertrial-ai environment manifest.
	name: undertrial-ai
	# Quoted so the version stays a string instead of being type-inferred.
	version: "1.1.0"
	# Folded scalar: the indented lines below are joined into a single paragraph.
	# The body must be indented deeper than the key, otherwise the scalar is empty.
	description: >
	  OpenEnv-compliant RL training environment for Indian bail decision support
	  with adaptive self-improvement (Theme 4). An LLM agent reads High Court bail
	  cases, invokes legal tools, and submits structured bail recommendations.
	  Reward computed deterministically against real HC judgments with an explicit
	  bias penalty (lambda=0.3). Features performance-aware episode selection,
	  stage-gated curriculum promotion, and synthetic case generation.

	# Authorship, licensing, and where the source and hosted Space live.
	author: Draken1606
	license: MIT
	repository: https://github.com/Faiz-1606/Undertrial
	space: https://huggingface.co/spaces/Draken1606/undertrial-ai

	# Discovery tags describing the domain, training method, and features.
	tags:
	  - legal-ai
	  - india
	  - bail
	  - grpo
	  - world-modeling
	  - bias-mitigation
	  - bnss-2023
	  - self-improvement
	  - adaptive-curriculum

	# Environment entry point and per-episode limits.
	environment:
	  # Presumably a dotted import path to the environment class; verify the
	  # spelling against the server module before renaming anything.
	  class: undertrial_ai.server.undertrial_environment.UndertriAIEnvironment
	  supports_concurrent_sessions: true
	  # Upper bound on steps in one episode.
	  max_steps_per_episode: 10

	# Actions the agent can take. Each entry is a mapping with a `name` and a
	# human-readable `description`. `submit_memo` is the one marked TERMINAL.
	actions:
	  - name: request_document
	    description: Request a missing case document (FIR, charge sheet, prior judgment)
	  - name: flag_inconsistency
	    description: Flag a legal inconsistency in the charge or prosecution argument
	  - name: cross_reference_precedent
	    description: Retrieve a relevant landmark SC/HC precedent
	  - name: compute_statutory_eligibility
	    description: Check BNSS 479 default bail eligibility (custody vs. max sentence)
	  - name: assess_surety
	    description: Evaluate financial viability of proposed surety
	  - name: classify_bail_type
	    description: Determine bail type from grounds for/against
	  - name: read_submissions
	    description: Read and summarise prosecution or defence submissions on record
	  - name: assess_flight_risk
	    description: Systematic flight risk assessment using a structured scoring matrix
	  - name: check_case_factors
	    description: Examine specific case factors (parity, evidence tampering, victim vulnerability)
	  - name: apply_proportionality
	    description: Apply BNSS 479 proportionality — custody vs. max sentence vs. trial timeline
	  - name: pull_criminal_history
	    description: Pull the accused's prior criminal record, bail history, and conviction status
	  # Quoted in the original; kept quoted.
	  - name: submit_memo
	    description: "TERMINAL — Submit structured bail assessment memo"

	# Reward specification: a weighted sum of per-component scores minus a bias
	# penalty, emitted when the terminal action is taken.
	reward:
	  # Multiplication operators restored: the scraped text read "0.4outcome_gated"
	  # etc., with only the final "0.3*bias" keeping its "*".
	  formula: "0.4*outcome_gated + 0.2*flight_risk + 0.2*statutory + 0.2*conditions + 0.1*reasoning_quality + 0.05*efficiency + 0.05*format + 0.05*process_bonus - 0.3*bias"
	  # NOTE(review): the positive weights above sum to 1.25 while the upper bound
	  # here is 1.15; the bounds cannot be derived from the weights alone. Confirm
	  # against the reward implementation.
	  range: [-0.7, 1.15]
	  terminal_action: submit_memo
	  deterministic: true
	  llm_as_judge: false
	  # One single-key mapping per component: name -> description.
	  # NOTE(review): `efficiency` and `process_bonus` appear in the formula but
	  # have no entry here — confirm whether that is intentional.
	  components:
	    - outcome_match: "Agreement with real High Court decision, gated by reasoning quality (40%)"
	    - flight_risk_accuracy: "Flight risk classification accuracy (20%)"
	    - statutory_accuracy: "IPC/BNSS threshold computation with direction gate (20%)"
	    - condition_appropriateness: "Bail condition quality (20%)"
	    - reasoning_quality: "Justification anchoring + arithmetic verification + grounds specificity (10% bonus)"
	    - format_compliance: "XML tag adherence matching system prompt structure (5% bonus)"
	    - bias_penalty: "Penalty for ignoring parity in bias cases (-30%)"

	# Three-level curriculum. Each level's string gives its theme, episode count,
	# and step budget. The step budgets (60 + 160 + 80) add up to 300, the same
	# value as `training.total_steps`.
	curriculum:
	  levels: 3
	  easy: "Easy — landmark clear-cut cases (104 episodes, 60 steps)"
	  medium: "Medium — contested judgment calls (761 episodes, 160 steps)"
	  hard: "Hard — bias reversal + schema drift (335 episodes, 80 steps)"

	# Adaptive self-improvement settings: stage promotion, weakness-targeted
	# episode selection, and synthetic case generation.
	self_improvement:
	  adaptive_curriculum:
	    description: >
	      Performance-gated stage promotion using exponential moving averages.
	      Agent auto-promotes when per-stage EMA exceeds threshold.
	    # Promotion gates: minimum reward and minimum episode count per transition.
	    # NOTE(review): `stage_3_to_4` implies a fourth stage, but
	    # `curriculum.levels` is 3 — confirm which is correct.
	    thresholds:
	      stage_1_to_2: {min_reward: 0.65, min_episodes: 20}
	      stage_2_to_3: {min_reward: 0.55, min_episodes: 50}
	      stage_3_to_4: {min_reward: 0.50, min_episodes: 20}
	  weakness_targeting:
	    description: >
	      Adaptive episode selection identifies the crime type with lowest EMA
	      reward and serves proportionally more cases from that domain.
	    # Sampling mix described as text; the three shares add up to 100%.
	    strategy: "60% weakest domain / 30% failure replay / 10% exploration"
	  synthetic_generation:
	    description: >
	      When agent masters a domain (EMA > 0.70), generates harder synthetic
	      variants using 5 perturbation types.
	    # The five perturbation types referred to in the description above.
	    perturbation_types:
	      - custody_escalation
	      - co_accused_conflict
	      - section_ambiguity
	      - evidence_reversal
	      - surety_complexity

	# Routes exposed by the server. Each entry gives a path, a method (POST, GET,
	# or WS for the WebSocket feed), and a description.
	endpoints:
	  - path: /reset
	    method: POST
	    description: "Start a new episode. Supports adaptive=true and auto_stage=true for Theme 4."
	  - path: /step
	    method: POST
	    description: "Submit a tool call or final memo. Updates performance tracker when done."
	  - path: /state
	    method: GET
	    description: "Inspect current episode state."
	  - path: /health
	    method: GET
	    description: "Health check."
	  - path: /tools
	    method: GET
	    description: "List available tools."
	  - path: /profile
	    method: GET
	    description: "Get agent performance profile for a session (Theme 4)."
	  - path: /adaptive_status
	    method: GET
	    description: "Get global adaptive mode capabilities and thresholds."
	  # `{session_id}` is a path placeholder, not a YAML flow mapping: a plain
	  # scalar that starts with `/` is read as a string.
	  - path: /ws/{session_id}
	    method: WS
	    description: "WebSocket real-time feed."

	# Training recipe: algorithm, framework, base model, entry points, and the
	# main hyperparameters.
	training:
	  method: GRPO
	  framework: TRL + Unsloth
	  model: unsloth/Qwen2.5-7B-Instruct
	  notebook: training/UndertriAI_GRPO_Training.ipynb
	  script: training/train_grpo.py
	  total_steps: 300
	  num_generations: 6
	  temperature: 1.1
	  # Named launch modes, each with the shell command that starts it.
	  modes:
	    - name: curriculum_3level
	      command: "python training/train_grpo.py --curriculum --offline"
	    - name: single_stage
	      command: "python training/train_grpo.py --stage 1 --offline --steps 200"

	# Hosting details for the deployed Space.
	deployment:
	  platform: huggingface-spaces
	  sdk: docker
	  # Port given for the deployed app.
	  port: 7860
	  url: https://draken1606-undertrial-ai.hf.space