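# undertrial-ai / openenv.yaml
# NOTE(review): the three lines below look like file-viewer page residue
# (uploader avatar caption, commit message, short commit hash) — confirm and drop.
# Draken1606's picture
# 3-level
# 2c93c00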
# Package identity and provenance for this environment manifest.
name: undertrial-ai
# Quoted so the version is always read as a string.
version: "1.1.0"
# Folded scalar: the indented lines are joined into a single paragraph.
# The body must be indented so that `author:` and the keys after it are
# parsed as separate top-level keys rather than as part of the text.
description: >
  OpenEnv-compliant RL training environment for Indian bail decision support
  with adaptive self-improvement (Theme 4). An LLM agent reads High Court bail
  cases, invokes legal tools, and submits structured bail recommendations.
  Reward computed deterministically against real HC judgments with an explicit
  bias penalty (lambda=0.3). Features performance-aware episode selection,
  stage-gated curriculum promotion, and synthetic case generation.
author: Draken1606
license: MIT
# Source repository and the hosted Hugging Face Space for this environment.
repository: https://github.com/Faiz-1606/Undertrial
space: https://huggingface.co/spaces/Draken1606/undertrial-ai
# Free-form discovery labels for the environment (domain, method, themes).
tags:
  - legal-ai
  - india
  - bail
  - grpo
  - world-modeling
  - bias-mitigation
  - bnss-2023
  - self-improvement
  - adaptive-curriculum
# Runtime environment settings. The three keys below are children of
# `environment`; at column 0 they would be read as unrelated top-level keys.
environment:
  # Dotted import path of the environment class.
  class: undertrial_ai.server.undertrial_environment.UndertriAIEnvironment
  supports_concurrent_sessions: true
  # Upper bound on steps in a single episode.
  max_steps_per_episode: 10
# Tool calls available to the agent. Each entry is one mapping with a `name`
# and a `description`; the description must be indented under its `- name:`
# line so the two stay in the same list item.
actions:
  - name: request_document
    description: Request a missing case document (FIR, charge sheet, prior judgment)
  - name: flag_inconsistency
    description: Flag a legal inconsistency in the charge or prosecution argument
  - name: cross_reference_precedent
    description: Retrieve a relevant landmark SC/HC precedent
  - name: compute_statutory_eligibility
    description: Check BNSS 479 default bail eligibility (custody vs. max sentence)
  - name: assess_surety
    description: Evaluate financial viability of proposed surety
  - name: classify_bail_type
    description: Determine bail type from grounds for/against
  - name: read_submissions
    description: Read and summarise prosecution or defence submissions on record
  - name: assess_flight_risk
    description: Systematic flight risk assessment using a structured scoring matrix
  - name: check_case_factors
    description: Examine specific case factors (parity, evidence tampering, victim vulnerability)
  - name: apply_proportionality
    description: Apply BNSS 479 proportionality custody vs. max sentence vs. trial timeline
  - name: pull_criminal_history
    description: Pull the accused's prior criminal record, bail history, and conviction status
  # Marked TERMINAL in its own description; the same name appears as
  # `terminal_action` in the reward section.
  - name: submit_memo
    description: "TERMINAL — Submit structured bail assessment memo"
# Reward specification. All keys below are children of `reward`.
reward:
  # Weighted sum of component scores minus the bias penalty.
  formula: "0.4*outcome_gated + 0.2*flight_risk + 0.2*statutory + 0.2*conditions + 0.1*reasoning_quality + 0.05*efficiency + 0.05*format + 0.05*process_bonus - 0.3*bias"
  # NOTE(review): the positive weights in `formula` sum to 1.25, while the
  # upper bound here is 1.15 — confirm which one is current. The -0.7 lower
  # bound also cannot be derived from the formula alone; verify against the
  # reward implementation.
  range: [-0.7, 1.15]
  # Action that ends the episode and triggers reward computation
  # (presumably — confirm against the environment code).
  terminal_action: submit_memo
  deterministic: true
  llm_as_judge: false
  # One single-key mapping per documented component.
  # NOTE(review): `efficiency` and `process_bonus` appear in `formula` but
  # have no entry here — confirm whether they should be listed.
  components:
    - outcome_match: "Agreement with real High Court decision, gated by reasoning quality (40%)"
    - flight_risk_accuracy: "Flight risk classification accuracy (20%)"
    - statutory_accuracy: "IPC/BNSS threshold computation with direction gate (20%)"
    - condition_appropriateness: "Bail condition quality (20%)"
    - reasoning_quality: "Justification anchoring + arithmetic verification + grounds specificity (10% bonus)"
    - format_compliance: "XML tag adherence matching system prompt structure (5% bonus)"
    - bias_penalty: "Penalty for ignoring parity in bias cases (-30%)"
# Difficulty curriculum: a level count plus one descriptive string per level.
# The per-level step counts (60 + 160 + 80) add up to 300, the same figure as
# `total_steps` in the training section.
curriculum:
  levels: 3
  easy: "Easy — landmark clear-cut cases (104 episodes, 60 steps)"
  medium: "Medium — contested judgment calls (761 episodes, 160 steps)"
  hard: "Hard — bias reversal + schema drift (335 episodes, 80 steps)"
# Adaptive self-improvement features. Three sub-sections, each with its own
# folded `description`; the nesting keeps those descriptions from colliding
# with each other and with the top-level `description` key.
self_improvement:
  adaptive_curriculum:
    description: >
      Performance-gated stage promotion using exponential moving averages.
      Agent auto-promotes when per-stage EMA exceeds threshold.
    # Promotion gates: minimum reward and minimum episode count per transition.
    # NOTE(review): `stage_3_to_4` implies a fourth stage, but the curriculum
    # section declares `levels: 3` — confirm whether this gate is used.
    thresholds:
      stage_1_to_2: {min_reward: 0.65, min_episodes: 20}
      stage_2_to_3: {min_reward: 0.55, min_episodes: 50}
      stage_3_to_4: {min_reward: 0.50, min_episodes: 20}
  weakness_targeting:
    description: >
      Adaptive episode selection identifies the crime type with lowest EMA
      reward and serves proportionally more cases from that domain.
    # Episode sampling mix, stated as prose rather than as numeric fields.
    strategy: "60% weakest domain / 30% failure replay / 10% exploration"
  synthetic_generation:
    description: >
      When agent masters a domain (EMA > 0.70), generates harder synthetic
      variants using 5 perturbation types.
    # The five perturbation types referred to in the description above.
    perturbation_types:
      - custody_escalation
      - co_accused_conflict
      - section_ambiguity
      - evidence_reversal
      - surety_complexity
# HTTP and WebSocket routes exposed by the server. Each list item groups a
# `path`, a `method` and a `description`.
endpoints:
  - path: /reset
    method: POST
    description: "Start a new episode. Supports adaptive=true and auto_stage=true for Theme 4."
  - path: /step
    method: POST
    description: "Submit a tool call or final memo. Updates performance tracker when done."
  - path: /state
    method: GET
    description: "Inspect current episode state."
  - path: /health
    method: GET
    description: "Health check."
  - path: /tools
    method: GET
    description: "List available tools."
  - path: /profile
    method: GET
    description: "Get agent performance profile for a session (Theme 4)."
  - path: /adaptive_status
    method: GET
    description: "Get global adaptive mode capabilities and thresholds."
  # Quoted because the value contains flow-mapping braces; the string itself
  # is unchanged.
  - path: "/ws/{session_id}"
    method: WS
    description: "WebSocket real-time feed."
# Training recipe: method, stack, base model, entry points, hyperparameters.
training:
  method: GRPO
  framework: TRL + Unsloth
  model: unsloth/Qwen2.5-7B-Instruct
  # Entry points, given as paths relative to the repository root
  # (presumably — confirm against the repository layout).
  notebook: training/UndertriAI_GRPO_Training.ipynb
  script: training/train_grpo.py
  total_steps: 300
  num_generations: 6
  temperature: 1.1
  # Named launch presets; each pairs a `name` with the shell `command` to run.
  modes:
    - name: curriculum_3level
      command: "python training/train_grpo.py --curriculum --offline"
    - name: single_stage
      command: "python training/train_grpo.py --stage 1 --offline --steps 200"
# Hosting details for the deployed environment.
deployment:
  platform: huggingface-spaces
  sdk: docker
  # Port the container serves on.
  port: 7860
  url: https://draken1606-undertrial-ai.hf.space