# undertrial-ai / server /case_generator.py
# Shabista Sehar
# implemented
# d8f8a45
"""
UndertriAI — Synthetic Case Generator (Theme 4: Self-Improvement)
When the agent masters a domain, this generates harder synthetic variants
of existing cases. All generation is deterministic string manipulation —
no LLM calls.
5 perturbation types:
1. custody_escalation — custody just below statutory threshold
2. co_accused_conflict — co-accused with opposite bail outcome
3. section_ambiguity — IPC ↔ BNSS section swap
4. evidence_reversal — retracted witness / unreliable evidence
5. surety_complexity — non-resident surety complication
"""
import copy
import logging
import re
from typing import Any, Dict, List, Optional
# Section-number translation table, old code (IPC) -> new code, limited to
# the sections the environment uses.  Keys and values are strings because
# some section numbers carry a letter suffix.
# NOTE(review): the target numbers look like Bharatiya Nyaya Sanhita (BNS)
# sections, while the name says BNSS — confirm the intended naming.
# NOTE(review): IPC 324 is usually mapped to 118 rather than 117 — verify
# against the official correspondence table before relying on that entry.
IPC_TO_BNSS = {
    "302": "103",
    "307": "109",
    "376": "64",
    "304B": "80",
    "395": "310",
    "392": "309",
    "420": "318",
    "498A": "85",
    "406": "316",
    "465": "336",
    "323": "115",
    "354": "74",
    "120B": "61",
    "506": "351",
    "121": "147",
    "379": "303",
    "324": "117",
    "354A": "75",
}

# Reverse lookup (new code -> IPC), derived from the table above.
BNSS_TO_IPC = {new_code: ipc_code for ipc_code, new_code in IPC_TO_BNSS.items()}
# ── Required fields for schema validation ────────────────────────────
# Maps each mandatory episode key to the type (or tuple of types) that its
# value must be an instance of.
REQUIRED_FIELDS = {
    "case_id": str,
    "crime_type": str,
    "ipc_sections": list,
    "custody_months": (int, float),
    "charge_sheet": str,
    "ground_truth": dict,
    "curriculum_stage": (int, float),
}


def is_schema_valid(episode: Dict[str, Any]) -> bool:
    """
    Check that all required fields are present and correctly typed.

    Args:
        episode: Candidate episode. Anything that is not a dict is
            rejected rather than raising.

    Returns:
        True when every key in REQUIRED_FIELDS is present with a value of
        the expected type and ``ground_truth`` contains an ``outcome``
        key; False otherwise. Used to filter out malformed synthetic cases.
    """
    # A non-dict (e.g. None from a failed load) is simply "not valid";
    # without this guard the membership test below raises TypeError.
    if not isinstance(episode, dict):
        return False

    for field, expected_type in REQUIRED_FIELDS.items():
        if field not in episode:
            return False
        value = episode[field]
        # bool is a subclass of int, so True/False would otherwise pass
        # the numeric checks; no required field is boolean.
        if isinstance(value, bool) or not isinstance(value, expected_type):
            return False

    # ground_truth is known to be a dict at this point; it must carry the
    # label under 'outcome'.
    return "outcome" in episode["ground_truth"]
def generate_variants(
    source_episode: Dict[str, Any],
    n: int = 5,
) -> List[Dict[str, Any]]:
    """
    Generate up to n synthetic harder variants of a real episode.

    Each variant applies exactly ONE perturbation. The first ``n``
    perturbations (in the fixed order listed below) are attempted.

    Args:
        source_episode: Episode to perturb; it is passed to each
            perturbation function, which works on its own deep copy.
        n: How many perturbations to attempt. Values above the number of
            available perturbations attempt all of them; zero or a
            negative value attempts none.

    Returns:
        Only the variants that were produced and pass is_schema_valid
        (may be fewer than n if some perturbations can't be applied
        cleanly). An empty list if the source episode itself is invalid.
    """
    if not is_schema_valid(source_episode):
        return []

    perturbations = [
        _custody_escalation,
        _co_accused_conflict,
        _section_ambiguity,
        _evidence_reversal,
        _surety_complexity,
    ]

    # Clamp so a negative n yields no variants instead of slicing from the
    # end of the list; None keeps meaning "all", as a bare slice would.
    limit = len(perturbations) if n is None else max(0, n)

    variants: List[Dict[str, Any]] = []
    for perturb_fn in perturbations[:limit]:
        try:
            variant = perturb_fn(source_episode)
            if variant is not None and is_schema_valid(variant):
                variants.append(variant)
        except Exception:
            # Best-effort by design: one failing perturbation must not
            # block the others. Record it at DEBUG so the failure can be
            # diagnosed without making normal runs noisy.
            logging.getLogger(__name__).debug(
                "Perturbation %s failed; skipping",
                perturb_fn.__name__,
                exc_info=True,
            )
            continue
    return variants
# ── Perturbation 1: Custody Escalation ───────────────────────────────
def _custody_escalation(episode: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""
Set custody_months to exactly 2 months below the statutory threshold.
Forces careful computation — case is NOT yet eligible for default bail.
"""
ep = copy.deepcopy(episode)
max_sent = ep.get("max_sentence_years", 5.0)
# Threshold is 50% of max sentence in months
threshold_months = (max_sent * 12) / 2
new_custody = max(1.0, threshold_months - 2.0)
old_custody = ep.get("custody_months", 0)
ep["custody_months"] = round(new_custody, 1)
# Update charge sheet text if it mentions custody duration
charge = ep.get("charge_sheet", "")
if str(int(old_custody)) in charge:
charge = charge.replace(
f"{int(old_custody)} months",
f"{int(new_custody)} months",
)
ep["charge_sheet"] = charge
# Metadata
parent_id = ep.get("case_id", "UNKNOWN")
ep["case_id"] = f"SYN_{parent_id}_CUST"
ep["source"] = "synthetic"
ep["parent_case_id"] = parent_id
ep["perturbation_type"] = "custody_escalation"
ep["difficulty"] = "hard"
return ep
# ── Perturbation 2: Co-Accused Conflict ──────────────────────────────
def _co_accused_conflict(episode: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""
Add a co-accused with the OPPOSITE bail outcome.
Forces the agent to make a parity argument.
"""
ep = copy.deepcopy(episode)
gt = ep.get("ground_truth", {})
gt_outcome = gt.get("outcome", "Bail Granted")
# Opposite outcome
if "grant" in gt_outcome.lower():
co_outcome = "Bail Denied"
else:
co_outcome = "Bail Granted"
ep["co_accused"] = [{
"name": "Co-Accused A",
"bail_outcome": co_outcome,
"sections": ep.get("ipc_sections", []),
}]
gt["parity_argument_used"] = True
ep["ground_truth"] = gt
# Add parity context to defence arguments
defence = ep.get("defence_arguments", [])
defence.append(
f"Co-accused was {'granted' if 'grant' in co_outcome.lower() else 'denied'} "
f"bail under identical charges — parity principle applies."
)
ep["defence_arguments"] = defence
# Metadata
parent_id = ep.get("case_id", "UNKNOWN")
ep["case_id"] = f"SYN_{parent_id}_COAC"
ep["source"] = "synthetic"
ep["parent_case_id"] = parent_id
ep["perturbation_type"] = "co_accused_conflict"
ep["difficulty"] = "hard"
return ep
# ── Perturbation 3: Section Ambiguity (IPC ↔ BNSS) ──────────────────
def _section_ambiguity(episode: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Swap IPC sections to BNSS equivalents (or vice versa).

    Every entry of ``ipc_sections`` found in IPC_TO_BNSS (checked first)
    or BNSS_TO_IPC is replaced by its counterpart; other entries are kept,
    stripped of surrounding whitespace. "Section <n>" / "section <n>"
    references in the charge sheet are rewritten to match.
    Tests schema drift adaptability.

    Args:
        episode: Source episode; it is deep-copied, never mutated.

    Returns:
        The perturbed copy, or None when there are no sections or none of
        them has a known counterpart.
    """
    ep = copy.deepcopy(episode)
    sections = ep.get("ipc_sections", [])
    if not sections:
        return None

    new_sections = []
    renames: Dict[str, str] = {}  # cleaned old section -> replacement
    for sec in sections:
        # str() so a numeric entry such as 302 is handled, not a crash.
        sec_clean = str(sec).strip()
        if sec_clean in IPC_TO_BNSS:
            renames[sec_clean] = IPC_TO_BNSS[sec_clean]
        elif sec_clean in BNSS_TO_IPC:
            renames[sec_clean] = BNSS_TO_IPC[sec_clean]
        new_sections.append(renames.get(sec_clean, sec_clean))

    if not renames:
        return None
    ep["ipc_sections"] = new_sections

    # Update charge sheet references in ONE pass:
    #  - the trailing \b stops "Section 354" from matching the prefix of
    #    "Section 354A" (longest alternatives are also tried first);
    #  - a single pass means text that was already rewritten is never
    #    rewritten again by a later rename.
    alternatives = "|".join(
        re.escape(old) for old in sorted(renames, key=len, reverse=True)
    )
    pattern = re.compile(rf"([Ss]ection )({alternatives})\b")
    ep["charge_sheet"] = pattern.sub(
        lambda match: match.group(1) + renames[match.group(2)],
        ep.get("charge_sheet", ""),
    )

    # Metadata
    parent_id = ep.get("case_id", "UNKNOWN")
    ep["case_id"] = f"SYN_{parent_id}_SECT"
    ep["source"] = "synthetic"
    ep["parent_case_id"] = parent_id
    ep["perturbation_type"] = "section_ambiguity"
    ep["difficulty"] = "hard"
    return ep
# ── Perturbation 4: Evidence Reversal ────────────────────────────────
def _evidence_reversal(episode: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""
Add a contradicting element to the strongest evidence.
Tests whether the agent updates assessment when evidence weakens.
"""
ep = copy.deepcopy(episode)
# Find the strongest evidence mention
evidence_keywords = ["witness", "evidence", "testimony", "eyewitness"]
pros_args = ep.get("prosecution_arguments", [])
charge = ep.get("charge_sheet", "")
# Check prosecution arguments first
target_arg = None
for arg in pros_args:
if any(kw in arg.lower() for kw in evidence_keywords):
target_arg = arg
break
if target_arg is None:
# Check charge sheet sentences
sentences = [s.strip() for s in charge.split('.') if s.strip()]
for sent in sentences:
if any(kw in sent.lower() for kw in evidence_keywords):
target_arg = sent
break
if target_arg is None:
return None # No evidence to reverse
# Add reversal to defence arguments
defence = ep.get("defence_arguments", [])
defence.append(
"However, the key prosecution evidence was subsequently found "
"unreliable — the primary witness retracted their statement and "
"forensic analysis raised doubts about the physical evidence."
)
ep["defence_arguments"] = defence
# Metadata
parent_id = ep.get("case_id", "UNKNOWN")
ep["case_id"] = f"SYN_{parent_id}_EVID"
ep["source"] = "synthetic"
ep["parent_case_id"] = parent_id
ep["perturbation_type"] = "evidence_reversal"
ep["difficulty"] = "hard"
return ep
# ── Perturbation 5: Surety Complexity ────────────────────────────────
def _surety_complexity(episode: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""
Add a surety complication forcing careful condition assessment.
"""
ep = copy.deepcopy(episode)
# Add surety complication to defence arguments
defence = ep.get("defence_arguments", [])
defence.append(
"Proposed surety is a non-resident relative with no verifiable "
"local assets or employment in the jurisdiction. Surety bond "
"amount of Rs. 5,00,000 proposed."
)
ep["defence_arguments"] = defence
# Add surety info to accused profile
profile = ep.get("accused_profile", {})
profile["surety_status"] = "non-resident, unverified assets"
ep["accused_profile"] = profile
# Metadata
parent_id = ep.get("case_id", "UNKNOWN")
ep["case_id"] = f"SYN_{parent_id}_SURE"
ep["source"] = "synthetic"
ep["parent_case_id"] = parent_id
ep["perturbation_type"] = "surety_complexity"
ep["difficulty"] = "hard"
return ep