Spaces:

Draken1606
/

undertrial-ai

Running

undertrial-ai / server /case_generator.py

Shabista Sehar

implemented

d8f8a45 14 days ago

10 kB

	"""
	UndertriAI — Synthetic Case Generator (Theme 4: Self-Improvement)

	When the agent masters a domain, this generates harder synthetic variants
	of existing cases. All generation is deterministic string manipulation —
	no LLM calls.

	5 perturbation types:
	1. custody_escalation — custody just below statutory threshold
	2. co_accused_conflict — co-accused with opposite bail outcome
	3. section_ambiguity — IPC ↔ BNSS section swap
	4. evidence_reversal — retracted witness / unreliable evidence
	5. surety_complexity — non-resident surety complication
	"""

	import copy
	import logging
	import re
	from typing import Any, Dict, List, Optional


	# IPC → BNSS mapping (subset used by the environment).
	# Keys are IPC section numbers as strings; values are the section numbers
	# that the section-ambiguity perturbation swaps them to.
	# NOTE(review): the values look like Bharatiya Nyaya Sanhita (BNS) section
	# numbers, whereas "BNSS" usually denotes the procedural code — confirm the
	# naming before relying on it in user-facing text.
	IPC_TO_BNSS = {
	"302": "103", "307": "109", "376": "64", "304B": "80", "395": "310",
	"392": "309", "420": "318", "498A": "85", "406": "316", "465": "336",
	"323": "115", "354": "74", "120B": "61", "506": "351", "121": "147",
	"379": "303", "324": "117", "354A": "75",
	}
	# Reverse lookup derived from the table above. Inverting a dict keeps only
	# one key per value, so this relies on the values above being unique (they
	# are for the entries listed here).
	BNSS_TO_IPC = {v: k for k, v in IPC_TO_BNSS.items()}


	# ── Required fields for schema validation ────────────────────────────

	# Maps each mandatory episode key to the type (or tuple of types) passed to
	# isinstance() by is_schema_valid(). Numeric fields accept int or float.
	REQUIRED_FIELDS = {
	"case_id": str,
	"crime_type": str,
	"ipc_sections": list,
	"custody_months": (int, float),
	"charge_sheet": str,
	"ground_truth": dict,
	"curriculum_stage": (int, float),
	}


	def is_schema_valid(episode: Dict[str, Any]) -> bool:
	    """
	    Report whether an episode satisfies the minimal schema.

	    Every key listed in REQUIRED_FIELDS must exist on the episode and hold
	    a value of the listed type, and the ground_truth mapping must contain
	    an 'outcome' key. Returns True only when all of these hold; callers use
	    the result to discard malformed synthetic cases.
	    """
	    fields_ok = all(
	        name in episode and isinstance(episode[name], kinds)
	        for name, kinds in REQUIRED_FIELDS.items()
	    )
	    if not fields_ok:
	        return False

	    # ground_truth must carry an 'outcome' entry
	    return "outcome" in episode.get("ground_truth", {})


	def generate_variants(
	    source_episode: Dict[str, Any],
	    n: int = 5,
	) -> List[Dict[str, Any]]:
	    """
	    Generate up to n synthetic harder variants of a real episode.
	    Each variant applies exactly ONE perturbation.

	    Args:
	        source_episode: The real episode to perturb. It is never modified;
	            every perturbation works on its own deep copy.
	        n: Maximum number of perturbations to try, taken in the fixed order
	            custody, co-accused, section, evidence, surety. Values above
	            the number of available perturbations (5) are capped; zero or
	            negative values yield no variants.

	    Returns:
	        Only the schema-valid variants (may be fewer than n if some
	        perturbations can't be applied cleanly). An empty list when the
	        source episode itself fails schema validation.
	    """
	    # Guard against non-positive n explicitly: a negative slice bound would
	    # otherwise drop perturbations from the END of the list instead of
	    # selecting none.
	    if n <= 0 or not is_schema_valid(source_episode):
	        return []

	    perturbations = [
	        _custody_escalation,
	        _co_accused_conflict,
	        _section_ambiguity,
	        _evidence_reversal,
	        _surety_complexity,
	    ]

	    log = logging.getLogger(__name__)
	    variants = []
	    for perturb_fn in perturbations[:n]:
	        try:
	            variant = perturb_fn(source_episode)
	            if variant is not None and is_schema_valid(variant):
	                variants.append(variant)
	        except Exception:
	            # Generation is best-effort: a failing perturbation is skipped,
	            # but the failure is recorded so it is not completely invisible.
	            log.debug(
	                "Perturbation %s failed; skipping",
	                getattr(perturb_fn, "__name__", perturb_fn),
	                exc_info=True,
	            )
	            continue

	    return variants


	# ── Perturbation 1: Custody Escalation ───────────────────────────────

	def _custody_escalation(episode: Dict[str, Any]) -> Optional[Dict[str, Any]]:
	"""
	Set custody_months to exactly 2 months below the statutory threshold.
	Forces careful computation — case is NOT yet eligible for default bail.
	"""
	ep = copy.deepcopy(episode)
	max_sent = ep.get("max_sentence_years", 5.0)

	# Threshold is 50% of max sentence in months
	threshold_months = (max_sent * 12) / 2
	new_custody = max(1.0, threshold_months - 2.0)

	old_custody = ep.get("custody_months", 0)
	ep["custody_months"] = round(new_custody, 1)

	# Update charge sheet text if it mentions custody duration
	charge = ep.get("charge_sheet", "")
	if str(int(old_custody)) in charge:
	charge = charge.replace(
	f"{int(old_custody)} months",
	f"{int(new_custody)} months",
	)
	ep["charge_sheet"] = charge

	# Metadata
	parent_id = ep.get("case_id", "UNKNOWN")
	ep["case_id"] = f"SYN_{parent_id}_CUST"
	ep["source"] = "synthetic"
	ep["parent_case_id"] = parent_id
	ep["perturbation_type"] = "custody_escalation"
	ep["difficulty"] = "hard"

	return ep


	# ── Perturbation 2: Co-Accused Conflict ──────────────────────────────

	def _co_accused_conflict(episode: Dict[str, Any]) -> Optional[Dict[str, Any]]:
	"""
	Add a co-accused with the OPPOSITE bail outcome.
	Forces the agent to make a parity argument.
	"""
	ep = copy.deepcopy(episode)
	gt = ep.get("ground_truth", {})
	gt_outcome = gt.get("outcome", "Bail Granted")

	# Opposite outcome
	if "grant" in gt_outcome.lower():
	co_outcome = "Bail Denied"
	else:
	co_outcome = "Bail Granted"

	ep["co_accused"] = [{
	"name": "Co-Accused A",
	"bail_outcome": co_outcome,
	"sections": ep.get("ipc_sections", []),
	}]

	gt["parity_argument_used"] = True
	ep["ground_truth"] = gt

	# Add parity context to defence arguments
	defence = ep.get("defence_arguments", [])
	defence.append(
	f"Co-accused was {'granted' if 'grant' in co_outcome.lower() else 'denied'} "
	f"bail under identical charges — parity principle applies."
	)
	ep["defence_arguments"] = defence

	# Metadata
	parent_id = ep.get("case_id", "UNKNOWN")
	ep["case_id"] = f"SYN_{parent_id}_COAC"
	ep["source"] = "synthetic"
	ep["parent_case_id"] = parent_id
	ep["perturbation_type"] = "co_accused_conflict"
	ep["difficulty"] = "hard"

	return ep


	# ── Perturbation 3: Section Ambiguity (IPC ↔ BNSS) ──────────────────

	def _section_ambiguity(episode: Dict[str, Any]) -> Optional[Dict[str, Any]]:
	    """
	    Swap IPC sections to BNSS equivalents (or vice versa).
	    Tests schema drift adaptability.

	    Each entry of ipc_sections is converted to text, stripped, and looked
	    up first in IPC_TO_BNSS and then in BNSS_TO_IPC; entries found in
	    neither table are kept (stripped). "Section <old>" / "section <old>"
	    mentions in the charge sheet are rewritten to the swapped number.

	    Args:
	        episode: Source episode; it is deep-copied and left untouched.

	    Returns:
	        The perturbed copy with synthetic-case metadata, or None when the
	        episode has no sections or none of them could be swapped.
	    """
	    ep = copy.deepcopy(episode)
	    sections = ep.get("ipc_sections", [])

	    if not sections:
	        return None

	    new_sections = []
	    replacements = {}  # cleaned original section -> swapped section
	    for sec in sections:
	        # str() lets numeric entries (e.g. 302) take part in the swap
	        # instead of failing on .strip().
	        sec_clean = str(sec).strip()
	        if sec_clean in IPC_TO_BNSS:
	            swapped = IPC_TO_BNSS[sec_clean]
	        elif sec_clean in BNSS_TO_IPC:
	            swapped = BNSS_TO_IPC[sec_clean]
	        else:
	            new_sections.append(sec_clean)
	            continue
	        new_sections.append(swapped)
	        replacements[sec_clean] = swapped

	    if not replacements:
	        return None

	    ep["ipc_sections"] = new_sections

	    # Update charge sheet references in ONE pass so that a number produced
	    # by one swap can never be picked up again by another swap. The
	    # trailing look-ahead keeps "Section 354" from matching inside
	    # "Section 354A"; longest alternatives are listed first.
	    alternation = "|".join(
	        re.escape(old) for old in sorted(replacements, key=len, reverse=True)
	    )
	    section_ref = re.compile(rf"([Ss]ection )({alternation})(?![0-9A-Za-z])")
	    charge = ep.get("charge_sheet", "")
	    ep["charge_sheet"] = section_ref.sub(
	        lambda match: match.group(1) + replacements[match.group(2)],
	        charge,
	    )

	    # Metadata
	    parent_id = ep.get("case_id", "UNKNOWN")
	    ep["case_id"] = f"SYN_{parent_id}_SECT"
	    ep["source"] = "synthetic"
	    ep["parent_case_id"] = parent_id
	    ep["perturbation_type"] = "section_ambiguity"
	    ep["difficulty"] = "hard"

	    return ep


	# ── Perturbation 4: Evidence Reversal ────────────────────────────────

	def _evidence_reversal(episode: Dict[str, Any]) -> Optional[Dict[str, Any]]:
	"""
	Add a contradicting element to the strongest evidence.
	Tests whether the agent updates assessment when evidence weakens.
	"""
	ep = copy.deepcopy(episode)

	# Find the strongest evidence mention
	evidence_keywords = ["witness", "evidence", "testimony", "eyewitness"]
	pros_args = ep.get("prosecution_arguments", [])
	charge = ep.get("charge_sheet", "")

	# Check prosecution arguments first
	target_arg = None
	for arg in pros_args:
	if any(kw in arg.lower() for kw in evidence_keywords):
	target_arg = arg
	break

	if target_arg is None:
	# Check charge sheet sentences
	sentences = [s.strip() for s in charge.split('.') if s.strip()]
	for sent in sentences:
	if any(kw in sent.lower() for kw in evidence_keywords):
	target_arg = sent
	break

	if target_arg is None:
	return None # No evidence to reverse

	# Add reversal to defence arguments
	defence = ep.get("defence_arguments", [])
	defence.append(
	"However, the key prosecution evidence was subsequently found "
	"unreliable — the primary witness retracted their statement and "
	"forensic analysis raised doubts about the physical evidence."
	)
	ep["defence_arguments"] = defence

	# Metadata
	parent_id = ep.get("case_id", "UNKNOWN")
	ep["case_id"] = f"SYN_{parent_id}_EVID"
	ep["source"] = "synthetic"
	ep["parent_case_id"] = parent_id
	ep["perturbation_type"] = "evidence_reversal"
	ep["difficulty"] = "hard"

	return ep


	# ── Perturbation 5: Surety Complexity ────────────────────────────────

	def _surety_complexity(episode: Dict[str, Any]) -> Optional[Dict[str, Any]]:
	"""
	Add a surety complication forcing careful condition assessment.
	"""
	ep = copy.deepcopy(episode)

	# Add surety complication to defence arguments
	defence = ep.get("defence_arguments", [])
	defence.append(
	"Proposed surety is a non-resident relative with no verifiable "
	"local assets or employment in the jurisdiction. Surety bond "
	"amount of Rs. 5,00,000 proposed."
	)
	ep["defence_arguments"] = defence

	# Add surety info to accused profile
	profile = ep.get("accused_profile", {})
	profile["surety_status"] = "non-resident, unverified assets"
	ep["accused_profile"] = profile

	# Metadata
	parent_id = ep.get("case_id", "UNKNOWN")
	ep["case_id"] = f"SYN_{parent_id}_SURE"
	ep["source"] = "synthetic"
	ep["parent_case_id"] = parent_id
	ep["perturbation_type"] = "surety_complexity"
	ep["difficulty"] = "hard"

	return ep