Spaces:
Running
Running
| """ | |
| UndertriAI — Synthetic Case Generator (Theme 4: Self-Improvement) | |
| When the agent masters a domain, this generates harder synthetic variants | |
| of existing cases. All generation is deterministic string manipulation — | |
| no LLM calls. | |
| 5 perturbation types: | |
| 1. custody_escalation — custody just below statutory threshold | |
| 2. co_accused_conflict — co-accused with opposite bail outcome | |
| 3. section_ambiguity — IPC ↔ BNSS section swap | |
| 4. evidence_reversal — retracted witness / unreliable evidence | |
| 5. surety_complexity — non-resident surety complication | |
| """ | |
| import copy | |
| import re | |
| from typing import Any, Dict, List, Optional | |
# IPC → BNSS mapping (subset used by the environment).
# Section identifiers are kept as strings; some carry a letter suffix
# (e.g. "304B", "498A").
# NOTE(review): the target numbers look like Bharatiya Nyaya Sanhita (BNS)
# sections rather than BNSS (the procedural code) — confirm the naming, and
# double-check "324" -> "117" against the official correspondence table.
IPC_TO_BNSS = {
    "302": "103",
    "307": "109",
    "376": "64",
    "304B": "80",
    "395": "310",
    "392": "309",
    "420": "318",
    "498A": "85",
    "406": "316",
    "465": "336",
    "323": "115",
    "354": "74",
    "120B": "61",
    "506": "351",
    "121": "147",
    "379": "303",
    "324": "117",
    "354A": "75",
}

# Reverse lookup, derived from the forward table so the two never drift.
BNSS_TO_IPC = {
    bnss_section: ipc_section
    for ipc_section, bnss_section in IPC_TO_BNSS.items()
}
# ── Required fields for schema validation ────────────────────────────
# Each mandatory episode key maps to the type (or tuple of types) that
# isinstance() must accept for its value.
REQUIRED_FIELDS = dict(
    case_id=str,
    crime_type=str,
    ipc_sections=list,
    custody_months=(int, float),
    charge_sheet=str,
    ground_truth=dict,
    curriculum_stage=(int, float),
)
def is_schema_valid(episode: Dict[str, Any]) -> bool:
    """
    Check that all required fields are present and correctly typed.

    Args:
        episode: Candidate episode. Anything that is not a dict is rejected
            rather than raising.

    Returns:
        True when every key of REQUIRED_FIELDS is present with a value of
        the expected type and ``ground_truth`` contains an ``outcome`` key;
        False otherwise. Used to filter out malformed synthetic cases.
    """
    # Guard first: `field not in None` (or in an int) raises TypeError,
    # which would turn "invalid input" into a crash for the caller.
    if not isinstance(episode, dict):
        return False

    for field, expected_type in REQUIRED_FIELDS.items():
        if field not in episode:
            return False
        if not isinstance(episode[field], expected_type):
            return False

    # ground_truth must have 'outcome'
    gt = episode.get("ground_truth", {})
    return "outcome" in gt
def generate_variants(
    source_episode: Dict[str, Any],
    n: int = 5,
) -> List[Dict[str, Any]]:
    """
    Generate up to n synthetic harder variants of a real episode.

    Each variant applies exactly ONE perturbation, tried in a fixed order
    (custody, co-accused, section, evidence, surety). Only five
    perturbations exist, so at most five variants are returned regardless
    of how large n is.

    Args:
        source_episode: The real episode to perturb; it is not modified.
        n: Maximum number of perturbations to attempt. Zero or a negative
            value yields no variants.

    Returns:
        The schema-valid variants (may be fewer than n if some
        perturbations can't be applied cleanly). An empty list when the
        source episode itself is not schema-valid.
    """
    # Local import keeps this block self-contained; the module does not
    # otherwise import logging.
    import logging

    # Without this guard a negative n slices from the END of the list
    # (n=-1 would silently run four perturbations).
    if n is not None and n <= 0:
        return []

    if not is_schema_valid(source_episode):
        return []

    perturbations = [
        _custody_escalation,
        _co_accused_conflict,
        _section_ambiguity,
        _evidence_reversal,
        _surety_complexity,
    ]

    variants = []
    for perturb_fn in perturbations[:n]:
        try:
            variant = perturb_fn(source_episode)
        except Exception:
            # Best effort: one failing perturbation must not lose the
            # others, but leave a trace so bad episodes can be diagnosed.
            logging.getLogger(__name__).warning(
                "Perturbation %s failed for case %r; skipping",
                perturb_fn.__name__,
                source_episode.get("case_id"),
                exc_info=True,
            )
            continue
        if variant is not None and is_schema_valid(variant):
            variants.append(variant)
    return variants
| # ── Perturbation 1: Custody Escalation ─────────────────────────────── | |
| def _custody_escalation(episode: Dict[str, Any]) -> Optional[Dict[str, Any]]: | |
| """ | |
| Set custody_months to exactly 2 months below the statutory threshold. | |
| Forces careful computation — case is NOT yet eligible for default bail. | |
| """ | |
| ep = copy.deepcopy(episode) | |
| max_sent = ep.get("max_sentence_years", 5.0) | |
| # Threshold is 50% of max sentence in months | |
| threshold_months = (max_sent * 12) / 2 | |
| new_custody = max(1.0, threshold_months - 2.0) | |
| old_custody = ep.get("custody_months", 0) | |
| ep["custody_months"] = round(new_custody, 1) | |
| # Update charge sheet text if it mentions custody duration | |
| charge = ep.get("charge_sheet", "") | |
| if str(int(old_custody)) in charge: | |
| charge = charge.replace( | |
| f"{int(old_custody)} months", | |
| f"{int(new_custody)} months", | |
| ) | |
| ep["charge_sheet"] = charge | |
| # Metadata | |
| parent_id = ep.get("case_id", "UNKNOWN") | |
| ep["case_id"] = f"SYN_{parent_id}_CUST" | |
| ep["source"] = "synthetic" | |
| ep["parent_case_id"] = parent_id | |
| ep["perturbation_type"] = "custody_escalation" | |
| ep["difficulty"] = "hard" | |
| return ep | |
| # ── Perturbation 2: Co-Accused Conflict ────────────────────────────── | |
| def _co_accused_conflict(episode: Dict[str, Any]) -> Optional[Dict[str, Any]]: | |
| """ | |
| Add a co-accused with the OPPOSITE bail outcome. | |
| Forces the agent to make a parity argument. | |
| """ | |
| ep = copy.deepcopy(episode) | |
| gt = ep.get("ground_truth", {}) | |
| gt_outcome = gt.get("outcome", "Bail Granted") | |
| # Opposite outcome | |
| if "grant" in gt_outcome.lower(): | |
| co_outcome = "Bail Denied" | |
| else: | |
| co_outcome = "Bail Granted" | |
| ep["co_accused"] = [{ | |
| "name": "Co-Accused A", | |
| "bail_outcome": co_outcome, | |
| "sections": ep.get("ipc_sections", []), | |
| }] | |
| gt["parity_argument_used"] = True | |
| ep["ground_truth"] = gt | |
| # Add parity context to defence arguments | |
| defence = ep.get("defence_arguments", []) | |
| defence.append( | |
| f"Co-accused was {'granted' if 'grant' in co_outcome.lower() else 'denied'} " | |
| f"bail under identical charges — parity principle applies." | |
| ) | |
| ep["defence_arguments"] = defence | |
| # Metadata | |
| parent_id = ep.get("case_id", "UNKNOWN") | |
| ep["case_id"] = f"SYN_{parent_id}_COAC" | |
| ep["source"] = "synthetic" | |
| ep["parent_case_id"] = parent_id | |
| ep["perturbation_type"] = "co_accused_conflict" | |
| ep["difficulty"] = "hard" | |
| return ep | |
| # ── Perturbation 3: Section Ambiguity (IPC ↔ BNSS) ────────────────── | |
def _section_ambiguity(episode: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Swap IPC sections to their BNSS equivalents (or vice versa).

    Each entry of ``ipc_sections`` found in IPC_TO_BNSS is replaced by its
    mapped value; otherwise, if found in BNSS_TO_IPC, by the reverse
    mapping. "Section <old>" / "section <old>" references in the charge
    sheet are rewritten to match. Tests schema drift adaptability.

    Args:
        episode: Source episode; it is deep-copied, never mutated.

    Returns:
        The perturbed copy, or None when there are no sections or none of
        them appears in either mapping table.
    """
    ep = copy.deepcopy(episode)
    sections = ep.get("ipc_sections", [])
    if not sections:
        return None

    new_sections = []
    renames = {}  # cleaned old section -> replacement section
    for sec in sections:
        # str() lets numeric entries (302 instead of "302") take part in
        # the lookup instead of crashing on .strip().
        sec_clean = str(sec).strip()
        if sec_clean in IPC_TO_BNSS:
            target = IPC_TO_BNSS[sec_clean]
        elif sec_clean in BNSS_TO_IPC:
            target = BNSS_TO_IPC[sec_clean]
        else:
            # Unmapped: keep it (stripped when it is a string).
            new_sections.append(sec_clean if isinstance(sec, str) else sec)
            continue
        new_sections.append(target)
        renames[sec_clean] = target

    if not renames:
        return None
    ep["ipc_sections"] = new_sections

    # Update charge sheet references in ONE pass. Sequential str.replace
    # calls corrupt overlapping numbers ("Section 354" -> "Section 74"
    # also turns "Section 354A" into "Section 74A") and can undo each
    # other when one swap's output is another swap's input (379 <-> 303).
    charge = ep.get("charge_sheet", "")
    if isinstance(charge, str):
        longest_first = sorted(renames, key=len, reverse=True)
        pattern = re.compile(
            r"\b([Ss]ection )("
            + "|".join(re.escape(old) for old in longest_first)
            + r")(?![0-9A-Za-z])"
        )
        charge = pattern.sub(
            lambda match: match.group(1) + renames[match.group(2)],
            charge,
        )
        ep["charge_sheet"] = charge

    # Metadata
    parent_id = ep.get("case_id", "UNKNOWN")
    ep["case_id"] = f"SYN_{parent_id}_SECT"
    ep["source"] = "synthetic"
    ep["parent_case_id"] = parent_id
    ep["perturbation_type"] = "section_ambiguity"
    ep["difficulty"] = "hard"
    return ep
| # ── Perturbation 4: Evidence Reversal ──────────────────────────────── | |
| def _evidence_reversal(episode: Dict[str, Any]) -> Optional[Dict[str, Any]]: | |
| """ | |
| Add a contradicting element to the strongest evidence. | |
| Tests whether the agent updates assessment when evidence weakens. | |
| """ | |
| ep = copy.deepcopy(episode) | |
| # Find the strongest evidence mention | |
| evidence_keywords = ["witness", "evidence", "testimony", "eyewitness"] | |
| pros_args = ep.get("prosecution_arguments", []) | |
| charge = ep.get("charge_sheet", "") | |
| # Check prosecution arguments first | |
| target_arg = None | |
| for arg in pros_args: | |
| if any(kw in arg.lower() for kw in evidence_keywords): | |
| target_arg = arg | |
| break | |
| if target_arg is None: | |
| # Check charge sheet sentences | |
| sentences = [s.strip() for s in charge.split('.') if s.strip()] | |
| for sent in sentences: | |
| if any(kw in sent.lower() for kw in evidence_keywords): | |
| target_arg = sent | |
| break | |
| if target_arg is None: | |
| return None # No evidence to reverse | |
| # Add reversal to defence arguments | |
| defence = ep.get("defence_arguments", []) | |
| defence.append( | |
| "However, the key prosecution evidence was subsequently found " | |
| "unreliable — the primary witness retracted their statement and " | |
| "forensic analysis raised doubts about the physical evidence." | |
| ) | |
| ep["defence_arguments"] = defence | |
| # Metadata | |
| parent_id = ep.get("case_id", "UNKNOWN") | |
| ep["case_id"] = f"SYN_{parent_id}_EVID" | |
| ep["source"] = "synthetic" | |
| ep["parent_case_id"] = parent_id | |
| ep["perturbation_type"] = "evidence_reversal" | |
| ep["difficulty"] = "hard" | |
| return ep | |
| # ── Perturbation 5: Surety Complexity ──────────────────────────────── | |
| def _surety_complexity(episode: Dict[str, Any]) -> Optional[Dict[str, Any]]: | |
| """ | |
| Add a surety complication forcing careful condition assessment. | |
| """ | |
| ep = copy.deepcopy(episode) | |
| # Add surety complication to defence arguments | |
| defence = ep.get("defence_arguments", []) | |
| defence.append( | |
| "Proposed surety is a non-resident relative with no verifiable " | |
| "local assets or employment in the jurisdiction. Surety bond " | |
| "amount of Rs. 5,00,000 proposed." | |
| ) | |
| ep["defence_arguments"] = defence | |
| # Add surety info to accused profile | |
| profile = ep.get("accused_profile", {}) | |
| profile["surety_status"] = "non-resident, unverified assets" | |
| ep["accused_profile"] = profile | |
| # Metadata | |
| parent_id = ep.get("case_id", "UNKNOWN") | |
| ep["case_id"] = f"SYN_{parent_id}_SURE" | |
| ep["source"] = "synthetic" | |
| ep["parent_case_id"] = parent_id | |
| ep["perturbation_type"] = "surety_complexity" | |
| ep["difficulty"] = "hard" | |
| return ep | |