# Page header captured along with the file (not part of the program):
#   Spaces: Draken1606 / undertrial-ai (Running)
#   File size: 10,047 Bytes
#   d8f8a45

"""
UndertriAI — Synthetic Case Generator (Theme 4: Self-Improvement)

When the agent masters a domain, this generates harder synthetic variants
of existing cases. All generation is deterministic string manipulation —
no LLM calls.

5 perturbation types:
  1. custody_escalation  — custody just below statutory threshold
  2. co_accused_conflict — co-accused with opposite bail outcome
  3. section_ambiguity   — IPC ↔ BNSS section swap
  4. evidence_reversal   — retracted witness / unreliable evidence
  5. surety_complexity   — non-resident surety complication
"""

import copy
import re
from typing import Any, Dict, List, Optional


# IPC section code → replacement section code (only the subset the
# environment uses). Codes stay strings because some carry a letter
# suffix ("304B", "498A").
# NOTE(review): the target numbers look like Bharatiya Nyaya Sanhita (BNS)
# sections even though the name says BNSS, and IPC 324 is usually paired
# with BNS 118 rather than 117 — confirm against the official table.
IPC_TO_BNSS = {
    "302": "103",
    "307": "109",
    "376": "64",
    "304B": "80",
    "395": "310",
    "392": "309",
    "420": "318",
    "498A": "85",
    "406": "316",
    "465": "336",
    "323": "115",
    "354": "74",
    "120B": "61",
    "506": "351",
    "121": "147",
    "379": "303",
    "324": "117",
    "354A": "75",
}

# Reverse lookup, derived from the forward table so the two stay in sync.
BNSS_TO_IPC = dict(zip(IPC_TO_BNSS.values(), IPC_TO_BNSS.keys()))


# ── Required fields for schema validation ────────────────────────────

# Top-level keys every episode must carry, each mapped to the type (or tuple
# of types) that is_schema_valid() passes to isinstance().
REQUIRED_FIELDS = dict(
    case_id=str,
    crime_type=str,
    ipc_sections=list,
    custody_months=(int, float),
    charge_sheet=str,
    ground_truth=dict,
    curriculum_stage=(int, float),
)


def is_schema_valid(episode: Dict[str, Any]) -> bool:
    """
    Report whether an episode carries every required field with a valid type.

    Used to filter out malformed source and synthetic cases.

    Args:
        episode: Candidate episode. Anything that is not a dict (None, a
            list, a string, ...) is treated as malformed.

    Returns:
        True when every key of REQUIRED_FIELDS is present with a value of an
        accepted type and ``ground_truth`` contains an ``outcome`` key;
        False otherwise.
    """
    # A validator should answer False for garbage input rather than raise:
    # generate_variants() calls this outside its try/except, so a None
    # episode would otherwise propagate a TypeError to the caller.
    if not isinstance(episode, dict):
        return False

    for field, expected_type in REQUIRED_FIELDS.items():
        if field not in episode:
            return False
        if not isinstance(episode[field], expected_type):
            return False

    # ground_truth was verified to be a dict by the loop above; it must
    # additionally name the outcome.
    return "outcome" in episode["ground_truth"]


def generate_variants(
    source_episode: Dict[str, Any],
    n: int = 5,
) -> List[Dict[str, Any]]:
    """
    Generate up to n synthetic harder variants of a real episode.

    Each variant applies exactly ONE perturbation, tried in a fixed order:
    custody escalation, co-accused conflict, section ambiguity, evidence
    reversal, surety complexity. Only the first ``n`` perturbations are
    attempted, so at most five variants can ever be returned.

    Args:
        source_episode: Episode to perturb. The perturbation functions work
            on deep copies, so it is not modified.
        n: Maximum number of perturbations to attempt. Zero or a negative
            value yields an empty list.

    Returns:
        The schema-valid variants. May hold fewer than ``n`` entries when a
        perturbation does not apply (returns None), fails, or produces an
        invalid episode. Empty when the source episode itself is invalid.
    """
    # Guard before slicing: a negative n would slice from the end
    # (perturbations[:-1] keeps four functions) and hand back variants even
    # though none were requested. None is tolerated because the slice below
    # treats it as "no limit".
    if n is not None and n <= 0:
        return []

    if not is_schema_valid(source_episode):
        return []

    perturbations = [
        _custody_escalation,
        _co_accused_conflict,
        _section_ambiguity,
        _evidence_reversal,
        _surety_complexity,
    ]

    variants = []
    for perturb_fn in perturbations[:n]:
        try:
            variant = perturb_fn(source_episode)
            if variant is not None and is_schema_valid(variant):
                variants.append(variant)
        except Exception:
            # Deliberate best-effort: one perturbation failing on an odd
            # episode must not cost the caller the remaining variants.
            continue

    return variants


# ── Perturbation 1: Custody Escalation ───────────────────────────────

def _custody_escalation(episode: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Set custody_months to 2 months below the statutory threshold.

    The threshold is taken as 50% of ``max_sentence_years`` (5.0 when the key
    is absent), expressed in months. The new custody value is floored at one
    month and rounded to one decimal. Forces careful computation — the case
    is NOT yet eligible for default bail.

    Args:
        episode: Source episode; it is deep-copied and never modified.

    Returns:
        The perturbed copy tagged with synthetic metadata, or None when the
        one-month floor would leave custody at or above the threshold (very
        short maximum sentences), since such a variant would no longer be
        below the threshold.
    """
    ep = copy.deepcopy(episode)
    max_sent = ep.get("max_sentence_years", 5.0)

    # Threshold is 50% of max sentence in months
    threshold_months = (max_sent * 12) / 2
    new_custody = round(max(1.0, threshold_months - 2.0), 1)
    if new_custody >= threshold_months:
        return None

    old_custody = ep.get("custody_months", 0)
    ep["custody_months"] = new_custody

    # Render the number the same way in text and field: "28" for 28.0 but
    # "2.5" for 2.5, so the charge sheet never disagrees with custody_months.
    if float(new_custody).is_integer():
        new_text = str(int(new_custody))
    else:
        new_text = str(new_custody)

    # Rewrite only a standalone "<old> months". The lookbehind keeps a
    # custody of 6 from matching inside "16 months" or "1.6 months", which a
    # plain str.replace would corrupt.
    old_pattern = rf"(?<![\d.]){re.escape(str(int(old_custody)))} months"
    ep["charge_sheet"] = re.sub(
        old_pattern,
        lambda _match: f"{new_text} months",
        ep.get("charge_sheet", ""),
    )

    # Metadata
    parent_id = ep.get("case_id", "UNKNOWN")
    ep["case_id"] = f"SYN_{parent_id}_CUST"
    ep["source"] = "synthetic"
    ep["parent_case_id"] = parent_id
    ep["perturbation_type"] = "custody_escalation"
    ep["difficulty"] = "hard"

    return ep


# ── Perturbation 2: Co-Accused Conflict ──────────────────────────────

def _co_accused_conflict(episode: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Add a co-accused with the OPPOSITE bail outcome.

    Forces the agent to make a parity argument. The co-accused is charged
    under the same sections as the episode, ``ground_truth`` gains
    ``parity_argument_used = True`` and a parity sentence is appended to the
    defence arguments. Any existing ``co_accused`` entry is replaced.

    Args:
        episode: Source episode; it is deep-copied and never modified.

    Returns:
        The perturbed copy tagged with synthetic metadata.
    """
    ep = copy.deepcopy(episode)
    gt = ep.get("ground_truth", {})
    gt_outcome = gt.get("outcome", "Bail Granted")

    # Opposite outcome.
    # NOTE(review): the substring test treats any outcome containing "grant"
    # as granted, which would include wording such as "Bail Not Granted" —
    # confirm the outcome vocabulary used by the dataset.
    if "grant" in gt_outcome.lower():
        co_outcome = "Bail Denied"
    else:
        co_outcome = "Bail Granted"

    ep["co_accused"] = [{
        "name": "Co-Accused A",
        "bail_outcome": co_outcome,
        # Independent copy: sharing the list object with ep["ipc_sections"]
        # would let a later in-place edit of one silently change the other.
        "sections": copy.copy(ep.get("ipc_sections", [])),
    }]

    gt["parity_argument_used"] = True
    ep["ground_truth"] = gt

    # Add parity context to defence arguments
    defence = ep.get("defence_arguments", [])
    defence.append(
        f"Co-accused was {'granted' if 'grant' in co_outcome.lower() else 'denied'} "
        f"bail under identical charges — parity principle applies."
    )
    ep["defence_arguments"] = defence

    # Metadata
    parent_id = ep.get("case_id", "UNKNOWN")
    ep["case_id"] = f"SYN_{parent_id}_COAC"
    ep["source"] = "synthetic"
    ep["parent_case_id"] = parent_id
    ep["perturbation_type"] = "co_accused_conflict"
    ep["difficulty"] = "hard"

    return ep


# ── Perturbation 3: Section Ambiguity (IPC ↔ BNSS) ──────────────────

def _section_ambiguity(episode: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Swap IPC sections to their BNSS equivalents (or vice versa).

    Tests schema drift adaptability. Every code in ``ipc_sections`` that
    appears in IPC_TO_BNSS or BNSS_TO_IPC is replaced by its counterpart;
    other codes are kept (whitespace-stripped). "Section <code>" and
    "section <code>" references in the charge sheet are rewritten to match.

    Args:
        episode: Source episode; it is deep-copied and never modified.

    Returns:
        The perturbed copy tagged with synthetic metadata, or None when the
        episode has no sections or none of them has a known counterpart.
    """
    ep = copy.deepcopy(episode)
    sections = ep.get("ipc_sections", [])

    if not sections:
        return None

    new_sections = []
    swaps = {}  # stripped original code -> replacement code
    for sec in sections:
        sec_clean = sec.strip()
        if sec_clean in IPC_TO_BNSS:
            swaps[sec_clean] = IPC_TO_BNSS[sec_clean]
        elif sec_clean in BNSS_TO_IPC:
            swaps[sec_clean] = BNSS_TO_IPC[sec_clean]
        new_sections.append(swaps.get(sec_clean, sec_clean))

    if not swaps:
        return None

    ep["ipc_sections"] = new_sections

    # Update charge sheet references in ONE pass over the text:
    #   * the lookahead stops "354" from matching the front of "354A" or
    #     "3541", which sequential str.replace calls would corrupt;
    #   * a single pass cannot re-rewrite its own output, so 379 -> 303
    #     followed by 303 -> 379 no longer undoes itself.
    codes = sorted(swaps, key=len, reverse=True)
    reference = re.compile(
        r"([Ss]ection )("
        + "|".join(re.escape(code) for code in codes)
        + r")(?![0-9A-Za-z])"
    )
    ep["charge_sheet"] = reference.sub(
        lambda match: match.group(1) + swaps[match.group(2)],
        ep.get("charge_sheet", ""),
    )

    # Metadata
    parent_id = ep.get("case_id", "UNKNOWN")
    ep["case_id"] = f"SYN_{parent_id}_SECT"
    ep["source"] = "synthetic"
    ep["parent_case_id"] = parent_id
    ep["perturbation_type"] = "section_ambiguity"
    ep["difficulty"] = "hard"

    return ep


# ── Perturbation 4: Evidence Reversal ────────────────────────────────

def _evidence_reversal(episode: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Weaken the prosecution's case by appending an evidence-retraction note.

    The perturbation only applies when the episode mentions evidence at all:
    the prosecution arguments are searched first for any of the marker words,
    then the charge sheet is searched sentence by sentence (split on '.').
    Tests whether the agent updates its assessment when evidence weakens.

    Args:
        episode: Source episode; it is deep-copied and never modified.

    Returns:
        The perturbed copy tagged with synthetic metadata, or None when no
        evidence mention is found in either place.
    """
    variant = copy.deepcopy(episode)
    markers = ["witness", "evidence", "testimony", "eyewitness"]

    def cites_evidence(text):
        # Case-insensitive substring match against every marker word.
        return any(marker in text.lower() for marker in markers)

    # Prosecution arguments take priority; the charge sheet is the fallback.
    found = any(
        cites_evidence(argument)
        for argument in variant.get("prosecution_arguments", [])
    )
    if not found:
        fragments = (
            piece.strip()
            for piece in variant.get("charge_sheet", "").split('.')
        )
        found = any(cites_evidence(piece) for piece in fragments if piece)

    if not found:
        return None  # Nothing to reverse

    # The reversal itself is recorded as an extra defence argument.
    rebuttals = variant.get("defence_arguments", [])
    rebuttals.append(
        "However, the key prosecution evidence was subsequently found "
        "unreliable — the primary witness retracted their statement and "
        "forensic analysis raised doubts about the physical evidence."
    )
    variant["defence_arguments"] = rebuttals

    # Metadata: tag the copy as synthetic and link it to its parent case.
    origin_id = variant.get("case_id", "UNKNOWN")
    variant.update({
        "case_id": f"SYN_{origin_id}_EVID",
        "source": "synthetic",
        "parent_case_id": origin_id,
        "perturbation_type": "evidence_reversal",
        "difficulty": "hard",
    })
    return variant


# ── Perturbation 5: Surety Complexity ────────────────────────────────

def _surety_complexity(episode: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Introduce a problematic surety so bail conditions need careful thought.

    A note about a non-resident surety is appended to the defence arguments
    and ``accused_profile["surety_status"]`` is set to match; both
    containers are created when the episode lacks them.

    Args:
        episode: Source episode; it is deep-copied and never modified.

    Returns:
        The perturbed copy tagged with synthetic metadata.
    """
    variant = copy.deepcopy(episode)

    surety_note = (
        "Proposed surety is a non-resident relative with no verifiable "
        "local assets or employment in the jurisdiction. Surety bond "
        "amount of Rs. 5,00,000 proposed."
    )
    # setdefault creates the container only when the key is missing, then
    # hands back the stored object for in-place modification.
    variant.setdefault("defence_arguments", []).append(surety_note)
    variant.setdefault("accused_profile", {})["surety_status"] = (
        "non-resident, unverified assets"
    )

    # Metadata: tag the copy as synthetic and link it to its parent case.
    origin_id = variant.get("case_id", "UNKNOWN")
    variant.update({
        "case_id": f"SYN_{origin_id}_SURE",
        "source": "synthetic",
        "parent_case_id": origin_id,
        "perturbation_type": "surety_complexity",
        "difficulty": "hard",
    })
    return variant