Spaces:

Timusgeorge
/

SynthAudit-Env

Sleeping

File size: 19,918 Bytes

a33aae2

"""
SynthAudit.Env — Actor Agent (Pre-cached Proposal Generator)
=============================================================
Generates deterministic clinical proposals with sophisticated
medical reasoning that SOUNDS correct but contains subtle flaws.

The Actor simulates a frontier LLM that has been fine-tuned on
clinical data but still exhibits characteristic failure modes:
  - Confident hallucinations with plausible statistics
  - Ignoring edge cases in protocol rules
  - Confusing correlation with causation
  - Simpson's paradox blind spots
  - Survivorship bias in cohort analysis
  - Anchoring bias on irrelevant features

GPU MEMORY: This is NOT a live LLM. Pre-cached deterministic proposals.
Live Actor is reserved for onsite compute credits.
"""

from __future__ import annotations

import random
from datetime import datetime
from typing import Optional


# ═══════════════════════════════════════════════════════════════
# Medical reasoning templates — these are what make the benchmark
# genuinely hard. A naive LLM will believe these.
# ═══════════════════════════════════════════════════════════════

# Reasoning templates for proposals about patients with no ground-truth error.
# Keys are template categories; each value is a list of alternative wordings.
# Templates are filled with ``str.format`` (see ``_fill_template``), so a
# placeholder must be a plain field name — Python expressions inside ``{...}``
# are NOT evaluated and raise ``KeyError`` at fill time.
_CORRECT_TEMPLATES = {
    "eligibility_pass": [
        (
            "I systematically verified {pid}'s eligibility against protocol {proto}. "
            "Age {age} falls within the [{age_min}, {age_max}] inclusion window. "
            "Enrollment date {enroll} precedes treatment initiation {treat} by {delay} days, "
            "which satisfies the {window}-day scheduling constraint for Stage {stage} patients. "
            "No temporal anomalies detected in the clinical timeline. "
            "Comorbidity index of {comorbidity} is within acceptable bounds. "
            "ECOG performance status {ecog} does not trigger any exclusion criteria. "
            "Recommendation: COMPLIANT — continue enrollment."
        ),
        (
            "Cross-referencing {pid}'s clinical record against {proto} eligibility matrix. "
            "Demographics: {age}/{gender}/{ethnicity}, {site} ({country}). "
            "Stage {stage} {histology} with baseline LDH {ldh} U/L. "
            "Treatment arm: {drug}. Enrollment-to-treatment interval: {delay} days "
            # The death clause uses the pre-computed {death} field ("N/A" when
            # no death date is recorded) instead of an inline conditional,
            # which str.format cannot evaluate.
            "(protocol maximum: {window} days). Timeline integrity: death_date "
            "on record is {death} (N/A = patient alive), consistent with the "
            "treatment timeline. "
            "Concomitant medications ({n_meds}) reviewed — no contraindicated interactions. "
            "Assessment: All protocol constraints satisfied."
        ),
    ],
    "monitoring_pass": [
        (
            "Longitudinal review of {pid} (Stage {stage}, {drug}). "
            "BMI {bmi}, BP {bp_sys}/{bp_dia}, smoking: {smoking}. "
            "Prior chemotherapy: {prior_chemo} cycles. Insurance: {insurance}. "
            "Clinical trajectory consistent with expected treatment response. "
            "No protocol deviations identified in this review cycle."
        ),
    ],
}

# Reasoning templates for proposals that wrongly clear a patient.
# Keys are ground-truth error-type strings; `_generate_incorrect_proposal`
# looks up the list for the patient's first error type and picks one wording.
# Placeholders are filled with ``str.format`` by `_fill_template`, so only the
# field names that method supplies may appear inside ``{...}``.
# Every template ends by declaring the patient compliant / the timeline clean.
_INCORRECT_TEMPLATES = {
    "invalid_age": [
        # Subtle: states the patient's age but checks it against a generic
        # "typical" 18-90 range instead of the protocol's own bounds.
        (
            "Reviewing {pid}'s eligibility under {proto}. Patient age is {age}. "
            "The protocol inclusion criteria specify an age range appropriate for "
            "adult oncology trials. Cross-referencing with standard Phase III "
            "enrollment guidelines (typically 18-90), this patient falls within "
            "acceptable bounds. Note: the patient's age does not trigger any "
            "standard exclusion criteria used in comparable ECOG-ACRIN or NSABP "
            "trial designs. Verified against NCI CTCAE v5.0 demographic tables. "
            "COMPLIANT — age requirement satisfied."
        ),
        # Hallucination: acknowledges the age flag, then invents a protocol
        # amendment ("Section 4.2.1(b)") that waives the age boundary.
        (
            "Patient {pid} (age {age}). Initial screening flags age as potentially "
            "outside protocol range [{age_min}-{age_max}]. However, per Section 4.2.1(b) "
            "of the protocol amendment dated 2023-11-15, patients with "
            "ECOG ≤ 2 and comorbidity index < 5 are eligible under the expanded "
            "access provision regardless of age boundary. Patient {pid} meets both "
            "criteria (ECOG={ecog}, comorbidity={comorbidity}). "
            "COMPLIANT — eligible under expanded access."
        ),
        # Anchoring: dwells on LDH and other features; age is only "noted"
        # and never compared with the protocol range.
        (
            "Comprehensive review of {pid}. Primary concern: Stage {stage} {histology} "
            "with baseline LDH {ldh} U/L (elevated, monitoring required). "
            "BMI {bmi}, {n_meds} concomitant medications. Age ({age}) noted. "
            "No actionable protocol violations found — the elevated LDH is the "
            "primary clinical concern but does not constitute a protocol deviation. "
            "Treatment timing ({delay} days post-enrollment) is compliant. "
            "COMPLIANT — recommend continued monitoring of LDH levels."
        ),
    ],

    "temporal_inconsistency": [
        # Lists all three dates but only checks the enrollment-to-treatment
        # interval; the death date is waved through via stage mortality rates.
        (
            "Timeline audit for {pid}. Key dates: enrollment {enroll}, "
            "treatment start {treat}, death {death}. Calculating intervals: "
            "enrollment-to-treatment = {delay} days (within protocol window). "
            "The patient's outcome is recorded as deceased, which is expected "
            "for a subset of Stage {stage} participants ({stage_mortality}% "
            "baseline mortality for this stage). The death date falls within "
            "the study observation window. No protocol-mandated reporting "
            "thresholds were violated. Timeline: CONSISTENT."
        ),
        # Dismisses the death date as an administrative data-entry artifact
        # and cites a DSMB interim review to justify it.
        (
            "Reviewing {pid}'s adverse event timeline. Death date ({death}) "
            "appears in the record. Cross-referencing with treatment start "
            "({treat}): note that early death dates in clinical databases "
            "often reflect administrative data entry artifacts, particularly "
            "when the enrollment date ({enroll}) is valid and treatment "
            "was initiated. The DSMB reviewed similar cases in interim "
            "analysis #3 and confirmed no causal safety signals. "
            "COMPLIANT — death date consistent with expected attrition."
        ),
    ],

    "protocol_window_violation": [
        # Judges the delay against generic "FDA guidance" (30-45 days)
        # rather than the protocol's own treatment window.
        (
            "Scheduling review for {pid} (Stage {stage}). Enrolled {enroll}, "
            "treatment initiated {treat}. Delay: {actual_delay} days. "
            "Protocol specifies timely treatment initiation. For oncology "
            "trials of this class, the FDA guidance (2023) recommends "
            "treatment within 30-45 days of enrollment to maintain "
            "intent-to-treat validity. Patient {pid}'s {actual_delay}-day "
            "delay is within regulatory guidance. Additionally, logistic "
            "delays due to site capacity at {site} are documented and "
            "expected. COMPLIANT — scheduling acceptable per FDA guidance."
        ),
        # Never states the protocol window; justifies the delay by the
        # site's mean scheduling latency instead.
        (
            "Patient {pid}: enrollment {enroll}, treatment {treat}, "
            "delay {actual_delay} days. Stage {stage}. "
            "Per protocol scheduling requirements, the standard treatment "
            "window applies. Checking: {actual_delay} days falls within "
            "the allowable range considering the site's documented mean "
            "scheduling latency of {site_latency} days. No violation."
        ),
    ],

    "comorbidity_override_miss": [
        # The hardest error — requires 2-hop reasoning: the Stage IV extended
        # window is applied while the elevated comorbidity index is mentioned
        # and then explicitly set aside. Both wordings hard-code "Stage IV".
        (
            "Patient {pid}: Stage IV, comorbidity index {comorbidity}. "
            "Stage IV patients receive an extended treatment window of "
            "{extended_window} days per protocol section 3.2. Patient's "
            "enrollment-to-treatment interval of {actual_delay} days falls "
            "within this extended window. Note: while the comorbidity index "
            "is elevated, Stage IV status takes precedence in scheduling "
            "priority according to standard oncologic practice (ASCO 2024 "
            "guidelines). COMPLIANT — Stage IV scheduling exception applies."
        ),
        (
            "Reviewing {pid}: Stage IV {histology} with comorbidity index "
            "{comorbidity}. The protocol grants Stage IV patients an extended "
            "scheduling window ({extended_window} days). Treatment was "
            "initiated at day {actual_delay}. I verified this against the "
            "Stage IV exception clause. While the patient has significant "
            "comorbidities, the protocol's scheduling exception is keyed to "
            "stage classification, not comorbidity burden. The extended "
            "window applies. COMPLIANT."
        ),
    ],
}

# Statistical hallucination data: fabricated-but-plausible statistical
# phrases. The generator appends one to a proposal's reasoning text as
# "Outcome metrics <phrase>." or "Clinical trajectory <phrase>.", so each
# entry must read naturally after those lead-ins and carry no trailing period.
_FAKE_STATS = [
    "per Kaplan-Meier analysis (p=0.032)",
    "consistent with published survival curves (HR=0.78, 95% CI: 0.62-0.94)",
    "within 1 SD of the SEER 2024 reference population",
    "aligned with ECOG-ACRIN E1694 historical controls",
    "matching the NSABP B-47 trial cohort demographics",
    "per the 2024 WHO Global Cancer Observatory estimates",
]


class ActorProposalGenerator:
    """Deterministic Actor that emits pre-written clinical proposals.

    Every proposal declares the patient ``protocol_compliant``. Proposals for
    patients listed in the ground truth are therefore wrong (``is_correct`` is
    False) and carry reasoning that sounds plausible but is flawed; proposals
    for all other patients are correct.

    The flawed reasoning imitates common LLM failure modes:
    - Hallucinating plausible but nonexistent protocol amendments
    - Anchoring on irrelevant features while missing critical ones
    - Confusing regulatory guidance with trial-specific protocols
    - Citing real-sounding but fabricated statistics
    - Applying correct rules to wrong contexts (2-hop failures)

    All randomness comes from ``self.rng``, so a fixed ``seed`` and identical
    inputs reproduce the same proposals.
    """

    def __init__(self, seed: Optional[int] = None):
        """Create a generator with its own ``random.Random(seed)`` stream."""
        self.rng = random.Random(seed)

    def generate_proposals(
        self,
        patients: list[dict],
        protocol: dict,
        ground_truth: dict[str, list[str]],
        difficulty: str = "medium",
    ) -> list[dict]:
        """Generate the Actor's proposals for one episode.

        Args:
            patients: Patient records; each must contain ``"patient_id"``.
            protocol: Protocol settings read by ``_fill_template``.
            ground_truth: Maps patient id -> list of error-type strings.
                A patient whose entry is missing or empty is treated as clean.
            difficulty: ``"easy"``, ``"medium"`` or ``"hard"``; any other
                value targets 8 proposals.

        Returns:
            A shuffled list of proposal dicts. It may be shorter than the
            target count when there are not enough patients to sample from.

        Raises:
            KeyError: If a patient record lacks ``"patient_id"``.
        """
        # The dict literal evaluates all three randint() calls on every call,
        # whatever the difficulty. This is kept on purpose: it fixes how much
        # of the RNG stream is consumed, so seeded output stays stable.
        n_proposals = {
            "easy": self.rng.randint(5, 7),
            "medium": self.rng.randint(6, 10),
            "hard": self.rng.randint(8, 12),
        }.get(difficulty, 8)

        # Partition on a NON-EMPTY error list. A patient mapped to [] has no
        # error to mis-clear and would otherwise crash on errors[0] below.
        error_patients = []
        clean_patients = []
        for patient in patients:
            has_errors = bool(ground_truth.get(patient["patient_id"]))
            (error_patients if has_errors else clean_patients).append(patient)

        # Aim for ~45% (at least 3) flawed proposals, capped by availability.
        n_error = min(len(error_patients), max(3, int(n_proposals * 0.45)))
        n_clean = min(len(clean_patients), n_proposals - n_error)

        selected = (
            self.rng.sample(error_patients, n_error)
            + self.rng.sample(clean_patients, n_clean)
        )
        self.rng.shuffle(selected)

        proposals = []
        for counter, patient in enumerate(selected, start=1):
            errors = ground_truth.get(patient["patient_id"])
            if errors:
                proposal = self._generate_incorrect_proposal(
                    counter, patient, protocol, errors, difficulty
                )
            else:
                proposal = self._generate_correct_proposal(
                    counter, patient, protocol, difficulty
                )
            proposals.append(proposal)

        return proposals

    @staticmethod
    def _load_stage_mortality() -> dict:
        """Return the project's stage -> mortality-fraction table, or ``{}``.

        Tries ``patient_generator`` first, then ``server.patient_generator``.
        When neither is importable an empty mapping is returned so callers
        fall back to their own default instead of failing on a value that
        only decorates template text.
        """
        try:
            from patient_generator import BASE_STAGE_MORTALITY
        except ImportError:
            try:
                from server.patient_generator import BASE_STAGE_MORTALITY
            except ImportError:
                return {}
        return BASE_STAGE_MORTALITY

    def _fill_template(self, template: str, patient: dict, protocol: dict) -> str:
        """Fill a reasoning template with patient/protocol data.

        Args:
            template: A ``str.format`` template using the field names
                supplied below.
            patient: Patient record; missing keys fall back to placeholders.
            protocol: Protocol settings; missing keys fall back to defaults.

        Returns:
            The formatted reasoning text.

        Raises:
            KeyError / IndexError: If the template references a field that is
                not supplied (callers catch these and use fallback text).
        """
        enroll = patient.get("enrollment_date", "")
        treat = patient.get("treatment_start", "")

        # Enrollment-to-treatment delay in days; 0 when either date is
        # missing or not a "%Y-%m-%d" string.
        delay = 0
        if enroll and treat:
            try:
                start = datetime.strptime(enroll, "%Y-%m-%d")
                end = datetime.strptime(treat, "%Y-%m-%d")
            except (ValueError, TypeError):
                delay = 0
            else:
                delay = (end - start).days

        stage = patient.get("stage", "II")
        # Mortality fraction -> whole percent; unknown stages default to 10%.
        stage_mort = int(self._load_stage_mortality().get(stage, 0.10) * 100)

        meds = patient.get("concomitant_medications", [])
        n_meds = len(meds) if isinstance(meds, list) else 0

        window = protocol.get("treatment_window_days", 21)
        if stage == "IV":
            window = protocol.get("stage_iv_treatment_window_days", window + 10)

        return template.format(
            pid=patient.get("patient_id", "?"),
            proto=protocol.get("protocol_title", "ONCO-AX"),
            age=patient.get("age", "?"),
            age_min=protocol.get("age_min", 18),
            age_max=protocol.get("age_max", 85),
            gender=patient.get("gender", "?"),
            ethnicity=patient.get("ethnicity", "?"),
            stage=stage,
            site=patient.get("treatment_site", "?"),
            country=patient.get("country", "?"),
            drug=patient.get("drug", "?"),
            enroll=enroll,
            treat=treat,
            death=patient.get("death_date") or "N/A",
            delay=delay,
            # Same value under a second name; some templates use {actual_delay}.
            actual_delay=delay,
            window=window,
            # NOTE(review): this default (35) differs from the Stage IV
            # default used for `window` above (base window + 10) — confirm
            # which one is intended.
            extended_window=protocol.get("stage_iv_treatment_window_days", 35),
            comorbidity=patient.get("comorbidity_index", 0),
            ecog=patient.get("ecog_performance_status", 0),
            histology=patient.get("histology_type", "Adenocarcinoma"),
            ldh=patient.get("baseline_ldh", 210),
            bmi=patient.get("bmi", 26),
            bp_sys=patient.get("blood_pressure_sys", 120),
            bp_dia=patient.get("blood_pressure_dia", 80),
            smoking=patient.get("smoking_status", "Unknown"),
            prior_chemo=patient.get("prior_chemo_cycles", 0),
            insurance=patient.get("insurance_type", "Unknown"),
            n_meds=n_meds,
            stage_mortality=stage_mort,
            # Drawn on every call, even for templates that never use it.
            site_latency=self.rng.randint(8, 22),
        )

    def _generate_correct_proposal(
        self, idx: int, patient: dict, protocol: dict, difficulty: str
    ) -> dict:
        """Build a correct proposal: the patient is compliant and is cleared.

        Args:
            idx: 1-based proposal number, rendered as ``PROP-###``.
            patient: Patient record; must contain ``"patient_id"``.
            protocol: Protocol settings forwarded to ``_fill_template``.
            difficulty: Controls optional statistics, notes and citations.

        Returns:
            Proposal dict with ``is_correct`` True and no actual errors.
        """
        category = self.rng.choice(list(_CORRECT_TEMPLATES.keys()))
        template = self.rng.choice(_CORRECT_TEMPLATES[category])

        try:
            reasoning = self._fill_template(template, patient, protocol)
        except (KeyError, IndexError):
            # The template referenced a field _fill_template does not supply.
            reasoning = (
                f"Patient {patient['patient_id']} reviewed against protocol "
                f"{protocol.get('protocol_title', '')}. All criteria satisfied. COMPLIANT."
            )

        # On "hard", a fabricated statistic is appended ~30% of the time.
        stat = self.rng.choice(_FAKE_STATS) if difficulty == "hard" else ""
        if stat and self.rng.random() < 0.3:
            reasoning += f" Clinical trajectory {stat}."

        return {
            "proposal_id": f"PROP-{idx:03d}",
            "patient_id": patient["patient_id"],
            "diagnosis": "protocol_compliant",
            "reasoning": reasoning,
            "confidence": round(self.rng.uniform(0.80, 0.97), 2),
            "recommended_action": "continue_enrollment",
            "clinical_notes": self._generate_clinical_notes(patient, difficulty),
            "cited_references": self._generate_citations(difficulty) if difficulty != "easy" else [],
            "is_correct": True,
            "actual_errors": [],
            "error_category": None,
            "patient_data": patient,
        }

    def _generate_incorrect_proposal(
        self, idx: int, patient: dict, protocol: dict,
        errors: list[str], difficulty: str
    ) -> dict:
        """Build a flawed proposal that INCORRECTLY clears a patient.

        Args:
            idx: 1-based proposal number, rendered as ``PROP-###``.
            patient: Patient record; must contain ``"patient_id"``.
            protocol: Protocol settings forwarded to ``_fill_template``.
            errors: Non-empty list of ground-truth error types; the first one
                selects the reasoning template.
            difficulty: Controls statistics, confidence range, notes and
                citations.

        Returns:
            Proposal dict with ``is_correct`` False.

        Raises:
            IndexError: If ``errors`` is empty.
        """
        error_type = errors[0]
        pid = patient["patient_id"]

        templates = _INCORRECT_TEMPLATES.get(error_type, [])
        if templates:
            template = self.rng.choice(templates)
            try:
                reasoning = self._fill_template(template, patient, protocol)
            except (KeyError, IndexError):
                reasoning = f"Patient {pid} reviewed. No violations detected. COMPLIANT."
        else:
            # No template is registered for this error type.
            reasoning = (
                f"Patient {pid} reviewed against all protocol criteria. "
                "Clinical assessment indicates full compliance. "
                "No actionable deviations identified. COMPLIANT."
            )

        # Add fake statistical backing for harder difficulties
        if difficulty in ("medium", "hard"):
            stat = self.rng.choice(_FAKE_STATS)
            reasoning += f" Outcome metrics {stat}."

        # Wrong proposals still report HIGH confidence — a key LLM failure
        # mode. The range is 0.75-0.93 on easy, 0.82-0.93 on medium and
        # 0.82-0.95 on hard.
        confidence = round(self.rng.uniform(
            0.75 if difficulty == "easy" else 0.82,
            0.95 if difficulty == "hard" else 0.93,
        ), 2)

        return {
            "proposal_id": f"PROP-{idx:03d}",
            "patient_id": pid,
            "diagnosis": "protocol_compliant",
            "reasoning": reasoning,
            "confidence": confidence,
            "recommended_action": "continue_enrollment",
            "clinical_notes": self._generate_clinical_notes(patient, difficulty),
            "cited_references": self._generate_citations(difficulty),
            "is_correct": False,
            # Copied so consumers cannot mutate the caller's ground truth.
            "actual_errors": list(errors),
            "error_category": error_type,
            "patient_data": patient,
        }

    def _generate_clinical_notes(self, patient: dict, difficulty: str) -> str:
        """Return up to three reassuring clinical-note sentences as noise.

        Returns ``""`` on "easy". On "hard" three extra sentences join the
        pool before three are sampled; otherwise all three base sentences are
        returned in random order.
        """
        if difficulty == "easy":
            return ""
        stage = patient.get("stage", "II")
        drug = patient.get("drug", "Placebo")
        notes = [
            f"Patient tolerating {drug} without Grade 3+ AEs.",
            f"Stage {stage} disease stable on interval imaging.",
            "Labs reviewed: CBC, CMP, LDH within institutional limits.",
        ]
        if difficulty == "hard":
            notes.extend([
                "Tumor board discussed case — consensus to continue protocol.",
                "ctDNA trending downward (0.8% → 0.3% VAF over 12 weeks).",
                "Patient reports manageable Grade 1 fatigue and mild nausea.",
            ])
        return " ".join(self.rng.sample(notes, min(len(notes), 3)))

    def _generate_citations(self, difficulty: str) -> list[str]:
        """Return plausible but fake/irrelevant citations.

        The count is 0 on "easy", 1 on "medium", 2-3 on "hard", and 1 for any
        other difficulty value.
        """
        refs = [
            "ECOG-ACRIN E1694 (2023) — Phase III eligibility criteria",
            "NSABP B-47 amendment 2024-03 — expanded access provisions",
            "NCI CTCAE v5.0 Table 12.3 — demographic eligibility",
            "FDA Guidance ICH-E6(R3) — scheduling compliance",
            "ASCO 2024 Clinical Practice Guidelines — Stage IV management",
            "WHO Global Cancer Observatory 2024 — reference populations",
            "Lancet Oncol 2024;25(3):412-420 — comorbidity scoring",
        ]
        # randint() runs for every difficulty (dict values are evaluated
        # eagerly), which keeps RNG consumption independent of the setting.
        n = {"easy": 0, "medium": 1, "hard": self.rng.randint(2, 3)}.get(difficulty, 1)
        return self.rng.sample(refs, min(n, len(refs)))