Spaces:
Sleeping
Sleeping
| """ | |
| SynthAudit.Env β Actor Agent (Pre-cached Proposal Generator) | |
| ============================================================= | |
| Generates deterministic clinical proposals with sophisticated | |
| medical reasoning that SOUNDS correct but contains subtle flaws. | |
| The Actor simulates a frontier LLM that has been fine-tuned on | |
| clinical data but still exhibits characteristic failure modes: | |
| - Confident hallucinations with plausible statistics | |
| - Ignoring edge cases in protocol rules | |
| - Confusing correlation with causation | |
| - Simpson's paradox blind spots | |
| - Survivorship bias in cohort analysis | |
| - Anchoring bias on irrelevant features | |
| GPU MEMORY: This is NOT a live LLM. Pre-cached deterministic proposals. | |
| Live Actor is reserved for onsite compute credits. | |
| """ | |
| from __future__ import annotations | |
| import random | |
| from datetime import datetime | |
| from typing import Optional | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Medical reasoning templates β these are what make the benchmark | |
| # genuinely hard. A naive LLM will believe these. | |
| # βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| _CORRECT_TEMPLATES = { | |
| "eligibility_pass": [ | |
| ( | |
| "I systematically verified {pid}'s eligibility against protocol {proto}. " | |
| "Age {age} falls within the [{age_min}, {age_max}] inclusion window. " | |
| "Enrollment date {enroll} precedes treatment initiation {treat} by {delay} days, " | |
| "which satisfies the {window}-day scheduling constraint for Stage {stage} patients. " | |
| "No temporal anomalies detected in the clinical timeline. " | |
| "Comorbidity index of {comorbidity} is within acceptable bounds. " | |
| "ECOG performance status {ecog} does not trigger any exclusion criteria. " | |
| "Recommendation: COMPLIANT β continue enrollment." | |
| ), | |
| ( | |
| "Cross-referencing {pid}'s clinical record against {proto} eligibility matrix. " | |
| "Demographics: {age}/{gender}/{ethnicity}, {site} ({country}). " | |
| "Stage {stage} {histology} with baseline LDH {ldh} U/L. " | |
| "Treatment arm: {drug}. Enrollment-to-treatment interval: {delay} days " | |
| "(protocol maximum: {window} days). Timeline integrity: death_date " | |
| "{'not recorded (patient alive)' if not death else 'is ' + str(death) + ', post-treatment'}. " | |
| "Concomitant medications ({n_meds}) reviewed β no contraindicated interactions. " | |
| "Assessment: All protocol constraints satisfied." | |
| ), | |
| ], | |
| "monitoring_pass": [ | |
| ( | |
| "Longitudinal review of {pid} (Stage {stage}, {drug}). " | |
| "BMI {bmi}, BP {bp_sys}/{bp_dia}, smoking: {smoking}. " | |
| "Prior chemotherapy: {prior_chemo} cycles. Insurance: {insurance}. " | |
| "Clinical trajectory consistent with expected treatment response. " | |
| "No protocol deviations identified in this review cycle." | |
| ), | |
| ], | |
| } | |
| _INCORRECT_TEMPLATES = { | |
| "invalid_age": [ | |
| # Subtle: Actor acknowledges the age value but misreads the protocol bounds | |
| ( | |
| "Reviewing {pid}'s eligibility under {proto}. Patient age is {age}. " | |
| "The protocol inclusion criteria specify an age range appropriate for " | |
| "adult oncology trials. Cross-referencing with standard Phase III " | |
| "enrollment guidelines (typically 18-90), this patient falls within " | |
| "acceptable bounds. Note: the patient's age does not trigger any " | |
| "standard exclusion criteria used in comparable ECOG-ACRIN or NSABP " | |
| "trial designs. Verified against NCI CTCAE v5.0 demographic tables. " | |
| "COMPLIANT β age requirement satisfied." | |
| ), | |
| # Hallucination: Actor invents a sub-protocol exception | |
| ( | |
| "Patient {pid} (age {age}). Initial screening flags age as potentially " | |
| "outside protocol range [{age_min}-{age_max}]. However, per Section 4.2.1(b) " | |
| "of the protocol amendment dated 2023-11-15, patients with " | |
| "ECOG β€ 2 and comorbidity index < 5 are eligible under the expanded " | |
| "access provision regardless of age boundary. Patient {pid} meets both " | |
| "criteria (ECOG={ecog}, comorbidity={comorbidity}). " | |
| "COMPLIANT β eligible under expanded access." | |
| ), | |
| # Anchoring: Actor focuses on other features and dismisses age | |
| ( | |
| "Comprehensive review of {pid}. Primary concern: Stage {stage} {histology} " | |
| "with baseline LDH {ldh} U/L (elevated, monitoring required). " | |
| "BMI {bmi}, {n_meds} concomitant medications. Age ({age}) noted. " | |
| "No actionable protocol violations found β the elevated LDH is the " | |
| "primary clinical concern but does not constitute a protocol deviation. " | |
| "Treatment timing ({delay} days post-enrollment) is compliant. " | |
| "COMPLIANT β recommend continued monitoring of LDH levels." | |
| ), | |
| ], | |
| "temporal_inconsistency": [ | |
| # Actor confuses date ordering with different logic | |
| ( | |
| "Timeline audit for {pid}. Key dates: enrollment {enroll}, " | |
| "treatment start {treat}, death {death}. Calculating intervals: " | |
| "enrollment-to-treatment = {delay} days (within protocol window). " | |
| "The patient's outcome is recorded as deceased, which is expected " | |
| "for a subset of Stage {stage} participants ({stage_mortality}% " | |
| "baseline mortality for this stage). The death date falls within " | |
| "the study observation window. No protocol-mandated reporting " | |
| "thresholds were violated. Timeline: CONSISTENT." | |
| ), | |
| # Actor claims death_date is administrative | |
| ( | |
| "Reviewing {pid}'s adverse event timeline. Death date ({death}) " | |
| "appears in the record. Cross-referencing with treatment start " | |
| "({treat}): note that early death dates in clinical databases " | |
| "often reflect administrative data entry artifacts, particularly " | |
| "when the enrollment date ({enroll}) is valid and treatment " | |
| "was initiated. The DSMB reviewed similar cases in interim " | |
| "analysis #3 and confirmed no causal safety signals. " | |
| "COMPLIANT β death date consistent with expected attrition." | |
| ), | |
| ], | |
| "protocol_window_violation": [ | |
| # Actor uses wrong window calculation | |
| ( | |
| "Scheduling review for {pid} (Stage {stage}). Enrolled {enroll}, " | |
| "treatment initiated {treat}. Delay: {actual_delay} days. " | |
| "Protocol specifies timely treatment initiation. For oncology " | |
| "trials of this class, the FDA guidance (2023) recommends " | |
| "treatment within 30-45 days of enrollment to maintain " | |
| "intent-to-treat validity. Patient {pid}'s {actual_delay}-day " | |
| "delay is within regulatory guidance. Additionally, logistic " | |
| "delays due to site capacity at {site} are documented and " | |
| "expected. COMPLIANT β scheduling acceptable per FDA guidance." | |
| ), | |
| # Actor applies wrong stage exception | |
| ( | |
| "Patient {pid}: enrollment {enroll}, treatment {treat}, " | |
| "delay {actual_delay} days. Stage {stage}. " | |
| "Per protocol scheduling requirements, the standard treatment " | |
| "window applies. Checking: {actual_delay} days falls within " | |
| "the allowable range considering the site's documented mean " | |
| "scheduling latency of {site_latency} days. No violation." | |
| ), | |
| ], | |
| "comorbidity_override_miss": [ | |
| # The hardest error β requires 2-hop reasoning | |
| ( | |
| "Patient {pid}: Stage IV, comorbidity index {comorbidity}. " | |
| "Stage IV patients receive an extended treatment window of " | |
| "{extended_window} days per protocol section 3.2. Patient's " | |
| "enrollment-to-treatment interval of {actual_delay} days falls " | |
| "within this extended window. Note: while the comorbidity index " | |
| "is elevated, Stage IV status takes precedence in scheduling " | |
| "priority according to standard oncologic practice (ASCO 2024 " | |
| "guidelines). COMPLIANT β Stage IV scheduling exception applies." | |
| ), | |
| ( | |
| "Reviewing {pid}: Stage IV {histology} with comorbidity index " | |
| "{comorbidity}. The protocol grants Stage IV patients an extended " | |
| "scheduling window ({extended_window} days). Treatment was " | |
| "initiated at day {actual_delay}. I verified this against the " | |
| "Stage IV exception clause. While the patient has significant " | |
| "comorbidities, the protocol's scheduling exception is keyed to " | |
| "stage classification, not comorbidity burden. The extended " | |
| "window applies. COMPLIANT." | |
| ), | |
| ], | |
| } | |
| # Statistical hallucination data | |
| _FAKE_STATS = [ | |
| "per Kaplan-Meier analysis (p=0.032)", | |
| "consistent with published survival curves (HR=0.78, 95% CI: 0.62-0.94)", | |
| "within 1 SD of the SEER 2024 reference population", | |
| "aligned with ECOG-ACRIN E1694 historical controls", | |
| "matching the NSABP B-47 trial cohort demographics", | |
| "per the 2024 WHO Global Cancer Observatory estimates", | |
| ] | |
| class ActorProposalGenerator: | |
| """Sophisticated deterministic Actor that generates clinical proposals | |
| with realistic medical reasoning β some correct, some subtly flawed. | |
| The Actor simulates common LLM failure modes: | |
| - Hallucinating plausible but nonexistent protocol amendments | |
| - Anchoring on irrelevant features while missing critical ones | |
| - Confusing regulatory guidance with trial-specific protocols | |
| - Citing real-sounding but fabricated statistics | |
| - Applying correct rules to wrong contexts (2-hop failures) | |
| """ | |
| def __init__(self, seed: Optional[int] = None): | |
| self.rng = random.Random(seed) | |
| def generate_proposals( | |
| self, | |
| patients: list[dict], | |
| protocol: dict, | |
| ground_truth: dict[str, list[str]], | |
| difficulty: str = "medium", | |
| ) -> list[dict]: | |
| """Generate Actor proposals for an episode.""" | |
| proposals = [] | |
| proposal_counter = 0 | |
| n_proposals = { | |
| "easy": self.rng.randint(5, 7), | |
| "medium": self.rng.randint(6, 10), | |
| "hard": self.rng.randint(8, 12), | |
| }.get(difficulty, 8) | |
| error_patients = [p for p in patients if p["patient_id"] in ground_truth] | |
| clean_patients = [p for p in patients if p["patient_id"] not in ground_truth] | |
| n_error = min(len(error_patients), max(3, int(n_proposals * 0.45))) | |
| n_clean = n_proposals - n_error | |
| selected_errors = self.rng.sample(error_patients, min(n_error, len(error_patients))) | |
| selected_clean = self.rng.sample(clean_patients, min(n_clean, len(clean_patients))) | |
| selected = selected_errors + selected_clean | |
| self.rng.shuffle(selected) | |
| for patient in selected: | |
| proposal_counter += 1 | |
| pid = patient["patient_id"] | |
| if pid in ground_truth: | |
| proposal = self._generate_incorrect_proposal( | |
| proposal_counter, patient, protocol, ground_truth[pid], difficulty | |
| ) | |
| else: | |
| proposal = self._generate_correct_proposal( | |
| proposal_counter, patient, protocol, difficulty | |
| ) | |
| proposals.append(proposal) | |
| return proposals | |
| def _fill_template(self, template: str, patient: dict, protocol: dict) -> str: | |
| """Fill a reasoning template with patient/protocol data.""" | |
| enroll = patient.get("enrollment_date", "") | |
| treat = patient.get("treatment_start", "") | |
| delay = 0 | |
| if enroll and treat: | |
| try: | |
| d1 = datetime.strptime(enroll, "%Y-%m-%d") | |
| d2 = datetime.strptime(treat, "%Y-%m-%d") | |
| delay = (d2 - d1).days | |
| except (ValueError, TypeError): | |
| delay = 0 | |
| try: | |
| from patient_generator import BASE_STAGE_MORTALITY | |
| except ImportError: | |
| from server.patient_generator import BASE_STAGE_MORTALITY | |
| stage = patient.get("stage", "II") | |
| stage_mort = int(BASE_STAGE_MORTALITY.get(stage, 0.10) * 100) | |
| meds = patient.get("concomitant_medications", []) | |
| if isinstance(meds, list): | |
| n_meds = len(meds) | |
| else: | |
| n_meds = 0 | |
| window = protocol.get("treatment_window_days", 21) | |
| if stage == "IV": | |
| window = protocol.get("stage_iv_treatment_window_days", window + 10) | |
| return template.format( | |
| pid=patient.get("patient_id", "?"), | |
| proto=protocol.get("protocol_title", "ONCO-AX"), | |
| age=patient.get("age", "?"), | |
| age_min=protocol.get("age_min", 18), | |
| age_max=protocol.get("age_max", 85), | |
| gender=patient.get("gender", "?"), | |
| ethnicity=patient.get("ethnicity", "?"), | |
| stage=stage, | |
| site=patient.get("treatment_site", "?"), | |
| country=patient.get("country", "?"), | |
| drug=patient.get("drug", "?"), | |
| enroll=enroll, | |
| treat=treat, | |
| death=patient.get("death_date") or "N/A", | |
| delay=delay, | |
| actual_delay=delay, | |
| window=window, | |
| extended_window=protocol.get("stage_iv_treatment_window_days", 35), | |
| comorbidity=patient.get("comorbidity_index", 0), | |
| ecog=patient.get("ecog_performance_status", 0), | |
| histology=patient.get("histology_type", "Adenocarcinoma"), | |
| ldh=patient.get("baseline_ldh", 210), | |
| bmi=patient.get("bmi", 26), | |
| bp_sys=patient.get("blood_pressure_sys", 120), | |
| bp_dia=patient.get("blood_pressure_dia", 80), | |
| smoking=patient.get("smoking_status", "Unknown"), | |
| prior_chemo=patient.get("prior_chemo_cycles", 0), | |
| insurance=patient.get("insurance_type", "Unknown"), | |
| n_meds=n_meds, | |
| stage_mortality=stage_mort, | |
| site_latency=self.rng.randint(8, 22), | |
| ) | |
| def _generate_correct_proposal( | |
| self, idx: int, patient: dict, protocol: dict, difficulty: str | |
| ) -> dict: | |
| """Actor correctly identifies patient as compliant.""" | |
| category = self.rng.choice(list(_CORRECT_TEMPLATES.keys())) | |
| templates = _CORRECT_TEMPLATES[category] | |
| template = self.rng.choice(templates) | |
| try: | |
| reasoning = self._fill_template(template, patient, protocol) | |
| except (KeyError, IndexError): | |
| reasoning = ( | |
| f"Patient {patient['patient_id']} reviewed against protocol " | |
| f"{protocol.get('protocol_title', '')}. All criteria satisfied. COMPLIANT." | |
| ) | |
| stat = self.rng.choice(_FAKE_STATS) if difficulty == "hard" else "" | |
| if stat and self.rng.random() < 0.3: | |
| reasoning += f" Clinical trajectory {stat}." | |
| return { | |
| "proposal_id": f"PROP-{idx:03d}", | |
| "patient_id": patient["patient_id"], | |
| "diagnosis": "protocol_compliant", | |
| "reasoning": reasoning, | |
| "confidence": round(self.rng.uniform(0.80, 0.97), 2), | |
| "recommended_action": "continue_enrollment", | |
| "clinical_notes": self._generate_clinical_notes(patient, difficulty), | |
| "cited_references": self._generate_citations(difficulty) if difficulty != "easy" else [], | |
| "is_correct": True, | |
| "actual_errors": [], | |
| "error_category": None, | |
| "patient_data": patient, | |
| } | |
| def _generate_incorrect_proposal( | |
| self, idx: int, patient: dict, protocol: dict, | |
| errors: list[str], difficulty: str | |
| ) -> dict: | |
| """Actor INCORRECTLY clears a patient. Reasoning sounds correct but is flawed.""" | |
| error_type = errors[0] | |
| pid = patient["patient_id"] | |
| templates = _INCORRECT_TEMPLATES.get(error_type, []) | |
| if templates: | |
| template = self.rng.choice(templates) | |
| try: | |
| reasoning = self._fill_template(template, patient, protocol) | |
| except (KeyError, IndexError): | |
| reasoning = f"Patient {pid} reviewed. No violations detected. COMPLIANT." | |
| else: | |
| reasoning = ( | |
| f"Patient {pid} reviewed against all protocol criteria. " | |
| f"Clinical assessment indicates full compliance. " | |
| f"No actionable deviations identified. COMPLIANT." | |
| ) | |
| # Add fake statistical backing for harder difficulties | |
| if difficulty in ("medium", "hard"): | |
| stat = self.rng.choice(_FAKE_STATS) | |
| reasoning += f" Outcome metrics {stat}." | |
| # Confidence inversely correlates with actual correctness (subtly) | |
| # Bad proposals sometimes have HIGH confidence - a key LLM failure mode | |
| confidence = round(self.rng.uniform( | |
| 0.75 if difficulty == "easy" else 0.82, | |
| 0.95 if difficulty == "hard" else 0.93, | |
| ), 2) | |
| return { | |
| "proposal_id": f"PROP-{idx:03d}", | |
| "patient_id": pid, | |
| "diagnosis": "protocol_compliant", | |
| "reasoning": reasoning, | |
| "confidence": confidence, | |
| "recommended_action": "continue_enrollment", | |
| "clinical_notes": self._generate_clinical_notes(patient, difficulty), | |
| "cited_references": self._generate_citations(difficulty), | |
| "is_correct": False, | |
| "actual_errors": errors, | |
| "error_category": error_type, | |
| "patient_data": patient, | |
| } | |
| def _generate_clinical_notes(self, patient: dict, difficulty: str) -> str: | |
| """Generate realistic clinical notes that add noise.""" | |
| if difficulty == "easy": | |
| return "" | |
| stage = patient.get("stage", "II") | |
| drug = patient.get("drug", "Placebo") | |
| notes = [ | |
| f"Patient tolerating {drug} without Grade 3+ AEs.", | |
| f"Stage {stage} disease stable on interval imaging.", | |
| f"Labs reviewed: CBC, CMP, LDH within institutional limits.", | |
| ] | |
| if difficulty == "hard": | |
| notes.extend([ | |
| f"Tumor board discussed case β consensus to continue protocol.", | |
| f"ctDNA trending downward (0.8% β 0.3% VAF over 12 weeks).", | |
| f"Patient reports manageable Grade 1 fatigue and mild nausea.", | |
| ]) | |
| return " ".join(self.rng.sample(notes, min(len(notes), 3))) | |
| def _generate_citations(self, difficulty: str) -> list[str]: | |
| """Generate plausible but fake/irrelevant citations.""" | |
| refs = [ | |
| "ECOG-ACRIN E1694 (2023) β Phase III eligibility criteria", | |
| "NSABP B-47 amendment 2024-03 β expanded access provisions", | |
| "NCI CTCAE v5.0 Table 12.3 β demographic eligibility", | |
| "FDA Guidance ICH-E6(R3) β scheduling compliance", | |
| "ASCO 2024 Clinical Practice Guidelines β Stage IV management", | |
| "WHO Global Cancer Observatory 2024 β reference populations", | |
| "Lancet Oncol 2024;25(3):412-420 β comorbidity scoring", | |
| ] | |
| n = {"easy": 0, "medium": 1, "hard": self.rng.randint(2, 3)}.get(difficulty, 1) | |
| return self.rng.sample(refs, min(n, len(refs))) | |