""" Task 1: Methodology Audit — EASY (PeerGuard: CONSORT Protocol Violation Audit) --------------------------------- Agent reads a synthetic clinical trial paper stub and must identify 4 planted CONSORT protocol violations. Paper is procedurally generated each episode to prevent memorisation. Flaw injection system: - Templates have slots: [STATISTICAL_TEST], [GROUP_A], [GROUP_B], etc. - At generation time, a valid combination is picked, then one or more flaws are injected by replacing the valid choice with an incorrect one. - Ground truth records which taxonomy applies and where. """ from __future__ import annotations import random import textwrap from typing import Optional from tasks.base import BaseTask # --------------------------------------------------------------------------- # Vocabulary pools — the raw material for procedural generation # --------------------------------------------------------------------------- DOMAINS = [ { "field": "clinical trial", "intervention": ["Drug A", "Treatment B", "Compound X", "Therapy Z"], "outcome": ["recovery time", "symptom severity", "blood pressure", "pain score"], "group_type": "patients", }, { "field": "psychology study", "intervention": ["Mindfulness training", "Cognitive therapy", "Group therapy", "App-based intervention"], "outcome": ["anxiety score", "depression score", "stress levels", "cognitive performance"], "group_type": "participants", }, { "field": "educational study", "intervention": ["Active learning", "Flipped classroom", "Peer tutoring", "Digital tools"], "outcome": ["exam scores", "knowledge retention", "engagement", "completion rate"], "group_type": "students", }, ] # (test_name, valid_data_types, invalid_for) STAT_TESTS = { "independent samples t-test": {"valid": ["continuous", "normally distributed"], "invalid_for": ["categorical", "ordinal", "non-normal"]}, "chi-square test": {"valid": ["categorical", "frequency data"], "invalid_for": ["continuous", "time-series"]}, "Mann-Whitney U test": {"valid": ["ordinal", "non-normal continuous"], "invalid_for": ["normally distributed small samples"]}, "one-way ANOVA": {"valid": ["continuous", "3+ groups", "normal"], "invalid_for": ["binary outcome", "two groups"]}, "Pearson correlation": {"valid": ["continuous linear relationship"], "invalid_for": ["ordinal", "non-linear"]}, "logistic regression": {"valid": ["binary outcome", "categorical"], "invalid_for": ["continuous outcome", "ordinal multi-class"]}, } # Flaw templates: (flaw_taxonomy, description_template, section_hint) FLAW_TEMPLATES = [ { "taxonomy": "unblinded_investigator_bias", "inject": lambda ctx, rng: _inject_unblinded_bias(ctx, rng), "section": "statistical_analysis", }, { "taxonomy": "insufficient_power_analysis", "inject": lambda ctx, rng: _inject_insufficient_power(ctx, rng), "section": "participants", }, { "taxonomy": "protocol_deviation_unreported", "inject": lambda ctx, rng: _inject_protocol_deviation(ctx, rng), "section": "results", }, { "taxonomy": "endpoint_switching", "inject": lambda ctx, rng: _inject_endpoint_switching(ctx, rng), "section": "results", }, ] # --------------------------------------------------------------------------- # Task class # --------------------------------------------------------------------------- class MethodologyAuditTask(BaseTask): task_id = "task1_methodology_audit" task_name = "CONSORT Protocol Violation Audit" difficulty = "easy" max_steps = 20 def generate_episode(self) -> dict: rng = self.rng domain = rng.choice(DOMAINS) ctx = { "field": domain["field"], "intervention": rng.choice(domain["intervention"]), "outcome": rng.choice(domain["outcome"]), "group_type": domain["group_type"], "n_per_group": rng.choice([18, 22, 24, 26, 28]), # small → underpowered "total_n": 0, "alpha": 0.05, "rng": rng, "flaws_text": {}, # section -> flaw sentence injected "flaw_notes": [], # ground truth list } ctx["total_n"] = ctx["n_per_group"] * 2 # Inject all 4 flaws flaw_ids = [] for i, ft in enumerate(FLAW_TEMPLATES): flaw_id = f"flaw_{i+1}" flaw_sentence, flaw_note = ft["inject"](ctx, rng) ctx["flaws_text"][ft["section"]] = ctx["flaws_text"].get(ft["section"], "") + " " + flaw_sentence flaw_note.update({ "id": flaw_id, "taxonomy": ft["taxonomy"], "location": ft["section"], }) ctx["flaw_notes"].append(flaw_note) flaw_ids.append(flaw_id) sections = _build_sections(ctx) paper_text = _build_paper_text(ctx, sections) ground_truth = { "flaws": ctx["flaw_notes"], "flaw_sections": list({f["location"] for f in ctx["flaw_notes"]}), "flaw_ids": flaw_ids, "n_flaws": 4, } return { "paper_text": paper_text, "paper_sections": sections, "dataset_path": None, "ground_truth": ground_truth, } def _action_schema(self) -> dict: return { "read_section": {"section": "str — e.g. 'abstract', 'methods', 'statistical_analysis', 'results'"}, "flag_flaw": {"flaw_type": "str", "location": "str", "description": "str"}, "submit_audit": {"audit_payload": {"flaws": "[{flaw_type, location, description}]"}}, } @classmethod def generate(cls, seed=None): """Convenience method for Task 5 consumption.""" task = cls(seed=seed) ep = task.generate_episode() state = {"paper_text": ep["paper_text"], "dataset_path": ep.get("dataset_path")} return state, ep["ground_truth"] # --------------------------------------------------------------------------- # Flaw injectors — CONSORT protocol violations # --------------------------------------------------------------------------- def _inject_unblinded_bias(ctx: dict, rng: random.Random) -> tuple[str, dict]: """Inject unblinded investigator bias (CONSORT violation).""" outcome = ctx["outcome"] sentence = ( f"The principal investigator, who was unblinded to group allocation, " f"personally assessed {outcome} across all {ctx['group_type']}. " f"No independent blinded assessor was used for outcome evaluation." ) note = { "description": f"Unblinded investigator assessed primary outcome ({outcome}); " f"CONSORT requires blinded outcome assessment to prevent detection bias.", "hint_keywords": ["unblinded", "investigator bias", "detection bias", "blinding", "assessor"], } return sentence, note def _inject_insufficient_power(ctx: dict, rng: random.Random) -> tuple[str, dict]: """Inject an insufficient power analysis (CONSORT violation).""" n = ctx["n_per_group"] sentence = ( f"A total of {ctx['total_n']} {ctx['group_type']} were recruited " f"({n} per group). No a priori power analysis was conducted " f"and no sample size justification was provided per ICH-GCP guidelines." ) note = { "description": f"Sample of {n} per group is underpowered for detecting medium effects " f"(typically requires n≥64 per group at 80% power, α=0.05). " f"No power analysis reported — CONSORT Item 7a violation.", "hint_keywords": ["underpowered", "sample size", "power analysis", "insufficient power", "ICH-GCP"], } return sentence, note def _inject_protocol_deviation(ctx: dict, rng: random.Random) -> tuple[str, dict]: """Inject unreported protocol deviation (CONSORT violation).""" n_excluded = rng.choice([5, 7, 9, 11]) original_n = ctx["total_n"] reported_n = original_n - n_excluded outcome = ctx["outcome"] p_val = round(rng.uniform(0.02, 0.048), 3) sentence = ( f"After per-protocol analysis, the {ctx['intervention']} group showed significantly " f"improved {outcome} compared to placebo (p={p_val}, n={reported_n}), " f"representing a clinically meaningful improvement. Protocol deviations " f"were not separately reported in the CONSORT flow diagram." ) note = { "description": f"Results reported for n={reported_n} but {original_n} were recruited. " f"{n_excluded} participants excluded from analysis without disclosure " f"in the CONSORT flow diagram — protocol deviation unreported.", "hint_keywords": ["protocol deviation", "exclusion", "excluded", "undisclosed", "CONSORT flow"], "excluded_n": n_excluded, "reported_n": reported_n, "original_n": original_n, } return sentence, note def _inject_endpoint_switching(ctx: dict, rng: random.Random) -> tuple[str, dict]: """Inject endpoint switching (CONSORT violation).""" outcomes_tested = [ctx["outcome"], "secondary biomarker", "quality of life score", "adverse event rate", "dropout rate"] rng.shuffle(outcomes_tested) sentence = ( f"The pre-registered primary endpoint was {outcomes_tested[1]}, however " f"multiple secondary outcomes were assessed including " f"{', '.join(outcomes_tested[:3])}. " f"Only {ctx['outcome']} reached statistical significance (p=0.043) " f"and is reported as the primary outcome in the final analysis." ) note = { "description": f"Primary endpoint was switched post-hoc. Multiple outcomes tested " f"({len(outcomes_tested[:3])}) without correction for multiple comparisons. " f"Original primary endpoint did not reach significance — CONSORT violation.", "hint_keywords": ["endpoint switching", "primary endpoint", "outcome switching", "multiple comparison", "selective reporting"], } return sentence, note # --------------------------------------------------------------------------- # Paper builder # --------------------------------------------------------------------------- def _build_sections(ctx: dict) -> dict: field = ctx["field"] intervention = ctx["intervention"] outcome = ctx["outcome"] group_type = ctx["group_type"] total_n = ctx["total_n"] n_per_group = ctx["n_per_group"] abstract = textwrap.dedent(f""" Background: This {field} evaluated the efficacy of {intervention} on {outcome} in accordance with ICH-GCP guidelines and CONSORT reporting standards. We hypothesised that {group_type} receiving {intervention} would demonstrate significantly better {outcome} compared to those receiving placebo. Methods: A randomised controlled trial design was employed per IRB approval. Results: {intervention} produced statistically significant improvements. Conclusion: These findings support adoption of {intervention} in clinical practice. """).strip() participants = textwrap.dedent(f""" {total_n} {group_type} were enrolled from three clinical sites between 2021 and 2023 under IRB protocol #2021-CT-{ctx['rng'].randint(100,999)}. Inclusion criteria: aged 18–65, no prior treatment exposure, written informed consent obtained per Declaration of Helsinki. Exclusion criteria: severe comorbidities, inability to complete clinical assessments. {ctx['flaws_text'].get('participants', '')} Participants were randomly assigned to {intervention} (n={n_per_group}) or placebo (n={n_per_group}) using block randomisation (block size=4) per the CONSORT-compliant allocation sequence. """).strip() statistical_analysis = textwrap.dedent(f""" All analyses were performed using SAS v9.4 and Python 3.10 per the pre-registered Statistical Analysis Plan (SAP). The primary outcome ({outcome}) was compared between groups at 12 weeks. {ctx['flaws_text'].get('statistical_analysis', '')} Significance threshold was set at α=0.05. Missing data handled via last observation carried forward (LOCF). No corrections for multiple comparisons were pre-specified in the protocol or SAP. """).strip() results = textwrap.dedent(f""" {ctx['flaws_text'].get('results', '')} Secondary analysis of subgroups by age and sex showed consistent direction of effect. Adverse events were minor and balanced across arms (p=0.71). Full CONSORT flow diagram available in supplementary materials. """).strip() discussion = textwrap.dedent(f""" The present study demonstrates that {intervention} significantly improves {outcome} in {group_type}. These results are consistent with prior mechanistic studies and meet the bar for regulatory consideration. Limitations include the single-blind design and relatively short follow-up period of 12 weeks. Future work should examine long-term durability of effects and dose-response relationships per FDA post-marketing surveillance requirements. Generalisability may be limited to populations similar to those studied. """).strip() return { "abstract": abstract, "participants": participants, "statistical_analysis": statistical_analysis, "results": results, "discussion": discussion, } def _build_paper_text(ctx: dict, sections: dict) -> str: intervention = ctx["intervention"] outcome = ctx["outcome"] field = ctx["field"] return textwrap.dedent(f""" TITLE: Efficacy of {intervention} on {outcome}: A CONSORT-compliant randomised controlled {field} ABSTRACT {sections['abstract']} 1. PARTICIPANTS (CONSORT Items 3-5) {sections['participants']} 2. STATISTICAL ANALYSIS (CONSORT Item 12) {sections['statistical_analysis']} 3. RESULTS (CONSORT Items 13-19) {sections['results']} 4. DISCUSSION (CONSORT Items 20-22) {sections['discussion']} --- Available sections for read_section: abstract, participants, statistical_analysis, results, discussion """).strip()