research-integrity-gym / tasks /task1_methodology_audit.py
Bhavishya011
refactor: PeerGuard clinical trial verification system
8122ba9
"""
Task 1: Methodology Audit — EASY (PeerGuard: CONSORT Protocol Violation Audit)
---------------------------------
Agent reads a synthetic clinical trial paper stub and must identify 4 planted
CONSORT protocol violations. Paper is procedurally generated each episode
to prevent memorisation.
Flaw injection system:
- Each entry in FLAW_TEMPLATES pairs a flaw taxonomy with an injector
function and the paper section that receives the flaw.
- At generation time every injector runs once; it returns a flaw sentence
that is spliced into its section, plus a ground-truth note.
- Ground truth records which taxonomy applies and where.
"""
from __future__ import annotations
import random
import textwrap
from typing import Optional
from tasks.base import BaseTask
# ---------------------------------------------------------------------------
# Vocabulary pools — the raw material for procedural generation
# ---------------------------------------------------------------------------
# One entry per study domain. generate_episode picks a domain and then picks an
# intervention and an outcome from it with rng.choice, so the order of every
# list below is part of the seeded output and must not be rearranged.
DOMAINS = [
    dict(
        field="clinical trial",
        intervention=["Drug A", "Treatment B", "Compound X", "Therapy Z"],
        outcome=["recovery time", "symptom severity", "blood pressure", "pain score"],
        group_type="patients",
    ),
    dict(
        field="psychology study",
        intervention=[
            "Mindfulness training",
            "Cognitive therapy",
            "Group therapy",
            "App-based intervention",
        ],
        outcome=["anxiety score", "depression score", "stress levels", "cognitive performance"],
        group_type="participants",
    ),
    dict(
        field="educational study",
        intervention=["Active learning", "Flipped classroom", "Peer tutoring", "Digital tools"],
        outcome=["exam scores", "knowledge retention", "engagement", "completion rate"],
        group_type="students",
    ),
]
# Rows of (test_name, valid_data_types, invalid_for). They are expanded below
# into a lookup of the form  test_name -> {"valid": [...], "invalid_for": [...]}.
_STAT_TEST_ROWS = (
    (
        "independent samples t-test",
        ["continuous", "normally distributed"],
        ["categorical", "ordinal", "non-normal"],
    ),
    (
        "chi-square test",
        ["categorical", "frequency data"],
        ["continuous", "time-series"],
    ),
    (
        "Mann-Whitney U test",
        ["ordinal", "non-normal continuous"],
        ["normally distributed small samples"],
    ),
    (
        "one-way ANOVA",
        ["continuous", "3+ groups", "normal"],
        ["binary outcome", "two groups"],
    ),
    (
        "Pearson correlation",
        ["continuous linear relationship"],
        ["ordinal", "non-linear"],
    ),
    (
        "logistic regression",
        ["binary outcome", "categorical"],
        ["continuous outcome", "ordinal multi-class"],
    ),
)
STAT_TESTS = {
    test_name: {"valid": valid_for, "invalid_for": invalid_for}
    for test_name, valid_for, invalid_for in _STAT_TEST_ROWS
}
# Flaw templates: one dict per planted violation, with the keys
#   "taxonomy" - label stored in the ground truth for this flaw
#   "inject"   - callable (ctx, rng) -> (flaw_sentence, flaw_note)
#   "section"  - key of the paper section that receives the flaw sentence
# The injectors are wrapped in lambdas because the _inject_* functions are
# defined further down the module; the lambda postpones the name lookup until
# the template is actually called. List order determines the flaw ids
# (flaw_1, flaw_2, ...) assigned in generate_episode.
FLAW_TEMPLATES = [
    {
        "taxonomy": "unblinded_investigator_bias",
        "inject": lambda ctx, rng: _inject_unblinded_bias(ctx, rng),
        "section": "statistical_analysis",
    },
    {
        "taxonomy": "insufficient_power_analysis",
        "inject": lambda ctx, rng: _inject_insufficient_power(ctx, rng),
        "section": "participants",
    },
    {
        "taxonomy": "protocol_deviation_unreported",
        "inject": lambda ctx, rng: _inject_protocol_deviation(ctx, rng),
        "section": "results",
    },
    {
        "taxonomy": "endpoint_switching",
        "inject": lambda ctx, rng: _inject_endpoint_switching(ctx, rng),
        "section": "results",
    },
]
# ---------------------------------------------------------------------------
# Task class
# ---------------------------------------------------------------------------
class MethodologyAuditTask(BaseTask):
    """Easy-tier task: audit a generated trial paper for planted CONSORT violations.

    Each episode builds a fresh paper from the vocabulary in ``DOMAINS`` and
    runs every injector listed in ``FLAW_TEMPLATES`` exactly once, so the
    number of planted flaws always equals ``len(FLAW_TEMPLATES)``.
    """

    task_id = "task1_methodology_audit"
    task_name = "CONSORT Protocol Violation Audit"
    difficulty = "easy"
    max_steps = 20

    def generate_episode(self) -> dict:
        """Generate one paper together with its ground truth.

        Reads ``self.rng`` (presumably supplied by ``BaseTask`` - not visible
        in this file) as the only source of randomness.

        Returns:
            A dict with the keys ``paper_text`` (full rendered paper),
            ``paper_sections`` (section name -> text), ``dataset_path``
            (always ``None`` for this task) and ``ground_truth``.
        """
        rng = self.rng
        domain = rng.choice(DOMAINS)
        # Dict entries are evaluated top to bottom, so the rng draws happen in
        # the order intervention -> outcome -> n_per_group.
        ctx = {
            "field": domain["field"],
            "intervention": rng.choice(domain["intervention"]),
            "outcome": rng.choice(domain["outcome"]),
            "group_type": domain["group_type"],
            "n_per_group": rng.choice([18, 22, 24, 26, 28]),  # small → underpowered
            "total_n": 0,
            "alpha": 0.05,
            "rng": rng,
            "flaws_text": {},  # section -> flaw sentence(s) injected
            "flaw_notes": [],  # ground truth list
        }
        ctx["total_n"] = ctx["n_per_group"] * 2

        # Inject every flaw listed in FLAW_TEMPLATES.
        flaw_ids = []
        for index, template in enumerate(FLAW_TEMPLATES, start=1):
            flaw_id = f"flaw_{index}"
            section = template["section"]
            flaw_sentence, flaw_note = template["inject"](ctx, rng)
            # Several flaws may target the same section; join them with a
            # single space and avoid a stray leading blank on the first one.
            existing = ctx["flaws_text"].get(section)
            ctx["flaws_text"][section] = (
                f"{existing} {flaw_sentence}" if existing else flaw_sentence
            )
            flaw_note.update({
                "id": flaw_id,
                "taxonomy": template["taxonomy"],
                "location": section,
            })
            ctx["flaw_notes"].append(flaw_note)
            flaw_ids.append(flaw_id)

        sections = _build_sections(ctx)
        paper_text = _build_paper_text(ctx, sections)
        ground_truth = {
            "flaws": ctx["flaw_notes"],
            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the list is identical across runs for a given seed (a set's
            # iteration order is not).
            "flaw_sections": list(
                dict.fromkeys(note["location"] for note in ctx["flaw_notes"])
            ),
            "flaw_ids": flaw_ids,
            # Derived rather than hard-coded so it tracks FLAW_TEMPLATES.
            "n_flaws": len(ctx["flaw_notes"]),
        }
        return {
            "paper_text": paper_text,
            "paper_sections": sections,
            "dataset_path": None,
            "ground_truth": ground_truth,
        }

    def _action_schema(self) -> dict:
        """Describe the actions an agent may take and their argument shapes."""
        return {
            # The example names are real keys of the generated paper_sections.
            "read_section": {"section": "str — e.g. 'abstract', 'participants', 'statistical_analysis', 'results'"},
            "flag_flaw": {"flaw_type": "str", "location": "str", "description": "str"},
            "submit_audit": {"audit_payload": {"flaws": "[{flaw_type, location, description}]"}},
        }

    @classmethod
    def generate(cls, seed=None):
        """Convenience method for Task 5 consumption.

        Builds a task with ``cls(seed=seed)``, generates one episode and
        returns ``(state, ground_truth)`` where ``state`` holds only
        ``paper_text`` and ``dataset_path``.
        """
        task = cls(seed=seed)
        ep = task.generate_episode()
        state = {"paper_text": ep["paper_text"], "dataset_path": ep.get("dataset_path")}
        return state, ep["ground_truth"]
# ---------------------------------------------------------------------------
# Flaw injectors — CONSORT protocol violations
# ---------------------------------------------------------------------------
def _inject_unblinded_bias(ctx: dict, rng: random.Random) -> tuple[str, dict]:
    """Build the unblinded-assessor flaw (CONSORT violation).

    Reads ``ctx["outcome"]`` and ``ctx["group_type"]``; ``rng`` is accepted
    for signature parity with the other injectors and is not used.

    Returns:
        ``(sentence, note)`` - the text to plant in the paper and the
        ground-truth note with ``description`` and ``hint_keywords``.
    """
    measured = ctx["outcome"]
    cohort = ctx["group_type"]

    flaw_sentence = " ".join((
        "The principal investigator, who was unblinded to group allocation,",
        f"personally assessed {measured} across all {cohort}.",
        "No independent blinded assessor was used for outcome evaluation.",
    ))
    explanation = " ".join((
        f"Unblinded investigator assessed primary outcome ({measured});",
        "CONSORT requires blinded outcome assessment to prevent detection bias.",
    ))
    keywords = ["unblinded", "investigator bias", "detection bias", "blinding", "assessor"]

    return flaw_sentence, {"description": explanation, "hint_keywords": keywords}
def _inject_insufficient_power(ctx: dict, rng: random.Random) -> tuple[str, dict]:
    """Build the missing-power-analysis flaw (CONSORT violation).

    Reads ``ctx["n_per_group"]``, ``ctx["total_n"]`` and ``ctx["group_type"]``;
    ``rng`` is accepted for signature parity and is not used.

    Returns:
        ``(sentence, note)`` - the text to plant in the paper and the
        ground-truth note with ``description`` and ``hint_keywords``.
    """
    arm_size = ctx["n_per_group"]
    enrolled = ctx["total_n"]
    cohort = ctx["group_type"]

    flaw_sentence = " ".join((
        f"A total of {enrolled} {cohort} were recruited",
        f"({arm_size} per group). No a priori power analysis was conducted",
        "and no sample size justification was provided per ICH-GCP guidelines.",
    ))
    explanation = " ".join((
        f"Sample of {arm_size} per group is underpowered for detecting medium effects",
        "(typically requires n≥64 per group at 80% power, α=0.05).",
        "No power analysis reported — CONSORT Item 7a violation.",
    ))
    keywords = ["underpowered", "sample size", "power analysis", "insufficient power", "ICH-GCP"]

    return flaw_sentence, {"description": explanation, "hint_keywords": keywords}
def _inject_protocol_deviation(ctx: dict, rng: random.Random) -> tuple[str, dict]:
    """Build the unreported-protocol-deviation flaw (CONSORT violation).

    Draws the number of silently excluded subjects and a just-significant
    p-value from ``rng``; reads ``ctx["total_n"]``, ``ctx["outcome"]`` and
    ``ctx["intervention"]``.

    Returns:
        ``(sentence, note)`` - the note additionally carries ``excluded_n``,
        ``reported_n`` and ``original_n``.
    """
    # Draw order (choice first, then uniform) is part of the seeded output.
    dropped = rng.choice([5, 7, 9, 11])
    p_value = round(rng.uniform(0.02, 0.048), 3)

    enrolled = ctx["total_n"]
    analysed = enrolled - dropped
    measure = ctx["outcome"]
    treatment = ctx["intervention"]

    flaw_sentence = " ".join((
        f"After per-protocol analysis, the {treatment} group showed significantly",
        f"improved {measure} compared to placebo (p={p_value}, n={analysed}),",
        "representing a clinically meaningful improvement. Protocol deviations",
        "were not separately reported in the CONSORT flow diagram.",
    ))
    explanation = " ".join((
        f"Results reported for n={analysed} but {enrolled} were recruited.",
        f"{dropped} participants excluded from analysis without disclosure",
        "in the CONSORT flow diagram — protocol deviation unreported.",
    ))

    ground_truth_note = {
        "description": explanation,
        "hint_keywords": ["protocol deviation", "exclusion", "excluded", "undisclosed", "CONSORT flow"],
        "excluded_n": dropped,
        "reported_n": analysed,
        "original_n": enrolled,
    }
    return flaw_sentence, ground_truth_note
def _inject_endpoint_switching(ctx: dict, rng: random.Random) -> tuple[str, dict]:
    """Build the endpoint-switching flaw (CONSORT violation).

    The planted text names a pre-registered primary endpoint that differs
    from ``ctx["outcome"]``, lists three assessed secondary outcomes (one of
    which is ``ctx["outcome"]``), and then reports ``ctx["outcome"]`` as the
    primary result.

    Returns:
        ``(sentence, note)`` - the text to plant in the paper and the
        ground-truth note with ``description`` and ``hint_keywords``.
    """
    reported = ctx["outcome"]
    outcomes_tested = [reported, "secondary biomarker", "quality of life score",
                       "adverse event rate", "dropout rate"]
    rng.shuffle(outcomes_tested)

    # The registered endpoint must never be the reported outcome, otherwise
    # the paper describes no switch at all and the flaw is not really planted.
    alternatives = [name for name in outcomes_tested if name != reported]
    registered = alternatives[0]

    # The listed secondary outcomes contain the reported outcome (it is the
    # one promoted to primary) and exclude the registered endpoint. Keeping
    # the shuffled order avoids giving the reported outcome a fixed position.
    secondary = [reported, *alternatives[1:3]]
    secondary.sort(key=outcomes_tested.index)

    sentence = (
        f"The pre-registered primary endpoint was {registered}, however "
        f"multiple secondary outcomes were assessed including "
        f"{', '.join(secondary)}. "
        f"Only {reported} reached statistical significance (p=0.043) "
        f"and is reported as the primary outcome in the final analysis."
    )
    note = {
        "description": f"Primary endpoint was switched post-hoc. Multiple outcomes tested "
                       f"({len(secondary)}) without correction for multiple comparisons. "
                       f"Original primary endpoint did not reach significance — CONSORT violation.",
        "hint_keywords": ["endpoint switching", "primary endpoint", "outcome switching",
                          "multiple comparison", "selective reporting"],
    }
    return sentence, note
# ---------------------------------------------------------------------------
# Paper builder
# ---------------------------------------------------------------------------
def _build_sections(ctx: dict) -> dict:
    """Render the five paper sections from the episode context.

    Reads the study vocabulary, the group sizes, ``ctx["flaws_text"]``
    (section name -> planted sentence(s)) and draws one IRB protocol number
    from ``ctx["rng"]``.

    Returns:
        A dict keyed ``abstract``, ``participants``, ``statistical_analysis``,
        ``results`` and ``discussion``, in that order.
    """

    def _tidy(template: str) -> str:
        # Remove the template's source indentation and the outer blank lines.
        return textwrap.dedent(template).strip()

    study_kind = ctx["field"]
    treatment = ctx["intervention"]
    measure = ctx["outcome"]
    cohort = ctx["group_type"]
    enrolled = ctx["total_n"]
    arm_size = ctx["n_per_group"]
    planted = ctx["flaws_text"]
    # The only random draw made while rendering the sections.
    protocol_no = ctx["rng"].randint(100, 999)

    rendered = {}

    rendered["abstract"] = _tidy(f"""
        Background: This {study_kind} evaluated the efficacy of {treatment} on {measure}
        in accordance with ICH-GCP guidelines and CONSORT reporting standards.
        We hypothesised that {cohort} receiving {treatment} would demonstrate
        significantly better {measure} compared to those receiving placebo.
        Methods: A randomised controlled trial design was employed per IRB approval.
        Results: {treatment} produced statistically significant improvements.
        Conclusion: These findings support adoption of {treatment} in clinical practice.
    """)

    rendered["participants"] = _tidy(f"""
        {enrolled} {cohort} were enrolled from three clinical sites between 2021 and 2023
        under IRB protocol #2021-CT-{protocol_no}.
        Inclusion criteria: aged 18–65, no prior treatment exposure, written informed
        consent obtained per Declaration of Helsinki.
        Exclusion criteria: severe comorbidities, inability to complete clinical assessments.
        {planted.get('participants', '')}
        Participants were randomly assigned to {treatment} (n={arm_size})
        or placebo (n={arm_size}) using block randomisation (block size=4)
        per the CONSORT-compliant allocation sequence.
    """)

    rendered["statistical_analysis"] = _tidy(f"""
        All analyses were performed using SAS v9.4 and Python 3.10 per the
        pre-registered Statistical Analysis Plan (SAP).
        The primary outcome ({measure}) was compared between groups at 12 weeks.
        {planted.get('statistical_analysis', '')}
        Significance threshold was set at α=0.05. Missing data handled via last
        observation carried forward (LOCF). No corrections for multiple comparisons
        were pre-specified in the protocol or SAP.
    """)

    rendered["results"] = _tidy(f"""
        {planted.get('results', '')}
        Secondary analysis of subgroups by age and sex showed consistent direction
        of effect. Adverse events were minor and balanced across arms (p=0.71).
        Full CONSORT flow diagram available in supplementary materials.
    """)

    rendered["discussion"] = _tidy(f"""
        The present study demonstrates that {treatment} significantly improves
        {measure} in {cohort}. These results are consistent with prior
        mechanistic studies and meet the bar for regulatory consideration.
        Limitations include the single-blind design and
        relatively short follow-up period of 12 weeks. Future work should
        examine long-term durability of effects and dose-response relationships
        per FDA post-marketing surveillance requirements.
        Generalisability may be limited to populations similar to those studied.
    """)

    return rendered
def _build_paper_text(ctx: dict, sections: dict) -> str:
    """Assemble the full paper from its title, headings and rendered sections.

    Args:
        ctx: Episode context; ``intervention``, ``outcome`` and ``field`` are
            used for the title line.
        sections: Mapping with the keys ``abstract``, ``participants``,
            ``statistical_analysis``, ``results`` and ``discussion``.

    Returns:
        The paper as one string: every heading starts at column 0 and is
        followed by its section text, with a footer listing the section names
        accepted by ``read_section``.
    """
    intervention = ctx["intervention"]
    outcome = ctx["outcome"]
    field = ctx["field"]

    # The text is joined line by line instead of running textwrap.dedent over
    # an indented f-string. Section bodies are multi-line and already start at
    # column 0, so after interpolation dedent would find no common margin and
    # would leave the headings carrying their source-code indentation.
    lines = [
        f"TITLE: Efficacy of {intervention} on {outcome}: A CONSORT-compliant randomised controlled {field}",
        "ABSTRACT",
        sections["abstract"],
        "1. PARTICIPANTS (CONSORT Items 3-5)",
        sections["participants"],
        "2. STATISTICAL ANALYSIS (CONSORT Item 12)",
        sections["statistical_analysis"],
        "3. RESULTS (CONSORT Items 13-19)",
        sections["results"],
        "4. DISCUSSION (CONSORT Items 20-22)",
        sections["discussion"],
        "---",
        "Available sections for read_section: abstract, participants,",
        "statistical_analysis, results, discussion",
    ]
    return "\n".join(lines)