"""
Task 1: Methodology Audit — EASY (PeerGuard: CONSORT Protocol Violation Audit)
---------------------------------
Agent reads a synthetic clinical trial paper stub and must identify 4 planted
CONSORT protocol violations. Paper is procedurally generated each episode
to prevent memorisation.

Flaw injection system:
  - Each episode draws a domain, intervention, outcome and group size from
    the vocabulary pools below.
  - Every entry in FLAW_TEMPLATES is then injected: its injector returns a
    flawed sentence that is spliced into the paper section named by the entry.
  - Ground truth records which taxonomy applies and where.
"""
from __future__ import annotations

import random
import textwrap
from typing import Optional

from tasks.base import BaseTask


# ---------------------------------------------------------------------------
# Vocabulary pools — the raw material for procedural generation
# ---------------------------------------------------------------------------

# One entry per study domain: the field label used in the paper text, the
# candidate interventions and outcomes to draw from, and the noun used for
# the enrolled subjects.
DOMAINS = [
    dict(
        field="clinical trial",
        intervention=["Drug A", "Treatment B", "Compound X", "Therapy Z"],
        outcome=["recovery time", "symptom severity", "blood pressure", "pain score"],
        group_type="patients",
    ),
    dict(
        field="psychology study",
        intervention=[
            "Mindfulness training",
            "Cognitive therapy",
            "Group therapy",
            "App-based intervention",
        ],
        outcome=["anxiety score", "depression score", "stress levels", "cognitive performance"],
        group_type="participants",
    ),
    dict(
        field="educational study",
        intervention=["Active learning", "Flipped classroom", "Peer tutoring", "Digital tools"],
        outcome=["exam scores", "knowledge retention", "engagement", "completion rate"],
        group_type="students",
    ),
]

# Maps a statistical test name to the data types it is valid for ("valid")
# and the data types it should not be applied to ("invalid_for").
STAT_TESTS = {
    "independent samples t-test": dict(
        valid=["continuous", "normally distributed"],
        invalid_for=["categorical", "ordinal", "non-normal"],
    ),
    "chi-square test": dict(
        valid=["categorical", "frequency data"],
        invalid_for=["continuous", "time-series"],
    ),
    "Mann-Whitney U test": dict(
        valid=["ordinal", "non-normal continuous"],
        invalid_for=["normally distributed small samples"],
    ),
    "one-way ANOVA": dict(
        valid=["continuous", "3+ groups", "normal"],
        invalid_for=["binary outcome", "two groups"],
    ),
    "Pearson correlation": dict(
        valid=["continuous linear relationship"],
        invalid_for=["ordinal", "non-linear"],
    ),
    "logistic regression": dict(
        valid=["binary outcome", "categorical"],
        invalid_for=["continuous outcome", "ordinal multi-class"],
    ),
}

# Flaw templates. Each entry carries:
#   taxonomy - the flaw label recorded in the ground truth
#   inject   - callable (ctx, rng) -> (flaw_sentence, flaw_note)
#   section  - the paper section the flaw sentence is spliced into
# The injectors are wrapped in lambdas because they are defined further down
# the module; the name is only resolved when the lambda is called.
FLAW_TEMPLATES = [
    dict(
        taxonomy="unblinded_investigator_bias",
        inject=lambda context, source: _inject_unblinded_bias(context, source),
        section="statistical_analysis",
    ),
    dict(
        taxonomy="insufficient_power_analysis",
        inject=lambda context, source: _inject_insufficient_power(context, source),
        section="participants",
    ),
    dict(
        taxonomy="protocol_deviation_unreported",
        inject=lambda context, source: _inject_protocol_deviation(context, source),
        section="results",
    ),
    dict(
        taxonomy="endpoint_switching",
        inject=lambda context, source: _inject_endpoint_switching(context, source),
        section="results",
    ),
]


# ---------------------------------------------------------------------------
# Task class
# ---------------------------------------------------------------------------

class MethodologyAuditTask(BaseTask):
    """Easy task: audit a generated paper for the planted CONSORT violations.

    Every episode draws a domain, intervention, outcome and group size from
    the vocabulary pools and injects every flaw listed in ``FLAW_TEMPLATES``.
    """

    task_id    = "task1_methodology_audit"
    task_name  = "CONSORT Protocol Violation Audit"
    difficulty = "easy"
    max_steps  = 20

    def generate_episode(self) -> dict:
        """Generate one synthetic paper together with its ground truth.

        Returns:
            A dict with keys ``paper_text`` (full rendered paper),
            ``paper_sections`` (section name -> text), ``dataset_path``
            (always ``None`` for this task) and ``ground_truth``.
        """
        # NOTE(review): self.rng is presumably a seeded random.Random set up
        # by BaseTask — confirm against tasks.base.
        rng = self.rng
        domain = rng.choice(DOMAINS)

        # Draw order (intervention, outcome, group size) matters: it fixes
        # which episode a given seed produces.
        intervention = rng.choice(domain["intervention"])
        outcome = rng.choice(domain["outcome"])
        n_per_group = rng.choice([18, 22, 24, 26, 28])   # small → underpowered

        ctx = {
            "field":         domain["field"],
            "intervention":  intervention,
            "outcome":       outcome,
            "group_type":    domain["group_type"],
            "n_per_group":   n_per_group,
            "total_n":       n_per_group * 2,   # two arms of equal size
            "alpha":         0.05,
            "rng":           rng,
            "flaws_text":    {},   # section -> flaw sentence(s) injected
            "flaw_notes":    [],   # ground truth list
        }

        # Inject every flaw template.
        flaw_ids = []
        for index, template in enumerate(FLAW_TEMPLATES, start=1):
            flaw_id = f"flaw_{index}"
            section = template["section"]
            flaw_sentence, flaw_note = template["inject"](ctx, rng)

            # Join sentences that share a section with a single space and no
            # leading space, so injected lines are not distinguishable from
            # the surrounding text by a stray indent.
            existing = ctx["flaws_text"].get(section)
            ctx["flaws_text"][section] = (
                f"{existing} {flaw_sentence}" if existing else flaw_sentence
            )

            flaw_note.update({
                "id":       flaw_id,
                "taxonomy": template["taxonomy"],
                "location": section,
            })
            ctx["flaw_notes"].append(flaw_note)
            flaw_ids.append(flaw_id)

        sections = _build_sections(ctx)
        paper_text = _build_paper_text(ctx, sections)

        flaw_notes = ctx["flaw_notes"]
        ground_truth = {
            "flaws":         flaw_notes,
            # dict.fromkeys de-duplicates while keeping injection order; a
            # set's iteration order changes with string hash randomisation,
            # which would make the same seed yield differently ordered output.
            "flaw_sections": list(dict.fromkeys(note["location"] for note in flaw_notes)),
            "flaw_ids":      flaw_ids,
            "n_flaws":       len(flaw_notes),
        }

        return {
            "paper_text":     paper_text,
            "paper_sections": sections,
            "dataset_path":   None,
            "ground_truth":   ground_truth,
        }

    def _action_schema(self) -> dict:
        """Return the action name -> argument description mapping for this task."""
        return {
            # The section names listed here are the ones _build_sections emits.
            "read_section":  {"section": "str — one of 'abstract', 'participants', 'statistical_analysis', 'results', 'discussion'"},
            "flag_flaw":     {"flaw_type": "str", "location": "str", "description": "str"},
            "submit_audit":  {"audit_payload": {"flaws": "[{flaw_type, location, description}]"}},
        }

    @classmethod
    def generate(cls, seed=None):
        """Convenience method for Task 5 consumption.

        Builds a task with ``seed``, generates one episode and returns
        ``(state, ground_truth)`` where ``state`` holds ``paper_text`` and
        ``dataset_path``.
        """
        task = cls(seed=seed)
        ep = task.generate_episode()
        state = {"paper_text": ep["paper_text"], "dataset_path": ep.get("dataset_path")}
        return state, ep["ground_truth"]


# ---------------------------------------------------------------------------
# Flaw injectors — CONSORT protocol violations
# ---------------------------------------------------------------------------

def _inject_unblinded_bias(ctx: dict, rng: random.Random) -> tuple[str, dict]:
    """Build the unblinded-assessor flaw sentence and its ground-truth note.

    Reads ``ctx["outcome"]`` and ``ctx["group_type"]``. ``rng`` is accepted so
    all injectors share one signature; it is not used here.

    Returns:
        ``(sentence, note)`` where ``note`` holds ``description`` and
        ``hint_keywords``.
    """
    measured = ctx["outcome"]
    cohort = ctx["group_type"]

    sentence = " ".join([
        "The principal investigator, who was unblinded to group allocation,",
        f"personally assessed {measured} across all {cohort}.",
        "No independent blinded assessor was used for outcome evaluation.",
    ])
    description = " ".join([
        f"Unblinded investigator assessed primary outcome ({measured});",
        "CONSORT requires blinded outcome assessment to prevent detection bias.",
    ])
    keywords = ["unblinded", "investigator bias", "detection bias", "blinding", "assessor"]

    return sentence, {"description": description, "hint_keywords": keywords}


def _inject_insufficient_power(ctx: dict, rng: random.Random) -> tuple[str, dict]:
    """Build the missing-power-analysis flaw sentence and its ground-truth note.

    Reads ``ctx["n_per_group"]``, ``ctx["total_n"]`` and ``ctx["group_type"]``.
    ``rng`` is accepted so all injectors share one signature; it is not used.

    Returns:
        ``(sentence, note)`` where ``note`` holds ``description`` and
        ``hint_keywords``.
    """
    arm_size = ctx["n_per_group"]
    enrolled = ctx["total_n"]
    cohort = ctx["group_type"]

    sentence = " ".join([
        f"A total of {enrolled} {cohort} were recruited",
        f"({arm_size} per group). No a priori power analysis was conducted",
        "and no sample size justification was provided per ICH-GCP guidelines.",
    ])
    description = " ".join([
        f"Sample of {arm_size} per group is underpowered for detecting medium effects",
        "(typically requires n≥64 per group at 80% power, α=0.05).",
        "No power analysis reported — CONSORT Item 7a violation.",
    ])
    keywords = ["underpowered", "sample size", "power analysis", "insufficient power", "ICH-GCP"]

    return sentence, {"description": description, "hint_keywords": keywords}


def _inject_protocol_deviation(ctx: dict, rng: random.Random) -> tuple[str, dict]:
    """Build the unreported-exclusion flaw sentence and its ground-truth note.

    Draws the number of excluded subjects and then a p-value from ``rng``
    (in that order), and reads ``ctx["total_n"]``, ``ctx["outcome"]`` and
    ``ctx["intervention"]``.

    Returns:
        ``(sentence, note)`` where ``note`` holds ``description``,
        ``hint_keywords``, ``excluded_n``, ``reported_n`` and ``original_n``.
    """
    dropped = rng.choice([5, 7, 9, 11])
    recruited = ctx["total_n"]
    analysed = recruited - dropped
    measured = ctx["outcome"]
    p_value = round(rng.uniform(0.02, 0.048), 3)

    sentence = " ".join([
        f"After per-protocol analysis, the {ctx['intervention']} group showed significantly",
        f"improved {measured} compared to placebo (p={p_value}, n={analysed}),",
        "representing a clinically meaningful improvement. Protocol deviations",
        "were not separately reported in the CONSORT flow diagram.",
    ])
    description = " ".join([
        f"Results reported for n={analysed} but {recruited} were recruited.",
        f"{dropped} participants excluded from analysis without disclosure",
        "in the CONSORT flow diagram — protocol deviation unreported.",
    ])

    note = {
        "description": description,
        "hint_keywords": ["protocol deviation", "exclusion", "excluded", "undisclosed", "CONSORT flow"],
        "excluded_n": dropped,
        "reported_n": analysed,
        "original_n": recruited,
    }
    return sentence, note


def _inject_endpoint_switching(ctx: dict, rng: random.Random) -> tuple[str, dict]:
    """Inject endpoint switching (CONSORT violation).

    Shuffles the episode outcome together with four fixed alternative
    outcomes, names the entry at index 1 as the pre-registered primary
    endpoint, lists the first three as assessed, and states that only
    ``ctx["outcome"]`` reached significance and was reported as primary.

    The pre-registered endpoint is guaranteed to differ from
    ``ctx["outcome"]``; otherwise the sentence would describe no switch.

    Returns:
        ``(sentence, note)`` where ``note`` holds ``description`` and
        ``hint_keywords``.
    """
    reported = ctx["outcome"]
    outcomes_tested = [reported, "secondary biomarker", "quality of life score",
                       "adverse event rate", "dropout rate"]
    rng.shuffle(outcomes_tested)

    # If the shuffle put the reported outcome in the "pre-registered" slot,
    # swap in the first different outcome. The shuffle itself is untouched,
    # so RNG consumption matches the unfixed behaviour.
    if outcomes_tested[1] == reported:
        other = next(i for i, name in enumerate(outcomes_tested) if name != reported)
        outcomes_tested[1], outcomes_tested[other] = outcomes_tested[other], outcomes_tested[1]

    pre_registered = outcomes_tested[1]
    assessed = outcomes_tested[:3]

    sentence = (
        f"The pre-registered primary endpoint was {pre_registered}, however "
        f"multiple secondary outcomes were assessed including "
        f"{', '.join(assessed)}. "
        f"Only {reported} reached statistical significance (p=0.043) "
        f"and is reported as the primary outcome in the final analysis."
    )
    note = {
        "description": f"Primary endpoint was switched post-hoc. Multiple outcomes tested "
                       f"({len(assessed)}) without correction for multiple comparisons. "
                       f"Original primary endpoint did not reach significance — CONSORT violation.",
        "hint_keywords": ["endpoint switching", "primary endpoint", "outcome switching",
                          "multiple comparison", "selective reporting"],
    }
    return sentence, note


# ---------------------------------------------------------------------------
# Paper builder
# ---------------------------------------------------------------------------

def _build_sections(ctx: dict) -> dict:
    """Render the five paper sections from the episode context.

    Reads ``field``, ``intervention``, ``outcome``, ``group_type``,
    ``total_n``, ``n_per_group`` and ``flaws_text`` from ``ctx`` and draws one
    integer from ``ctx["rng"]`` for the IRB protocol number. Flaw text stored
    under ``participants``, ``statistical_analysis`` and ``results`` is placed
    on its own line inside the matching section.

    Returns:
        Mapping with keys ``abstract``, ``participants``,
        ``statistical_analysis``, ``results`` and ``discussion``.
    """
    def _render(template: str) -> str:
        # Remove the source indentation and the surrounding blank lines.
        return textwrap.dedent(template).strip()

    study = ctx["field"]
    treatment = ctx["intervention"]
    measure = ctx["outcome"]
    cohort = ctx["group_type"]
    enrolled = ctx["total_n"]
    arm_size = ctx["n_per_group"]

    injected = ctx["flaws_text"]
    participant_flaws = injected.get("participants", "")
    analysis_flaws = injected.get("statistical_analysis", "")
    result_flaws = injected.get("results", "")

    # The only RNG draw in this function.
    protocol_suffix = ctx["rng"].randint(100, 999)

    rendered = {}

    rendered["abstract"] = _render(f"""
        Background: This {study} evaluated the efficacy of {treatment} on {measure}
        in accordance with ICH-GCP guidelines and CONSORT reporting standards.
        We hypothesised that {cohort} receiving {treatment} would demonstrate
        significantly better {measure} compared to those receiving placebo.
        Methods: A randomised controlled trial design was employed per IRB approval.
        Results: {treatment} produced statistically significant improvements.
        Conclusion: These findings support adoption of {treatment} in clinical practice.
    """)

    rendered["participants"] = _render(f"""
        {enrolled} {cohort} were enrolled from three clinical sites between 2021 and 2023
        under IRB protocol #2021-CT-{protocol_suffix}.
        Inclusion criteria: aged 18–65, no prior treatment exposure, written informed
        consent obtained per Declaration of Helsinki.
        Exclusion criteria: severe comorbidities, inability to complete clinical assessments.
        {participant_flaws}
        Participants were randomly assigned to {treatment} (n={arm_size})
        or placebo (n={arm_size}) using block randomisation (block size=4)
        per the CONSORT-compliant allocation sequence.
    """)

    rendered["statistical_analysis"] = _render(f"""
        All analyses were performed using SAS v9.4 and Python 3.10 per the
        pre-registered Statistical Analysis Plan (SAP).
        The primary outcome ({measure}) was compared between groups at 12 weeks.
        {analysis_flaws}
        Significance threshold was set at α=0.05. Missing data handled via last
        observation carried forward (LOCF). No corrections for multiple comparisons
        were pre-specified in the protocol or SAP.
    """)

    rendered["results"] = _render(f"""
        {result_flaws}
        Secondary analysis of subgroups by age and sex showed consistent direction
        of effect. Adverse events were minor and balanced across arms (p=0.71).
        Full CONSORT flow diagram available in supplementary materials.
    """)

    rendered["discussion"] = _render(f"""
        The present study demonstrates that {treatment} significantly improves
        {measure} in {cohort}. These results are consistent with prior
        mechanistic studies and meet the bar for regulatory consideration.
        Limitations include the single-blind design and
        relatively short follow-up period of 12 weeks. Future work should
        examine long-term durability of effects and dose-response relationships
        per FDA post-marketing surveillance requirements.
        Generalisability may be limited to populations similar to those studied.
    """)

    return rendered


def _build_paper_text(ctx: dict, sections: dict) -> str:
    """Assemble the full paper from its title, section headings and bodies.

    Reads ``intervention``, ``outcome`` and ``field`` from ``ctx`` for the
    title, and the ``abstract``, ``participants``, ``statistical_analysis``,
    ``results`` and ``discussion`` entries of ``sections`` for the bodies.

    The paper is joined line by line rather than via ``textwrap.dedent`` on
    an indented template: section bodies span several lines, and once they
    are interpolated the template no longer has a common margin, so
    ``dedent`` would leave headings and first lines indented.

    Returns:
        The paper as a single string with every heading flush left.
    """
    intervention = ctx["intervention"]
    outcome      = ctx["outcome"]
    field        = ctx["field"]

    headed_sections = [
        ("ABSTRACT", "abstract"),
        ("1. PARTICIPANTS (CONSORT Items 3-5)", "participants"),
        ("2. STATISTICAL ANALYSIS (CONSORT Item 12)", "statistical_analysis"),
        ("3. RESULTS (CONSORT Items 13-19)", "results"),
        ("4. DISCUSSION (CONSORT Items 20-22)", "discussion"),
    ]

    lines = [
        f"TITLE: Efficacy of {intervention} on {outcome}: "
        f"A CONSORT-compliant randomised controlled {field}",
        "",
    ]
    for heading, key in headed_sections:
        # Heading, body, then one blank separator line.
        lines.extend([heading, sections[key], ""])
    lines.extend([
        "---",
        "Available sections for read_section: abstract, participants,",
        "statistical_analysis, results, discussion",
    ])
    return "\n".join(lines).strip()