"""
Task 2: Experiment Replication — MEDIUM (PeerGuard: Adverse Event Analysis)
----------------------------------------
The agent receives a methods section describing a logistic regression
experiment on a tabular CSV dataset. The sponsor claims that no significant
adverse events occurred. The agent must write and run code to replicate the
reported AUC-ROC and F1 score.

Key challenge: the dataset has class imbalance. A naive model trained without
stratified splitting will score ~0.71 AUC — outside the pass threshold.
The agent must notice the imbalance and handle it correctly.

The dataset is generated procedurally each episode from a per-episode random
seed, varying the feature names and imbalance ratio to prevent memorisation.
"""
from __future__ import annotations

import os
import tempfile
import textwrap
from typing import Optional

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from tasks.base import BaseTask


class ReplicationTask(BaseTask):
    task_id    = "task2_replication"
    task_name  = "Adverse Event Replication Analysis"
    difficulty = "medium"
    max_steps  = 20

    def generate_episode(self) -> dict:
        rng    = self.rng
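        # Derive a NumPy RNG from the task's seeded Python RNG so each
        # episode's dataset is reproducible for a given seed.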
        np_rng = np.random.RandomState(rng.randint(0, 2**31 - 1))

        # --- Procedurally vary the domain ---
        domains = [
            {
                "name":     "patient readmission",
                "features": ["age", "num_prior_admissions", "los_days",
                              "num_medications", "comorbidity_score"],
                "target":   "readmitted_30d",
            },
            {
                "name":     "loan default",
                "features": ["credit_score", "debt_to_income", "loan_amount",
                              "employment_years", "num_open_accounts"],
                "target":   "defaulted",
            },
            {
                "name":     "equipment failure",
                "features": ["operating_hours", "temperature_avg", "vibration_rms",
                              "pressure_delta", "maintenance_lag_days"],
                "target":   "failure_within_30d",
            },
        ]
        domain = rng.choice(domains)

        n_samples      = 400
        imbalance_rate = rng.choice([0.18, 0.20, 0.22])   # ~20% positive class
        n_pos          = int(n_samples * imbalance_rate)
        n_neg          = n_samples - n_pos

        # Generate class-separable features: positive-class means are shifted by ~0.4-0.7
        n_feat = len(domain["features"])
        X_neg  = np_rng.randn(n_neg, n_feat)
        X_pos  = np_rng.randn(n_pos, n_feat) + np_rng.uniform(0.4, 0.7, n_feat)

        X = np.vstack([X_neg, X_pos])
        y = np.array([0] * n_neg + [1] * n_pos)

        # Shuffle
        idx = np_rng.permutation(n_samples)
        X, y = X[idx], y[idx]

        df = pd.DataFrame(X, columns=domain["features"])
        df[domain["target"]] = y

        # Save dataset to temp file
        tmp_dir      = tempfile.mkdtemp(prefix="rig_task2_")
        dataset_path = os.path.join(tmp_dir, "task2_dataset.csv")
        df.to_csv(dataset_path, index=False)

        # --- Compute ground truth (stratified split, seed=42) ---
        X_arr = df[domain["features"]].values
        y_arr = df[domain["target"]].values

        X_train, X_test, y_train, y_test = train_test_split(
            X_arr, y_arr, test_size=0.25, random_state=42, stratify=y_arr
        )
        scaler  = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test  = scaler.transform(X_test)

        clf = LogisticRegression(random_state=42, max_iter=1000, class_weight="balanced")
        clf.fit(X_train, y_train)

        y_prob = clf.predict_proba(X_test)[:, 1]
        y_pred = clf.predict(X_test)

        gt_auc = round(float(roc_auc_score(y_test, y_prob)), 4)
        gt_f1  = round(float(f1_score(y_test, y_pred, zero_division=0)), 4)

        # --- Build methods section ---
        feature_list = ", ".join(f"`{f}`" for f in domain["features"])
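        # DATASET_PATH is left as a literal placeholder in the prompt text; the
        # execution harness is expected to expose it as a constant to agent code
        # (see the execute_code entry in _action_schema).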
        paper_text = textwrap.dedent(f"""
            CLINICAL TRIAL SAFETY ANALYSIS: Predictive modelling for {domain['name']}

            SPONSOR CLAIM
            The sponsor claims no significant adverse cardiovascular events
            were observed in the treatment cohort during the trial period.

            METHODS SECTION (to replicate)
            --------------------------------
            Dataset: {n_samples} patient records with {n_feat} clinical features ({feature_list})
            and binary outcome variable `{domain['target']}`.

            Preprocessing: Standardise all features using StandardScaler
            (fit on training set, transform both sets).

            Model: Logistic Regression with class_weight='balanced', random_state=42,
            max_iter=1000.

            Split: 75% train / 25% test using train_test_split with
            random_state=42 and stratify=y.

            Metrics to report:
              - AUC-ROC (roc_auc_score on test set, using predicted probabilities)
              - F1-score (f1_score on test set, using predicted class labels)

            The dataset is available at path: DATASET_PATH
            (use read_dataset action to inspect it, or reference DATASET_PATH in code)

            REPORTED RESULTS (from sponsor analysis):
              AUC-ROC : {gt_auc}
              F1-score: {gt_f1}

            Your task: reproduce these results and assess whether the sponsor's
            claim of no significant adverse events is supported by the data.
            Submit with submit_results action.
            Tolerance: AUC within ±0.03, F1 within ±0.03 for full credit.
        """).strip()

        ground_truth = {
            "auc":          gt_auc,
            "f1":           gt_f1,
            "target_col":   domain["target"],
            "feature_cols": domain["features"],
            "domain":       domain["name"],
            "n_samples":    n_samples,
            "imbalance_rate": imbalance_rate,
        }

        return {
            "paper_text":     paper_text,
            "paper_sections": {"methods": paper_text},
            "dataset_path":   dataset_path,
            "ground_truth":   ground_truth,
        }

    def _action_schema(self) -> dict:
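        """Return the schema of actions available to the agent for this task."""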
        return {
            "read_dataset":    {"rows": "int (optional, default 20)"},
            "execute_code":    {"code": "str — Python code; DATASET_PATH constant available"},
            "submit_results":  {
                "results_payload": {
                    "auc":            "float",
                    "f1":             "float",
                    "interpretation": "str — brief explanation of results",
                }
            },
        }

    @classmethod
    def generate(cls, seed: Optional[int] = None) -> tuple[dict, dict]:
        """Convenience method for Task 5 consumption."""
        task = cls(seed=seed)
        ep = task.generate_episode()
        state = {"paper_text": ep["paper_text"], "dataset_path": ep.get("dataset_path")}
        return state, ep["ground_truth"]
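

if __name__ == "__main__":
    # Minimal smoke test (a sketch: assumes BaseTask.__init__ accepts a `seed`
    # keyword argument, as the generate() classmethod above implies).
    state, gt = ReplicationTask.generate(seed=0)
    print("dataset_path :", state["dataset_path"])
    print("ground truth :", gt["auc"], gt["f1"])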