"""
Task 2: Experiment Replication — MEDIUM (PeerGuard: Adverse Event Analysis)
----------------------------------------
The agent receives a methods section describing a logistic regression
experiment on a tabular CSV dataset. The sponsor claims that no significant
adverse events occurred. The agent must write and run code to replicate the
reported AUC-ROC and F1 score.

Key challenge: the dataset has class imbalance. A naive model trained without
stratified splitting will score ~0.71 AUC — outside the pass threshold.
The agent must notice the imbalance and handle it correctly.

The dataset is generated procedurally each episode from a per-episode random
seed, varying the feature names and imbalance ratio to prevent memorisation.
"""
from __future__ import annotations

import os
import tempfile
import textwrap
from typing import Optional

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from tasks.base import BaseTask


class ReplicationTask(BaseTask):
    task_id    = "task2_replication"
    task_name  = "Adverse Event Replication Analysis"
    difficulty = "medium"
    max_steps  = 20

    def generate_episode(self) -> dict:
        rng    = self.rng
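        # Derive a NumPy RNG from the task's seeded Python RNG so each
        # episode's dataset is reproducible for a given seed.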
        np_rng = np.random.RandomState(rng.randint(0, 2**31 - 1))

        # --- Procedurally vary the domain ---
        domains = [
            {
                "name":     "patient readmission",
                "features": ["age", "num_prior_admissions", "los_days",
                              "num_medications", "comorbidity_score"],
                "target":   "readmitted_30d",
            },
            {
                "name":     "loan default",
                "features": ["credit_score", "debt_to_income", "loan_amount",
                              "employment_years", "num_open_accounts"],
                "target":   "defaulted",
            },
            {
                "name":     "equipment failure",
                "features": ["operating_hours", "temperature_avg", "vibration_rms",
                              "pressure_delta", "maintenance_lag_days"],
                "target":   "failure_within_30d",
            },
        ]
        domain = rng.choice(domains)

        n_samples      = 400
        imbalance_rate = rng.choice([0.18, 0.20, 0.22])   # ~20% positive class
        n_pos          = int(n_samples * imbalance_rate)
        n_neg          = n_samples - n_pos

        # Generate class-separable features: positive-class means are shifted by ~0.4-0.7
        n_feat = len(domain["features"])
        X_neg  = np_rng.randn(n_neg, n_feat)
        X_pos  = np_rng.randn(n_pos, n_feat) + np_rng.uniform(0.4, 0.7, n_feat)

        X = np.vstack([X_neg, X_pos])
        y = np.array([0] * n_neg + [1] * n_pos)

        # Shuffle
        idx = np_rng.permutation(n_samples)
        X, y = X[idx], y[idx]

        df = pd.DataFrame(X, columns=domain["features"])
        df[domain["target"]] = y

        # Save dataset to temp file
        tmp_dir      = tempfile.mkdtemp(prefix="rig_task2_")
        dataset_path = os.path.join(tmp_dir, "task2_dataset.csv")
        df.to_csv(dataset_path, index=False)

        # --- Compute ground truth (stratified split, seed=42) ---
        X_arr = df[domain["features"]].values
        y_arr = df[domain["target"]].values

        X_train, X_test, y_train, y_test = train_test_split(
            X_arr, y_arr, test_size=0.25, random_state=42, stratify=y_arr
        )
        scaler  = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test  = scaler.transform(X_test)

        clf = LogisticRegression(random_state=42, max_iter=1000, class_weight="balanced")
        clf.fit(X_train, y_train)

        y_prob = clf.predict_proba(X_test)[:, 1]
        y_pred = clf.predict(X_test)

        gt_auc = round(float(roc_auc_score(y_test, y_prob)), 4)
        gt_f1  = round(float(f1_score(y_test, y_pred, zero_division=0)), 4)

        # --- Build methods section ---
        feature_list = ", ".join(f"`{f}`" for f in domain["features"])
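        # DATASET_PATH is left as a literal placeholder in the prompt text; the
        # execution harness is expected to expose it as a constant to agent code
        # (see the execute_code entry in _action_schema).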
        paper_text = textwrap.dedent(f"""
            CLINICAL TRIAL SAFETY ANALYSIS: Predictive modelling for {domain['name']}

            SPONSOR CLAIM
            The sponsor claims no significant adverse cardiovascular events
            were observed in the treatment cohort during the trial period.

            METHODS SECTION (to replicate)
            --------------------------------
            Dataset: {n_samples} patient records with {n_feat} clinical features ({feature_list})
            and binary outcome variable `{domain['target']}`.

            Preprocessing: Standardise all features using StandardScaler
            (fit on training set, transform both sets).

            Model: Logistic Regression with class_weight='balanced', random_state=42,
            max_iter=1000.

            Split: 75% train / 25% test using train_test_split with
            random_state=42 and stratify=y.

            Metrics to report:
              - AUC-ROC (roc_auc_score on test set, using predicted probabilities)
              - F1-score (f1_score on test set, using predicted class labels)

            The dataset is available at path: DATASET_PATH
            (use read_dataset action to inspect it, or reference DATASET_PATH in code)

            REPORTED RESULTS (from sponsor analysis):
              AUC-ROC : {gt_auc}
              F1-score: {gt_f1}

            Your task: reproduce these results and assess whether the sponsor's
            claim of no significant adverse events is supported by the data.
            Submit with submit_results action.
            Tolerance: AUC within ±0.03, F1 within ±0.03 for full credit.
        """).strip()

        ground_truth = {
            "auc":          gt_auc,
            "f1":           gt_f1,
            "target_col":   domain["target"],
            "feature_cols": domain["features"],
            "domain":       domain["name"],
            "n_samples":    n_samples,
            "imbalance_rate": imbalance_rate,
        }

        return {
            "paper_text":     paper_text,
            "paper_sections": {"methods": paper_text},
            "dataset_path":   dataset_path,
            "ground_truth":   ground_truth,
        }

    def _action_schema(self) -> dict:
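        """Return the schema of actions available to the agent for this task."""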
        return {
            "read_dataset":    {"rows": "int (optional, default 20)"},
            "execute_code":    {"code": "str — Python code; DATASET_PATH constant available"},
            "submit_results":  {
                "results_payload": {
                    "auc":            "float",
                    "f1":             "float",
                    "interpretation": "str — brief explanation of results",
                }
            },
        }

    @classmethod
    def generate(cls, seed: Optional[int] = None) -> tuple[dict, dict]:
        """Convenience method for Task 5 consumption."""
        task = cls(seed=seed)
        ep = task.generate_episode()
        state = {"paper_text": ep["paper_text"], "dataset_path": ep.get("dataset_path")}
        return state, ep["ground_truth"]
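

if __name__ == "__main__":
    # Minimal smoke test (a sketch: assumes BaseTask.__init__ accepts a `seed`
    # keyword argument, as the generate() classmethod above implies).
    state, gt = ReplicationTask.generate(seed=0)
    print("dataset_path :", state["dataset_path"])
    print("ground truth :", gt["auc"], gt["f1"])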