"""
Task 2: Experiment Replication — MEDIUM (PeerGuard: Adverse Event Analysis)
----------------------------------------
Agent receives a methods section describing a logistic regression experiment
on a tabular CSV dataset. The sponsor claims no significant adverse events.
The agent must write and run code to replicate the reported AUC-ROC and F1
score.
Key challenge: the dataset has class imbalance. A naive model trained without
stratified splitting will score ~0.71 AUC — outside the pass threshold.
The agent must notice the imbalance and handle it correctly.
The dataset is generated procedurally each episode with random seed,
varying feature names and imbalance ratio to prevent memorisation.
"""
from __future__ import annotations
import os
import tempfile
import textwrap
from typing import Optional
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tasks.base import BaseTask
class ReplicationTask(BaseTask):
    """Experiment-replication task (medium difficulty).

    The agent receives a methods section describing a logistic-regression
    safety analysis of an imbalanced tabular dataset and must write and run
    code to reproduce the reported AUC-ROC and F1 score.  The dataset is
    generated procedurally each episode (domain, feature names and imbalance
    rate all vary with the seed) to prevent memorisation.
    """

    task_id = "task2_replication"
    task_name = "Adverse Event Replication Analysis"
    difficulty = "medium"
    max_steps = 20

    def generate_episode(self) -> dict:
        """Generate one episode: dataset CSV, methods text, ground truth.

        Returns:
            dict with keys ``paper_text`` (the methods section shown to the
            agent), ``paper_sections``, ``dataset_path`` (CSV on disk) and
            ``ground_truth`` (reference metrics and dataset metadata).
        """
        rng = self.rng
        # Derive a NumPy RNG from the task's seeded random.Random so the
        # entire episode is reproducible from a single seed.
        np_rng = np.random.RandomState(rng.randint(0, 2**31 - 1))

        # --- Procedurally vary the domain ---
        domains = [
            {
                "name": "patient readmission",
                "features": ["age", "num_prior_admissions", "los_days",
                             "num_medications", "comorbidity_score"],
                "target": "readmitted_30d",
            },
            {
                "name": "loan default",
                "features": ["credit_score", "debt_to_income", "loan_amount",
                             "employment_years", "num_open_accounts"],
                "target": "defaulted",
            },
            {
                "name": "equipment failure",
                "features": ["operating_hours", "temperature_avg", "vibration_rms",
                             "pressure_delta", "maintenance_lag_days"],
                "target": "failure_within_30d",
            },
        ]
        domain = rng.choice(domains)

        n_samples = 400
        imbalance_rate = rng.choice([0.18, 0.20, 0.22])  # ~20% positive class
        n_pos = int(n_samples * imbalance_rate)
        n_neg = n_samples - n_pos

        # Generate features: positives are shifted by a per-feature offset
        # drawn from U(0.4, 0.7) so the classes are moderately separable.
        n_feat = len(domain["features"])
        X_neg = np_rng.randn(n_neg, n_feat)
        X_pos = np_rng.randn(n_pos, n_feat) + np_rng.uniform(0.4, 0.7, n_feat)
        X = np.vstack([X_neg, X_pos])
        y = np.array([0] * n_neg + [1] * n_pos)
        # Shuffle so the class blocks are not contiguous in the CSV.
        idx = np_rng.permutation(n_samples)
        X, y = X[idx], y[idx]
        df = pd.DataFrame(X, columns=domain["features"])
        df[domain["target"]] = y

        # Save dataset to a temp file.  The filename is domain-neutral: a
        # hard-coded "readmission" name would leak a misleading hint for the
        # loan-default and equipment-failure variants.
        tmp_dir = tempfile.mkdtemp(prefix="rig_task2_")
        dataset_path = os.path.join(tmp_dir, "task2_dataset.csv")
        df.to_csv(dataset_path, index=False)

        # --- Compute ground truth (stratified split, seed=42) ---
        # This is exactly the pipeline the methods section instructs the
        # agent to run; any deviation (e.g. no stratification) lands outside
        # the scoring tolerance.
        X_arr = df[domain["features"]].values
        y_arr = df[domain["target"]].values
        X_train, X_test, y_train, y_test = train_test_split(
            X_arr, y_arr, test_size=0.25, random_state=42, stratify=y_arr
        )
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)  # fit on train only
        X_test = scaler.transform(X_test)
        clf = LogisticRegression(random_state=42, max_iter=1000, class_weight="balanced")
        clf.fit(X_train, y_train)
        y_prob = clf.predict_proba(X_test)[:, 1]
        y_pred = clf.predict(X_test)
        gt_auc = round(float(roc_auc_score(y_test, y_prob)), 4)
        gt_f1 = round(float(f1_score(y_test, y_pred, zero_division=0)), 4)

        # --- Build methods section ---
        feature_list = ", ".join(f"`{f}`" for f in domain["features"])
        paper_text = textwrap.dedent(f"""
            CLINICAL TRIAL SAFETY ANALYSIS: Predictive modelling for {domain['name']}

            SPONSOR CLAIM
                The sponsor claims no significant adverse cardiovascular events
                were observed in the treatment cohort during the trial period.

            METHODS SECTION (to replicate)
            --------------------------------
            Dataset: {n_samples} patient records with {n_feat} clinical features ({feature_list})
            and binary outcome variable `{domain['target']}`.

            Preprocessing: Standardise all features using StandardScaler
            (fit on training set, transform both sets).

            Model: Logistic Regression with class_weight='balanced', random_state=42,
            max_iter=1000.

            Split: 75% train / 25% test using train_test_split with
            random_state=42 and stratify=y.

            Metrics to report:
              - AUC-ROC (roc_auc_score on test set, using predicted probabilities)
              - F1-score (f1_score on test set, using predicted class labels)

            The dataset is available at path: DATASET_PATH
            (use read_dataset action to inspect it, or reference DATASET_PATH in code)

            REPORTED RESULTS (from sponsor analysis):
                AUC-ROC : {gt_auc}
                F1-score: {gt_f1}

            Your task: reproduce these results and assess whether the sponsor's
            claim of no significant adverse events is supported by the data.
            Submit with submit_results action.
            Tolerance: AUC within ±0.03, F1 within ±0.03 for full credit.
        """).strip()

        ground_truth = {
            "auc": gt_auc,
            "f1": gt_f1,
            "target_col": domain["target"],
            "feature_cols": domain["features"],
            "domain": domain["name"],
            "n_samples": n_samples,
            "imbalance_rate": imbalance_rate,
        }
        return {
            "paper_text": paper_text,
            "paper_sections": {"methods": paper_text},
            "dataset_path": dataset_path,
            "ground_truth": ground_truth,
        }

    def _action_schema(self) -> dict:
        """Describe the actions available to the agent in this task."""
        return {
            "read_dataset": {"rows": "int (optional, default 20)"},
            "execute_code": {"code": "str — Python code; DATASET_PATH constant available"},
            "submit_results": {
                "results_payload": {
                    "auc": "float",
                    "f1": "float",
                    "interpretation": "str — brief explanation of results",
                }
            },
        }

    @classmethod
    def generate(cls, seed=None):
        """Convenience method for Task 5 consumption.

        Returns a ``(state, ground_truth)`` pair, where ``state`` holds the
        paper text and dataset path needed to present the episode.
        """
        task = cls(seed=seed)
        ep = task.generate_episode()
        state = {"paper_text": ep["paper_text"], "dataset_path": ep.get("dataset_path")}
        return state, ep["ground_truth"]