""" Task 2: Experiment Replication — MEDIUM (PeerGuard: Adverse Event Analysis) ---------------------------------------- Agent receives a methods section describing a logistic regression experiment on a tabular CSV dataset. The sponsor claims no significant adverse events. The agent must write and run code to replicate the reported AUC-ROC and F1 score. Key challenge: the dataset has class imbalance. A naive model trained without stratified splitting will score ~0.71 AUC — outside the pass threshold. The agent must notice the imbalance and handle it correctly. The dataset is generated procedurally each episode with random seed, varying feature names and imbalance ratio to prevent memorisation. """ from __future__ import annotations import os import tempfile import textwrap from typing import Optional import numpy as np import pandas as pd from sklearn.linear_model import LogisticRegression from sklearn.metrics import f1_score, roc_auc_score from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler from tasks.base import BaseTask class ReplicationTask(BaseTask): task_id = "task2_replication" task_name = "Adverse Event Replication Analysis" difficulty = "medium" max_steps = 20 def generate_episode(self) -> dict: rng = self.rng np_rng = np.random.RandomState(rng.randint(0, 2**31 - 1)) # --- Procedurally vary the domain --- domains = [ { "name": "patient readmission", "features": ["age", "num_prior_admissions", "los_days", "num_medications", "comorbidity_score"], "target": "readmitted_30d", }, { "name": "loan default", "features": ["credit_score", "debt_to_income", "loan_amount", "employment_years", "num_open_accounts"], "target": "defaulted", }, { "name": "equipment failure", "features": ["operating_hours", "temperature_avg", "vibration_rms", "pressure_delta", "maintenance_lag_days"], "target": "failure_within_30d", }, ] domain = rng.choice(domains) n_samples = 400 imbalance_rate = rng.choice([0.18, 0.20, 0.22]) # ~20% positive class n_pos = int(n_samples * imbalance_rate) n_neg = n_samples - n_pos # Generate correlated features n_feat = len(domain["features"]) X_neg = np_rng.randn(n_neg, n_feat) X_pos = np_rng.randn(n_pos, n_feat) + np_rng.uniform(0.4, 0.7, n_feat) X = np.vstack([X_neg, X_pos]) y = np.array([0] * n_neg + [1] * n_pos) # Shuffle idx = np_rng.permutation(n_samples) X, y = X[idx], y[idx] df = pd.DataFrame(X, columns=domain["features"]) df[domain["target"]] = y # Save dataset to temp file tmp_dir = tempfile.mkdtemp(prefix="rig_task2_") dataset_path = os.path.join(tmp_dir, "task2_readmission.csv") df.to_csv(dataset_path, index=False) # --- Compute ground truth (stratified split, seed=42) --- X_arr = df[domain["features"]].values y_arr = df[domain["target"]].values X_train, X_test, y_train, y_test = train_test_split( X_arr, y_arr, test_size=0.25, random_state=42, stratify=y_arr ) scaler = StandardScaler() X_train = scaler.fit_transform(X_train) X_test = scaler.transform(X_test) clf = LogisticRegression(random_state=42, max_iter=1000, class_weight="balanced") clf.fit(X_train, y_train) y_prob = clf.predict_proba(X_test)[:, 1] y_pred = clf.predict(X_test) gt_auc = round(float(roc_auc_score(y_test, y_prob)), 4) gt_f1 = round(float(f1_score(y_test, y_pred, zero_division=0)), 4) # --- Build methods section --- feature_list = ", ".join(f"`{f}`" for f in domain["features"]) paper_text = textwrap.dedent(f""" CLINICAL TRIAL SAFETY ANALYSIS: Predictive modelling for {domain['name']} SPONSOR CLAIM The sponsor claims no significant adverse cardiovascular events were observed in the treatment cohort during the trial period. METHODS SECTION (to replicate) -------------------------------- Dataset: {n_samples} patient records with {n_feat} clinical features ({feature_list}) and binary outcome variable `{domain['target']}`. Preprocessing: Standardise all features using StandardScaler (fit on training set, transform both sets). Model: Logistic Regression with class_weight='balanced', random_state=42, max_iter=1000. Split: 75% train / 25% test using train_test_split with random_state=42 and stratify=y. Metrics to report: - AUC-ROC (roc_auc_score on test set, using predicted probabilities) - F1-score (f1_score on test set, using predicted class labels) The dataset is available at path: DATASET_PATH (use read_dataset action to inspect it, or reference DATASET_PATH in code) REPORTED RESULTS (from sponsor analysis): AUC-ROC : {gt_auc} F1-score: {gt_f1} Your task: reproduce these results and assess whether the sponsor's claim of no significant adverse events is supported by the data. Submit with submit_results action. Tolerance: AUC within ±0.03, F1 within ±0.03 for full credit. """).strip() ground_truth = { "auc": gt_auc, "f1": gt_f1, "target_col": domain["target"], "feature_cols": domain["features"], "domain": domain["name"], "n_samples": n_samples, "imbalance_rate": imbalance_rate, } return { "paper_text": paper_text, "paper_sections": {"methods": paper_text}, "dataset_path": dataset_path, "ground_truth": ground_truth, } def _action_schema(self) -> dict: return { "read_dataset": {"rows": "int (optional, default 20)"}, "execute_code": {"code": "str — Python code; DATASET_PATH constant available"}, "submit_results": { "results_payload": { "auc": "float", "f1": "float", "interpretation": "str — brief explanation of results", } }, } @classmethod def generate(cls, seed=None): """Convenience method for Task 5 consumption.""" task = cls(seed=seed) ep = task.generate_episode() state = {"paper_text": ep["paper_text"], "dataset_path": ep.get("dataset_path")} return state, ep["ground_truth"]