Spaces:

Nexus18
/

research-integrity-gym

Sleeping

research-integrity-gym / tasks /task2_replication.py

Bhavishya011

refactor: PeerGuard clinical trial verification system

8122ba9 14 days ago

7.16 kB

	"""
	Task 2: Experiment Replication — MEDIUM (PeerGuard: Adverse Event Analysis)
	----------------------------------------
	Agent receives a methods section describing a logistic regression experiment
	on a tabular CSV dataset. The sponsor claims no significant adverse events.
	The agent must write and run code to replicate the reported AUC-ROC and F1
	score.

	Key challenge: the dataset has class imbalance. A naive model trained without
	stratified splitting will score ~0.71 AUC — outside the pass threshold.
	The agent must notice the imbalance and handle it correctly.

	The dataset is generated procedurally each episode from a random seed,
	varying the feature names and imbalance ratio to prevent memorisation.
	"""
	from __future__ import annotations

	import os
	import tempfile
	import textwrap
	from typing import Optional

	import numpy as np
	import pandas as pd
	from sklearn.linear_model import LogisticRegression
	from sklearn.metrics import f1_score, roc_auc_score
	from sklearn.model_selection import train_test_split
	from sklearn.preprocessing import StandardScaler

	from tasks.base import BaseTask


	class ReplicationTask(BaseTask):
	    """Medium-difficulty task: replicate a reported logistic-regression result.

	    Each episode writes a synthetic, class-imbalanced CSV into a fresh temp
	    directory, computes reference AUC-ROC / F1 values using a stratified
	    split, and embeds those values in a methods text the agent must
	    reproduce.
	    """

	    # Task metadata (how these are consumed is defined outside this class).
	    task_id = "task2_replication"
	    task_name = "Adverse Event Replication Analysis"
	    difficulty = "medium"
	    max_steps = 20

	    def generate_episode(self) -> dict:
	        """Create one episode: dataset file, methods text and ground truth.

	        Returns:
	            A dict with keys ``paper_text`` (the prompt), ``paper_sections``
	            (the same text under ``"methods"``), ``dataset_path`` (CSV on
	            disk) and ``ground_truth`` (reference metrics plus metadata).
	        """
	        # NOTE(review): self.rng is presumably a random.Random-like object
	        # supplied by BaseTask -- confirm. The order of draws below matters
	        # for reproducibility and must not be rearranged.
	        episode_rng = self.rng
	        array_rng = np.random.RandomState(episode_rng.randint(0, 2**31 - 1))

	        # --- Pick the surface domain (column names only; data is synthetic) ---
	        candidate_domains = [
	            {
	                "name": "patient readmission",
	                "features": [
	                    "age",
	                    "num_prior_admissions",
	                    "los_days",
	                    "num_medications",
	                    "comorbidity_score",
	                ],
	                "target": "readmitted_30d",
	            },
	            {
	                "name": "loan default",
	                "features": [
	                    "credit_score",
	                    "debt_to_income",
	                    "loan_amount",
	                    "employment_years",
	                    "num_open_accounts",
	                ],
	                "target": "defaulted",
	            },
	            {
	                "name": "equipment failure",
	                "features": [
	                    "operating_hours",
	                    "temperature_avg",
	                    "vibration_rms",
	                    "pressure_delta",
	                    "maintenance_lag_days",
	                ],
	                "target": "failure_within_30d",
	            },
	        ]
	        chosen = episode_rng.choice(candidate_domains)
	        domain_name = chosen["name"]
	        feature_cols = chosen["features"]
	        target_col = chosen["target"]

	        n_samples = 400
	        # Roughly 20% of rows belong to the positive class.
	        imbalance_rate = episode_rng.choice([0.18, 0.20, 0.22])
	        n_pos = int(n_samples * imbalance_rate)
	        n_neg = n_samples - n_pos

	        # Features are independent standard normals; the positive class gets
	        # a per-feature mean shift drawn from U(0.4, 0.7). Negatives are
	        # drawn first, then positives, then the shift.
	        n_features = len(feature_cols)
	        negatives = array_rng.randn(n_neg, n_features)
	        positives_raw = array_rng.randn(n_pos, n_features)
	        class_shift = array_rng.uniform(0.4, 0.7, n_features)
	        positives = positives_raw + class_shift

	        features = np.vstack([negatives, positives])
	        labels = np.repeat([0, 1], [n_neg, n_pos])

	        # Shuffle rows so the classes are interleaved in the CSV.
	        order = array_rng.permutation(n_samples)
	        features = features[order]
	        labels = labels[order]

	        frame = pd.DataFrame(features, columns=feature_cols)
	        frame[target_col] = labels

	        # Persist the dataset; the directory is not removed by this method.
	        out_dir = tempfile.mkdtemp(prefix="rig_task2_")
	        dataset_path = os.path.join(out_dir, "task2_readmission.csv")
	        frame.to_csv(dataset_path, index=False)

	        # --- Reference metrics (stratified 75/25 split, seed=42) ---
	        X_all = frame[feature_cols].values
	        y_all = frame[target_col].values

	        X_train, X_test, y_train, y_test = train_test_split(
	            X_all,
	            y_all,
	            test_size=0.25,
	            random_state=42,
	            stratify=y_all,
	        )

	        # Scaling statistics come from the training portion only.
	        scaler = StandardScaler().fit(X_train)
	        X_train = scaler.transform(X_train)
	        X_test = scaler.transform(X_test)

	        model = LogisticRegression(
	            random_state=42,
	            max_iter=1000,
	            class_weight="balanced",
	        )
	        model.fit(X_train, y_train)

	        positive_scores = model.predict_proba(X_test)[:, 1]
	        predicted_labels = model.predict(X_test)

	        gt_auc = round(float(roc_auc_score(y_test, positive_scores)), 4)
	        gt_f1 = round(
	            float(f1_score(y_test, predicted_labels, zero_division=0)), 4
	        )

	        # --- Methods text shown to the agent ---
	        quoted_features = ", ".join(f"`{col}`" for col in feature_cols)
	        paper_text = textwrap.dedent(f"""
	            CLINICAL TRIAL SAFETY ANALYSIS: Predictive modelling for {domain_name}

	            SPONSOR CLAIM
	            The sponsor claims no significant adverse cardiovascular events
	            were observed in the treatment cohort during the trial period.

	            METHODS SECTION (to replicate)
	            --------------------------------
	            Dataset: {n_samples} patient records with {n_features} clinical features ({quoted_features})
	            and binary outcome variable `{target_col}`.

	            Preprocessing: Standardise all features using StandardScaler
	            (fit on training set, transform both sets).

	            Model: Logistic Regression with class_weight='balanced', random_state=42,
	            max_iter=1000.

	            Split: 75% train / 25% test using train_test_split with
	            random_state=42 and stratify=y.

	            Metrics to report:
	            - AUC-ROC (roc_auc_score on test set, using predicted probabilities)
	            - F1-score (f1_score on test set, using predicted class labels)

	            The dataset is available at path: DATASET_PATH
	            (use read_dataset action to inspect it, or reference DATASET_PATH in code)

	            REPORTED RESULTS (from sponsor analysis):
	            AUC-ROC : {gt_auc}
	            F1-score: {gt_f1}

	            Your task: reproduce these results and assess whether the sponsor's
	            claim of no significant adverse events is supported by the data.
	            Submit with submit_results action.
	            Tolerance: AUC within ±0.03, F1 within ±0.03 for full credit.
	            """).strip()

	        ground_truth = {
	            "auc": gt_auc,
	            "f1": gt_f1,
	            "target_col": target_col,
	            "feature_cols": feature_cols,
	            "domain": domain_name,
	            "n_samples": n_samples,
	            "imbalance_rate": imbalance_rate,
	        }

	        episode = {
	            "paper_text": paper_text,
	            "paper_sections": {"methods": paper_text},
	            "dataset_path": dataset_path,
	            "ground_truth": ground_truth,
	        }
	        return episode

	    def _action_schema(self) -> dict:
	        """Describe the actions available in this task and their arguments."""
	        submission_fields = {
	            "auc": "float",
	            "f1": "float",
	            "interpretation": "str — brief explanation of results",
	        }
	        schema = {
	            "read_dataset": {"rows": "int (optional, default 20)"},
	            "execute_code": {
	                "code": "str — Python code; DATASET_PATH constant available"
	            },
	            "submit_results": {"results_payload": submission_fields},
	        }
	        return schema

	    @classmethod
	    def generate(cls, seed=None):
	        """Build one episode and return ``(state, ground_truth)``.

	        Convenience entry point intended for Task 5 consumption: ``state``
	        carries only the prompt text and the dataset path.
	        """
	        episode = cls(seed=seed).generate_episode()
	        state = {
	            "paper_text": episode["paper_text"],
	            "dataset_path": episode.get("dataset_path"),
	        }
	        return state, episode["ground_truth"]