BioRLHF / create_expanded_sft_dataset.py

Initial commit: BioRLHF v0.1.0

c7ebaa1 3 months ago

38.2 kB

	#!/usr/bin/env python3
	"""
	BioRLHF Expanded SFT Dataset Generator
	Creates 200+ instruction-tuning examples from KMP data
	"""

	import json
	import random

	# =============================================================================
	# GROUND TRUTH DATA
	# =============================================================================

	STRESSOR_EFFECTS = {
	'Heart': {'HU': 165, 'IR': 33, 'HU_IR': 910},
	'Hippocampus': {'HU': 1555, 'IR': 5477, 'HU_IR': 5510},
	'Liver': {'HU': 4110, 'IR': 1273, 'HU_IR': 6213},
	'Soleus': {'HU': 6425, 'IR': 67, 'HU_IR': 6830},
	}

	STRESSOR_DIRECTION = {
	'Heart': {'HU': {'up': 67, 'down': 98}, 'IR': {'up': 17, 'down': 16}, 'HU_IR': {'up': 334, 'down': 576}},
	'Hippocampus': {'HU': {'up': 711, 'down': 844}, 'IR': {'up': 2554, 'down': 2923}, 'HU_IR': {'up': 2523, 'down': 2987}},
	'Liver': {'HU': {'up': 2189, 'down': 1921}, 'IR': {'up': 413, 'down': 860}, 'HU_IR': {'up': 2429, 'down': 3784}},
	'Soleus': {'HU': {'up': 3251, 'down': 3174}, 'IR': {'up': 28, 'down': 39}, 'HU_IR': {'up': 3447, 'down': 3383}},
	}

	KMP_EFFECTS = {
	'Heart': {'baseline': 112, 'in_HU': 2, 'in_IR': 2, 'in_HU_IR': 2110},
	'Hippocampus': {'baseline': 4110, 'in_HU': 1, 'in_IR': 243, 'in_HU_IR': 140},
	'Liver': {'baseline': 309, 'in_HU': 17, 'in_IR': 389, 'in_HU_IR': 3},
	'Soleus': {'baseline': 0, 'in_HU': 1, 'in_IR': 52, 'in_HU_IR': 491},
	}

	INTERACTIONS = {
	'Heart': {'HU_x_IR': 244, 'KMP_x_HU': 479, 'KMP_x_IR': 29},
	'Hippocampus': {'HU_x_IR': 93, 'KMP_x_HU': 36, 'KMP_x_IR': 1221},
	'Liver': {'HU_x_IR': 3210, 'KMP_x_HU': 3369, 'KMP_x_IR': 247},
	'Soleus': {'HU_x_IR': 211, 'KMP_x_HU': 8484, 'KMP_x_IR': 484},
	}

	TISSUE_TYPES = {
	'Heart': 'Type A (stress-activated)',
	'Soleus': 'Type A (stress-activated)',
	'Hippocampus': 'Type B (baseline-active)',
	'Liver': 'Type C (stress-blocked)',
	}

	OXPHOS_PATTERNS = {
	'Heart': {'stress_NES': -2.302, 'KMP_NES': 3.691, 'pattern': 'RESCUE'},
	'Hippocampus': {'stress_NES': 0.931, 'KMP_NES': 1.585, 'pattern': 'NS'},
	'Liver': {'stress_NES': 3.596, 'KMP_NES': -1.6, 'pattern': 'SUPPRESSION'},
	'Soleus': {'stress_NES': -2.997, 'KMP_NES': 2.46, 'pattern': 'RESCUE'},
	}

	PATHWAY_DATA = {
	'Heart': {
	'OXIDATIVE_PHOSPHORYLATION': {'stress': -2.302, 'kmp': 3.691, 'pattern': 'RESCUE'},
	'FATTY_ACID_METABOLISM': {'stress': -2.371, 'kmp': 3.1, 'pattern': 'RESCUE'},
	'ADIPOGENESIS': {'stress': -1.839, 'kmp': 2.81, 'pattern': 'RESCUE'},
	'MTORC1_SIGNALING': {'stress': -1.662, 'kmp': 2.585, 'pattern': 'RESCUE'},
	'INTERFERON_ALPHA_RESPONSE': {'stress': -2.072, 'kmp': 1.581, 'pattern': 'RESCUE'},
	},
	'Liver': {
	'OXIDATIVE_PHOSPHORYLATION': {'stress': 3.596, 'kmp': -1.6, 'pattern': 'SUPPRESSION'},
	'MTORC1_SIGNALING': {'stress': 3.075, 'kmp': -1.678, 'pattern': 'SUPPRESSION'},
	'INTERFERON_GAMMA_RESPONSE': {'stress': 1.542, 'kmp': -2.336, 'pattern': 'SUPPRESSION'},
	},
	'Soleus': {
	'OXIDATIVE_PHOSPHORYLATION': {'stress': -2.997, 'kmp': 2.46, 'pattern': 'RESCUE'},
	'FATTY_ACID_METABOLISM': {'stress': -2.418, 'kmp': 1.506, 'pattern': 'RESCUE'},
	}
	}

	HUB_GENES = {
	'Heart': [
	{'gene': 'Alb', 'lfc': 4.26, 'function': 'albumin, carrier protein'},
	{'gene': 'Eda2r', 'lfc': 0.75, 'function': 'ectodysplasin receptor'},
	{'gene': 'Cps1', 'lfc': 3.21, 'function': 'carbamoyl phosphate synthetase'},
	{'gene': 'Cdkn1a', 'lfc': 1.12, 'function': 'p21, cell cycle inhibitor'},
	{'gene': 'Arntl', 'lfc': 1.32, 'function': 'BMAL1, circadian regulator'},
	{'gene': 'Npas2', 'lfc': 1.17, 'function': 'circadian clock gene'},
	{'gene': 'Lcn2', 'lfc': 1.35, 'function': 'lipocalin, acute phase'},
	{'gene': 'Per2', 'lfc': 1.00, 'function': 'period circadian protein'},
	],
	'Soleus': [
	{'gene': 'Myh4', 'lfc': 2.5, 'function': 'myosin heavy chain, fast fiber'},
	{'gene': 'Mybpc2', 'lfc': 1.8, 'function': 'myosin binding protein'},
	{'gene': 'Tnnt3', 'lfc': 1.6, 'function': 'troponin T, fast skeletal'},
	],
	}

	TISSUES = ['Heart', 'Hippocampus', 'Liver', 'Soleus']

	# =============================================================================
	# EXAMPLE GENERATORS
	# =============================================================================

	def generate_examples():
	examples = []

	# =========================================================================
	# CATEGORY 1: FACTUAL RECALL (~60 examples)
	# =========================================================================

	# Basic DEG counts for each stressor × tissue
	for tissue in TISSUES:
	for stressor, count in STRESSOR_EFFECTS[tissue].items():
	stressor_name = {'HU': 'hindlimb unloading', 'IR': 'ionizing radiation', 'HU_IR': 'combined HU+IR stress'}[stressor]
	examples.append({
	"instruction": f"How many DEGs does {stressor_name} cause in {tissue.lower()}?",
	"input": "",
	"output": f"{stressor_name.capitalize()} causes {count:,} differentially expressed genes (padj < 0.05) in {tissue.lower()}."
	})

	# DEG direction
	for tissue in TISSUES:
	dirs = STRESSOR_DIRECTION[tissue]['HU_IR']
	examples.append({
	"instruction": f"What is the direction of gene expression changes in {tissue.lower()} under combined stress?",
	"input": "",
	"output": f"Under combined HU+IR stress, {tissue.lower()} shows {dirs['up']:,} upregulated and {dirs['down']:,} downregulated genes. {'Downregulation predominates' if dirs['down'] > dirs['up'] else 'Upregulation predominates' if dirs['up'] > dirs['down'] else 'Changes are balanced'} ({dirs['up']/(dirs['up']+dirs['down'])*100:.1f}% upregulated)."
	})

	# KMP effects
	for tissue in TISSUES:
	for context, count in KMP_EFFECTS[tissue].items():
	context_name = {'baseline': 'at baseline', 'in_HU': 'under HU stress', 'in_IR': 'under IR stress', 'in_HU_IR': 'under combined HU+IR stress'}[context]
	examples.append({
	"instruction": f"How many DEGs does KMP produce in {tissue.lower()} {context_name}?",
	"input": "",
	"output": f"KMP produces {count:,} DEGs in {tissue.lower()} {context_name}. {f'This indicates strong stress-activated response.' if context == 'in_HU_IR' and count > 100 else f'This indicates minimal/blocked response.' if count < 10 else ''}"
	})

	# Tissue type classification
	for tissue, ttype in TISSUE_TYPES.items():
	examples.append({
	"instruction": f"What is the KMP response classification for {tissue.lower()}?",
	"input": "",
	"output": f"{tissue} is classified as {ttype}. This means KMP shows {'minimal baseline effect but strong stress-activated protection' if 'stress-activated' in ttype else 'strong baseline effect that diminishes under stress' if 'baseline-active' in ttype else 'effects that are blocked by stress pathways'}."
	})

	# Interaction effects
	for tissue in TISSUES:
	for interaction, count in INTERACTIONS[tissue].items():
	int_name = interaction.replace('_x_', ' × ').replace('_', ' ')
	examples.append({
	"instruction": f"What is the {int_name} interaction effect in {tissue.lower()}?",
	"input": "",
	"output": f"The {int_name} interaction in {tissue.lower()} produces {count:,} DEGs. {'This is the largest interaction in the dataset.' if count > 5000 else 'This indicates substantial non-additive effects.' if count > 500 else 'This indicates moderate interaction.'}"
	})

	# OXPHOS patterns
	for tissue, pattern in OXPHOS_PATTERNS.items():
	examples.append({
	"instruction": f"Describe the OXPHOS pathway response in {tissue.lower()}.",
	"input": "",
	"output": f"In {tissue.lower()}, combined stress produces OXPHOS NES = {pattern['stress_NES']:.2f}, while KMP shifts this to NES = {pattern['KMP_NES']:.2f}. Pattern: {pattern['pattern']}. {'KMP restores suppressed mitochondrial function.' if pattern['pattern'] == 'RESCUE' else 'KMP prevents metabolic over-activation (hepatoprotective braking).' if pattern['pattern'] == 'SUPPRESSION' else 'Mixed effects.'}"
	})

	# Hub genes
	for tissue, genes in HUB_GENES.items():
	for gene_info in genes:
	examples.append({
	"instruction": f"What is the role of {gene_info['gene']} in {tissue.lower()} stress response?",
	"input": "",
	"output": f"{gene_info['gene']} ({gene_info['function']}) is a hub gene in {tissue.lower()} with average \|log2FC\| = {gene_info['lfc']:.2f}. It appears significantly changed across multiple comparisons, indicating a central role in the stress/KMP response network."
	})

	# =========================================================================
	# CATEGORY 2: COMPARISON QUESTIONS (~40 examples)
	# =========================================================================

	# Which tissue most sensitive to each stressor
	examples.append({
	"instruction": "Rank tissues by sensitivity to hindlimb unloading.",
	"input": "",
	"output": f"Tissues ranked by HU sensitivity (DEG count):\n1. Soleus: {STRESSOR_EFFECTS['Soleus']['HU']:,}\n2. Liver: {STRESSOR_EFFECTS['Liver']['HU']:,}\n3. Hippocampus: {STRESSOR_EFFECTS['Hippocampus']['HU']:,}\n4. Heart: {STRESSOR_EFFECTS['Heart']['HU']}\n\nSoleus dominates due to severe muscle atrophy from unloading."
	})

	examples.append({
	"instruction": "Rank tissues by sensitivity to ionizing radiation.",
	"input": "",
	"output": f"Tissues ranked by IR sensitivity (DEG count):\n1. Hippocampus: {STRESSOR_EFFECTS['Hippocampus']['IR']:,}\n2. Liver: {STRESSOR_EFFECTS['Liver']['IR']:,}\n3. Heart: {STRESSOR_EFFECTS['Heart']['IR']}\n4. Soleus: {STRESSOR_EFFECTS['Soleus']['IR']}\n\nHippocampus shows extreme IR sensitivity (166× more than soleus)."
	})

	# Cross-tissue KMP comparisons
	for t1, t2 in [('Heart', 'Liver'), ('Heart', 'Soleus'), ('Hippocampus', 'Liver'), ('Soleus', 'Liver')]:
	examples.append({
	"instruction": f"Compare KMP effects between {t1.lower()} and {t2.lower()} under stress.",
	"input": "",
	"output": f"""{t1} ({TISSUE_TYPES[t1]}): {KMP_EFFECTS[t1]['in_HU_IR']:,} DEGs under stress
	{t2} ({TISSUE_TYPES[t2]}): {KMP_EFFECTS[t2]['in_HU_IR']:,} DEGs under stress

	{'Both show stress-activated patterns.' if 'stress-activated' in TISSUE_TYPES[t1] and 'stress-activated' in TISSUE_TYPES[t2] else 'These tissues show opposite pharmacodynamic patterns, indicating tissue-specific KMP mechanisms.'}"""
	})

	# Stressor dominance classification
	examples.append({
	"instruction": "Classify tissues as HU-dominant or IR-dominant.",
	"input": "",
	"output": f"""HU-dominant (mechanical stress sensitivity):
	- Heart: {STRESSOR_EFFECTS['Heart']['HU']} HU vs {STRESSOR_EFFECTS['Heart']['IR']} IR DEGs (ratio: {STRESSOR_EFFECTS['Heart']['HU']/max(1,STRESSOR_EFFECTS['Heart']['IR']):.1f}×)
	- Soleus: {STRESSOR_EFFECTS['Soleus']['HU']:,} HU vs {STRESSOR_EFFECTS['Soleus']['IR']} IR DEGs (ratio: {STRESSOR_EFFECTS['Soleus']['HU']/max(1,STRESSOR_EFFECTS['Soleus']['IR']):.1f}×)

	IR-dominant (radiation sensitivity):
	- Hippocampus: {STRESSOR_EFFECTS['Hippocampus']['HU']:,} HU vs {STRESSOR_EFFECTS['Hippocampus']['IR']:,} IR DEGs (ratio: {STRESSOR_EFFECTS['Hippocampus']['IR']/max(1,STRESSOR_EFFECTS['Hippocampus']['HU']):.1f}×)

	Both stressors (synergistic):
	- Liver: {STRESSOR_EFFECTS['Liver']['HU']:,} HU vs {STRESSOR_EFFECTS['Liver']['IR']:,} IR DEGs"""
	})

	# Compare interaction magnitudes
	examples.append({
	"instruction": "Which tissue shows the largest KMP × stressor interaction?",
	"input": "",
	"output": f"""KMP × HU interactions ranked:
	1. Soleus: {INTERACTIONS['Soleus']['KMP_x_HU']:,} DEGs (largest in dataset)
	2. Liver: {INTERACTIONS['Liver']['KMP_x_HU']:,}
	3. Heart: {INTERACTIONS['Heart']['KMP_x_HU']}
	4. Hippocampus: {INTERACTIONS['Hippocampus']['KMP_x_HU']}

	The massive soleus interaction indicates KMP specifically engages stress pathways in mechanically unloaded muscle."""
	})

	# Compare OXPHOS patterns
	examples.append({
	"instruction": "Compare OXPHOS responses across tissues.",
	"input": "",
	"output": f"""OXPHOS pathway responses:

	RESCUE pattern (Stress↓ → KMP↑):
	- Heart: {OXPHOS_PATTERNS['Heart']['stress_NES']:.1f} → {OXPHOS_PATTERNS['Heart']['KMP_NES']:.1f}
	- Soleus: {OXPHOS_PATTERNS['Soleus']['stress_NES']:.1f} → {OXPHOS_PATTERNS['Soleus']['KMP_NES']:.1f}

	SUPPRESSION pattern (Stress↑ → KMP↓):
	- Liver: {OXPHOS_PATTERNS['Liver']['stress_NES']:.1f} → {OXPHOS_PATTERNS['Liver']['KMP_NES']:.1f}

	Contractile tissues need OXPHOS restoration; liver needs metabolic braking."""
	})

	# More specific comparisons
	for pathway in ['OXIDATIVE_PHOSPHORYLATION', 'FATTY_ACID_METABOLISM']:
	examples.append({
	"instruction": f"Compare {pathway.replace('_', ' ').lower()} pathway response between heart and liver.",
	"input": "",
	"output": f"""Heart: Stress NES = {PATHWAY_DATA['Heart'][pathway]['stress']:.2f}, KMP NES = {PATHWAY_DATA['Heart'][pathway]['kmp']:.2f} → {PATHWAY_DATA['Heart'][pathway]['pattern']}
	Liver: Stress NES = {PATHWAY_DATA['Liver'].get(pathway, {'stress': 'N/A', 'kmp': 'N/A', 'pattern': 'N/A'})['stress']}, KMP NES = {PATHWAY_DATA['Liver'].get(pathway, {'stress': 'N/A', 'kmp': 'N/A', 'pattern': 'N/A'})['kmp']} → {PATHWAY_DATA['Liver'].get(pathway, {'stress': 'N/A', 'kmp': 'N/A', 'pattern': 'N/A'})['pattern']}

	These opposite patterns reflect different metabolic requirements."""
	})

	# =========================================================================
	# CATEGORY 3: INTERACTION PREDICTION (~50 examples)
	# =========================================================================

	# Predict combined stress from main effects
	for tissue in TISSUES:
	hu = STRESSOR_EFFECTS[tissue]['HU']
	ir = STRESSOR_EFFECTS[tissue]['IR']
	combined = STRESSOR_EFFECTS[tissue]['HU_IR']

	examples.append({
	"instruction": f"Given HU causes {hu:,} DEGs and IR causes {ir:,} DEGs in {tissue.lower()}, predict combined effect.",
	"input": f"Main effects in {tissue.lower()}:\n- HU: {hu:,} DEGs\n- IR: {ir:,} DEGs",
	"output": f"""Prediction approach: Simple addition suggests {hu + ir:,} DEGs maximum.

	Actual: {combined:,} DEGs

	Analysis: {'HU-dominated response; IR adds minimal contribution' if ir < hu * 0.1 else 'IR-dominated response' if ir > hu * 3 else 'Both stressors contribute'}.
	{'Sub-additive (pathway overlap)' if combined < hu + ir * 0.8 else 'Approximately additive' if combined < (hu + ir) * 1.2 else 'Super-additive (synergy)'}."""
	})

	# Predict KMP stress effect from baseline
	for tissue in TISSUES:
	baseline = KMP_EFFECTS[tissue]['baseline']
	stressed = KMP_EFFECTS[tissue]['in_HU_IR']

	examples.append({
	"instruction": f"KMP produces {baseline} DEGs at baseline in {tissue.lower()}. Predict effect under combined stress.",
	"input": f"KMP baseline effect: {baseline} DEGs\nTissue type: {TISSUE_TYPES[tissue]}",
	"output": f"""Prediction based on tissue type ({TISSUE_TYPES[tissue]}):
	{'Expect INCREASED effect (stress-activated)' if 'stress-activated' in TISSUE_TYPES[tissue] else 'Expect DECREASED effect (stress-blocked)' if 'stress-blocked' in TISSUE_TYPES[tissue] else 'Expect DECREASED effect (baseline-active)'}

	Actual: {stressed:,} DEGs

	Ratio: {stressed/max(1, baseline):.1f}× {'increase' if stressed > baseline else 'decrease'}
	This {'confirms' if (stressed > baseline and 'stress-activated' in TISSUE_TYPES[tissue]) or (stressed < baseline and 'stress-activated' not in TISSUE_TYPES[tissue]) else 'matches'} the {TISSUE_TYPES[tissue]} classification."""
	})

	# Cross-tissue prediction
	examples.append({
	"instruction": "Heart shows stress-activated KMP response (112→2,110 DEGs). Predict soleus pattern.",
	"input": "Both are striated muscle with similar OXPHOS dependence.",
	"output": f"""Prediction: Soleus should also show stress-activated pattern (Type A).

	Reasoning:
	1. Both are contractile tissues requiring ATP for function
	2. Both show HU sensitivity (mechanical stress)
	3. Similar mitochondrial dependence

	Actual: Soleus is Type A
	- Baseline: {KMP_EFFECTS['Soleus']['baseline']} DEGs
	- Stressed: {KMP_EFFECTS['Soleus']['in_HU_IR']} DEGs

	Confirmed: Both contractile tissues show stress-activated KMP pharmacodynamics with OXPHOS rescue."""
	})

	examples.append({
	"instruction": "Given liver is Type C (stress-blocked), predict hippocampus response pattern.",
	"input": "Both are non-contractile tissues.",
	"output": f"""Prediction: Cannot assume same pattern—tissue function differs.

	Liver (metabolic organ): Type C (stress-blocked)
	- Baseline: {KMP_EFFECTS['Liver']['baseline']} DEGs
	- Stressed: {KMP_EFFECTS['Liver']['in_HU_IR']} DEGs

	Hippocampus (neural tissue): Actually Type B (baseline-active)
	- Baseline: {KMP_EFFECTS['Hippocampus']['baseline']:,} DEGs
	- Stressed: {KMP_EFFECTS['Hippocampus']['in_HU_IR']} DEGs

	Key insight: Non-contractile tissues don't share a common pattern. Must test each tissue type independently."""
	})

	# OXPHOS prediction
	examples.append({
	"instruction": "Heart shows OXPHOS rescue (NES: -2.3→+3.7). Predict liver pattern.",
	"input": "Liver has different metabolic role (detoxification vs pumping).",
	"output": f"""Prediction: Opposite pattern (SUPPRESSION)

	Reasoning:
	1. Heart stress SUPPRESSES OXPHOS → KMP restores
	2. Liver may ACTIVATE OXPHOS under stress (compensatory)
	3. "Rescue" in liver = prevent over-activation

	Actual:
	- Liver stress OXPHOS: NES = +3.6 (ACTIVATED, opposite to heart)
	- Liver KMP OXPHOS: NES = -1.6 (SUPPRESSES)
	- Pattern: SUPPRESSION (hepatoprotective braking)

	Confirmed: Metabolic context determines whether "rescue" means restoration or prevention."""
	})

	# Interaction magnitude prediction
	examples.append({
	"instruction": "Heart KMP×HU interaction = 479 DEGs. Soleus has 40× larger HU main effect. Predict soleus KMP×HU.",
	"input": f"Heart HU effect: {STRESSOR_EFFECTS['Heart']['HU']} DEGs\nSoleus HU effect: {STRESSOR_EFFECTS['Soleus']['HU']:,} DEGs",
	"output": f"""Prediction: Much larger interaction, possibly 5,000-15,000 DEGs

	Reasoning:
	1. Soleus is ~40× more HU-sensitive than heart
	2. Both are Type A (stress-activated)
	3. Larger main effect often → larger interaction potential
	4. Soleus baseline KMP = 0, so ALL effects are stress-dependent

	Actual: {INTERACTIONS['Soleus']['KMP_x_HU']:,} DEGs

	This is {INTERACTIONS['Soleus']['KMP_x_HU']/INTERACTIONS['Heart']['KMP_x_HU']:.0f}× larger than heart—the biggest interaction in the dataset."""
	})

	# More interaction predictions
	for tissue in TISSUES:
	examples.append({
	"instruction": f"Predict whether {tissue.lower()} will show larger KMP×HU or KMP×IR interaction.",
	"input": f"HU effect: {STRESSOR_EFFECTS[tissue]['HU']:,} DEGs\nIR effect: {STRESSOR_EFFECTS[tissue]['IR']:,} DEGs",
	"output": f"""Prediction: {'KMP×HU larger (HU-dominant tissue)' if STRESSOR_EFFECTS[tissue]['HU'] > STRESSOR_EFFECTS[tissue]['IR'] * 3 else 'KMP×IR larger (IR-dominant tissue)' if STRESSOR_EFFECTS[tissue]['IR'] > STRESSOR_EFFECTS[tissue]['HU'] * 3 else 'Comparable interactions expected'}

	Actual:
	- KMP×HU: {INTERACTIONS[tissue]['KMP_x_HU']:,} DEGs
	- KMP×IR: {INTERACTIONS[tissue]['KMP_x_IR']:,} DEGs
	- Ratio: {INTERACTIONS[tissue]['KMP_x_HU']/max(1, INTERACTIONS[tissue]['KMP_x_IR']):.1f}×

	{'Confirmed: Interaction follows main effect dominance.' if (INTERACTIONS[tissue]['KMP_x_HU'] > INTERACTIONS[tissue]['KMP_x_IR'] and STRESSOR_EFFECTS[tissue]['HU'] > STRESSOR_EFFECTS[tissue]['IR']) or (INTERACTIONS[tissue]['KMP_x_IR'] > INTERACTIONS[tissue]['KMP_x_HU'] and STRESSOR_EFFECTS[tissue]['IR'] > STRESSOR_EFFECTS[tissue]['HU']) else 'Note: Interaction pattern differs from main effect dominance.'}"""
	})

	# =========================================================================
	# CATEGORY 4: DESIGN CRITIQUE (~30 examples)
	# =========================================================================

	examples.append({
	"instruction": "Critique: Testing KMP only at baseline to assess spaceflight protection potential.",
	"input": "",
	"output": f"""Major flaw: Would miss KMP's primary protective mechanism.

	Evidence:
	- Heart: {KMP_EFFECTS['Heart']['baseline']} DEGs baseline vs {KMP_EFFECTS['Heart']['in_HU_IR']:,} stressed
	- Soleus: {KMP_EFFECTS['Soleus']['baseline']} DEGs baseline vs {KMP_EFFECTS['Soleus']['in_HU_IR']} stressed

	Baseline-only testing would conclude "KMP has no effect" when it actually shows massive stress-activated protection.

	Recommendation: Always include stress conditions in countermeasure testing."""
	})

	examples.append({
	"instruction": "Critique: Using only heart data to predict KMP effects in all tissues.",
	"input": "",
	"output": f"""Major flaw: Tissue responses are fundamentally different.

	Heart (Type A): {KMP_EFFECTS['Heart']['in_HU_IR']:,} DEGs under stress, OXPHOS rescue
	Liver (Type C): {KMP_EFFECTS['Liver']['in_HU_IR']} DEGs under stress, OXPHOS suppression
	Hippocampus (Type B): {KMP_EFFECTS['Hippocampus']['in_HU_IR']} DEGs under stress

	Extrapolating heart→liver would predict RESCUE when actual pattern is SUPPRESSION.

	Recommendation: Test each target tissue independently."""
	})

	examples.append({
	"instruction": "Critique: Using \|log2FC\| > 2 threshold for DEG detection.",
	"input": "",
	"output": """Potential issue: May miss coordinated small changes with large pathway effects.

	Our analysis shows:
	- Many hub genes have \|log2FC\| < 2 (e.g., Eda2r: 0.75, Cdkn1a: 1.12)
	- OXPHOS rescue (NES shift +6.0) occurs through many genes with modest individual changes
	- Pathway-level effects can be dramatic even with small per-gene changes

	Recommendation:
	- Use padj-only for discovery
	- Report at multiple thresholds
	- Always include pathway analysis
	- Don't conclude "no effect" from stringent gene-level thresholds alone."""
	})

	examples.append({
	"instruction": "Critique: Single time-point measurement for KMP efficacy.",
	"input": "",
	"output": """Limitation: Cannot assess temporal dynamics.

	Missing information:
	1. Onset timing (when does KMP effect begin?)
	2. Duration (how long does protection last?)
	3. Recovery trajectory
	4. Optimal treatment window

	For spaceflight:
	- Missions last months
	- Single time-point may miss transient effects
	- Recovery kinetics matter for crew health

	Recommendation: Time-course studies with multiple sampling points."""
	})

	examples.append({
	"instruction": "Critique: Concluding KMP is 'ineffective' in liver based on 3 DEGs under stress.",
	"input": "",
	"output": f"""Flawed conclusion: Low DEG count ≠ no effect.

	Observations:
	1. Liver shows {KMP_EFFECTS['Liver']['in_HU_IR']} DEGs under stress (seemingly minimal)
	2. BUT: KMP×HU interaction = {INTERACTIONS['Liver']['KMP_x_HU']:,} DEGs (substantial!)
	3. OXPHOS pathway shows clear SUPPRESSION pattern

	What's happening:
	- KMP effect is MASKED by stress, not absent
	- Pathway-level modulation still occurs
	- Different pharmacodynamic mechanism (prevention vs activation)

	Correct interpretation: Liver shows stress-blocked KMP transcriptional response, but pathway modulation still occurs through different mechanisms."""
	})

	examples.append({
	"instruction": "Evaluate: Proposing KMP human trials based solely on mouse data.",
	"input": "",
	"output": """Assessment: Premature without additional studies.

	What mouse data provides:
	✓ Proof of concept (stress-activated protection)
	✓ Tissue-specific patterns identified
	✓ Mechanistic pathway data
	✓ Dose-response in mice

	What's still needed:
	✗ Human pharmacokinetics
	✗ Human tissue responses
	✗ True microgravity effects (HU is analog)
	✗ Long-duration effects
	✗ Safety profile
	✗ Drug-drug interactions

	Recommended path:
	1. Human cell/organoid studies
	2. PK modeling
	3. Ground-based human analog studies
	4. Then Phase I trials

	Mouse data provides rationale, not sufficient evidence for direct human trials."""
	})

	# =========================================================================
	# CATEGORY 5: MECHANISTIC REASONING (~30 examples)
	# =========================================================================

	examples.append({
	"instruction": "Explain why soleus shows 0 KMP DEGs at baseline but 491 under stress.",
	"input": "",
	"output": """This "stress-activated" pattern indicates conditional pharmacodynamics:

	Mechanism hypotheses:

	1. Target availability: KMP receptors/targets may be expressed only under stress
	- Mechanical unloading activates mechanosensitive gene programs
	- Creates binding sites for KMP-responsive transcription factors

	2. Pathway priming: Stress activates pathways that synergize with KMP
	- Oxidative stress → antioxidant response elements accessible
	- Inflammation → NF-κB pathway engagement

	3. Metabolic context: Mitochondrial dysfunction creates KMP targets
	- Normal mitochondria: No dysfunction to rescue
	- Stressed mitochondria: OXPHOS suppression creates rescue opportunity

	Implication: KMP is a conditional countermeasure—active when needed, dormant otherwise. This is advantageous: minimal side effects in healthy tissue."""
	})

	examples.append({
	"instruction": "Why does liver show OXPHOS SUPPRESSION while heart shows OXPHOS RESCUE?",
	"input": "",
	"output": f"""Opposite patterns reflect different metabolic contexts:

	Heart (RESCUE):
	- Function: Continuous contraction requires ATP
	- Stress effect: OXPHOS suppressed (NES = {OXPHOS_PATTERNS['Heart']['stress_NES']:.1f})
	- Problem: Energy deficit → contractile failure
	- KMP solution: Restore OXPHOS (NES → {OXPHOS_PATTERNS['Heart']['KMP_NES']:.1f})

	Liver (SUPPRESSION):
	- Function: Metabolic processing, detoxification
	- Stress effect: OXPHOS activated (NES = {OXPHOS_PATTERNS['Liver']['stress_NES']:.1f})
	- Problem: Over-activation → ROS, oxidative damage
	- KMP solution: Brake metabolism (NES → {OXPHOS_PATTERNS['Liver']['KMP_NES']:.1f})

	Key insight: "Protection" means different things:
	- Heart: Restore lost function
	- Liver: Prevent harmful over-activation

	This is why cross-tissue drug extrapolation is dangerous without mechanistic understanding."""
	})

	examples.append({
	"instruction": "Explain the biological significance of the 8,484 DEG KMP×HU interaction in soleus.",
	"input": "",
	"output": f"""This massive interaction ({INTERACTIONS['Soleus']['KMP_x_HU']:,} DEGs) is the largest in the dataset.

	What it means mathematically:
	- KMP alone: {KMP_EFFECTS['Soleus']['baseline']} DEGs
	- HU alone: {STRESSOR_EFFECTS['Soleus']['HU']:,} DEGs (many genes, but different set)
	- Interaction: 8,484 DEGs that are NEITHER KMP-only NOR HU-only

	Biological interpretation:

	1. Emergent pharmacology: KMP creates effects that ONLY exist in unloaded muscle

	2. Mechanosensitive activation: Unloading activates pathways that:
	- Create KMP binding sites
	- Express KMP target genes
	- Enable KMP transcriptional effects

	3. Myokine signaling: Unloaded muscle secretes different myokines that may synergize with KMP

	4. Mitochondrial crosstalk: KMP's mitochondrial effects only manifest when organelles are stressed

	Therapeutic implication: This interaction is exactly what you want—a countermeasure that specifically engages when the tissue is stressed, not when it's healthy."""
	})

	examples.append({
	"instruction": "Why do circadian genes (Arntl, Per2, Npas2) appear as hub genes in heart?",
	"input": "",
	"output": """Circadian genes as hub genes suggests spaceflight disrupts daily rhythms:

	Biological connection:

	1. Circadian-metabolic coupling:
	- Heart metabolism is circadian-regulated
	- BMAL1 (Arntl) controls mitochondrial biogenesis
	- PER2 regulates fatty acid metabolism

	2. Spaceflight disruption:
	- Altered light cycles
	- Shifted sleep patterns
	- Stress hormone rhythms disrupted

	3. KMP circadian effects:
	- Flavonoids can modulate circadian machinery
	- May help resynchronize metabolic rhythms
	- OXPHOS rescue could occur through circadian normalization

	Implication: KMP's cardiac protection may involve circadian rhythm restoration, not just direct metabolic effects. This suggests timing of KMP administration might matter."""
	})

	# =========================================================================
	# CATEGORY 6: UNCERTAINTY CALIBRATION (~25 examples)
	# =========================================================================

	examples.append({
	"instruction": "What is the effect of KMP on cardiac contractility?",
	"input": "",
	"output": """Cannot directly answer from transcriptomic data.

	What data shows:
	- 2,110 DEGs in heart under stress
	- OXPHOS rescue (NES +6.0 shift)
	- Circadian gene normalization

	What data does NOT show:
	- Actual force generation
	- Calcium handling
	- Protein-level changes
	- Ejection fraction

	Reasonable inference (with uncertainty):
	- OXPHOS rescue suggests improved ATP availability
	- ATP is required for contraction
	- Therefore contractility MIGHT improve

	Confidence: Low for functional claims, high for transcriptional patterns

	Would need: Echocardiography, isolated heart studies, or cardiomyocyte assays."""
	})

	examples.append({
	"instruction": "Will KMP prevent muscle atrophy in astronauts?",
	"input": "",
	"output": """Cannot confidently answer this question.

	Supportive mouse data:
	✓ 491 DEGs in soleus under stress (stress-activated)
	✓ 8,484 DEG KMP×HU interaction
	✓ OXPHOS rescue pattern
	✓ 86.6% upregulated genes (anabolic signature)

	Unknown factors:
	✗ Human pharmacokinetics
	✗ Required human dose
	✗ True microgravity vs HU analog
	✗ 6+ month mission duration effects
	✗ Translation of transcriptomics to muscle mass

	My assessment:
	- Strong mechanistic rationale: YES
	- Sufficient evidence for human use: NO

	Recommended next steps: Human muscle organoids, dose-finding, analog studies."""
	})

	examples.append({
	"instruction": "Is the exact DEG count of 2,110 in heart reliable?",
	"input": "",
	"output": """The number is precise but context-dependent.

	2,110 is:
	- Exact count at padj < 0.05 threshold
	- Specific to this analysis pipeline
	- Reproducible with same parameters

	Sources of variation:
	1. Threshold choice (log2FC filter would change count)
	2. Normalization method affects p-values
	3. Multiple testing correction method
	4. Biological variation between replicates

	Appropriate statements:
	✓ "Approximately 2,100 DEGs"
	✓ "About 2,000 significantly changed genes"
	✗ "Exactly 2,110 is the true biological count"

	The order of magnitude and relative ranking across tissues is robust; exact number is method-dependent."""
	})

	examples.append({
	"instruction": "Which gene is most important for KMP's protective effect?",
	"input": "",
	"output": """Cannot definitively answer from transcriptomic data.

	Why we can't identify "most important":
	1. Large change ≠ causal role (correlation ≠ causation)
	2. Pathway redundancy (multiple genes, same function)
	3. No functional validation (knockdown/knockout)
	4. Tissue-specificity (different genes in different tissues)

	What we CAN say:
	- Hub genes show largest/most consistent effects
	- Heart: Cdkn1a, Arntl appear frequently
	- Pathway level: OXPHOS genes collectively important

	To identify causal genes would need:
	- CRISPR screening
	- Pharmacological inhibitors
	- Time-course (early responders)
	- Cross-species conservation

	Transcriptomics identifies candidates; functional studies identify mechanisms."""
	})

	examples.append({
	"instruction": "What dose of KMP should humans take?",
	"input": "",
	"output": """This data cannot inform human dosing.

	What's missing:
	1. Dose-response relationship (only one dose tested)
	2. Human pharmacokinetics (ADME)
	3. Therapeutic window (efficacy vs toxicity)
	4. Bioavailability differences
	5. Target tissue concentrations

	Why mouse→human scaling is unreliable:
	- Metabolic rate differences
	- Body surface area scaling often off by 10-100×
	- Species-specific drug metabolism
	- Individual human variation

	Required studies for dosing:
	1. Mouse dose-response
	2. PK modeling
	3. Human cell studies
	4. Phase I safety/PK
	5. Then efficacy trials

	Do not extrapolate: Mechanism supports development; dosing requires dedicated PK/PD work."""
	})

	# =========================================================================
	# ADDITIONAL EXAMPLES TO REACH 200+
	# =========================================================================

	# Specific pathway questions
	for tissue in ['Heart', 'Liver']:
	for pathway, data in PATHWAY_DATA.get(tissue, {}).items():
	examples.append({
	"instruction": f"What is the {pathway.replace('_', ' ')} NES in {tissue.lower()} under stress?",
	"input": "",
	"output": f"Under combined HU+IR stress, {tissue.lower()} shows {pathway.replace('_', ' ')} NES = {data['stress']:.2f}. After KMP treatment, NES shifts to {data['kmp']:.2f}. Pattern: {data['pattern']}."
	})

	# Binary comparison questions
	comparisons = [
	("Heart", "Soleus", "HU sensitivity", lambda t: STRESSOR_EFFECTS[t]['HU']),
	("Heart", "Hippocampus", "IR sensitivity", lambda t: STRESSOR_EFFECTS[t]['IR']),
	("Liver", "Soleus", "KMP effect under stress", lambda t: KMP_EFFECTS[t]['in_HU_IR']),
	]

	for t1, t2, metric, func in comparisons:
	v1, v2 = func(t1), func(t2)
	examples.append({
	"instruction": f"Which has higher {metric}: {t1.lower()} or {t2.lower()}?",
	"input": "",
	"output": f"{t1 if v1 > v2 else t2} has higher {metric} ({max(v1,v2):,} vs {min(v1,v2):,} DEGs)."
	})

	# Yes/No questions
	yn_questions = [
	("Is heart more sensitive to HU than IR?", STRESSOR_EFFECTS['Heart']['HU'] > STRESSOR_EFFECTS['Heart']['IR'], f"Yes. Heart shows {STRESSOR_EFFECTS['Heart']['HU']} HU DEGs vs {STRESSOR_EFFECTS['Heart']['IR']} IR DEGs."),
	("Does KMP show stress-activated response in liver?", False, f"No. Liver is Type C (stress-blocked): {KMP_EFFECTS['Liver']['baseline']} DEGs at baseline → {KMP_EFFECTS['Liver']['in_HU_IR']} under stress."),
	("Is the KMP×HU interaction larger than KMP×IR in soleus?", INTERACTIONS['Soleus']['KMP_x_HU'] > INTERACTIONS['Soleus']['KMP_x_IR'], f"Yes. KMP×HU = {INTERACTIONS['Soleus']['KMP_x_HU']:,} vs KMP×IR = {INTERACTIONS['Soleus']['KMP_x_IR']} DEGs."),
	("Does hippocampus show OXPHOS rescue?", False, "No. Hippocampus shows minimal stress effect on OXPHOS (NES = 0.93, NS). Cannot rescue what isn't suppressed."),
	]

	for q, answer, explanation in yn_questions:
	examples.append({
	"instruction": q,
	"input": "",
	"output": explanation
	})

	return examples


	def format_for_training(examples):
	"""Format examples for SFT training."""
	formatted = []
	for ex in examples:
	if ex.get('input'):
	text = f"""### Instruction:
	{ex['instruction']}

	### Input:
	{ex['input']}

	### Response:
	{ex['output']}"""
	else:
	text = f"""### Instruction:
	{ex['instruction']}

	### Response:
	{ex['output']}"""
	formatted.append({"text": text})
	return formatted


	def main():
	print("Generating expanded SFT dataset...")
	examples = generate_examples()
	formatted = format_for_training(examples)

	# Save
	with open('kmp_sft_dataset.json', 'w') as f:
	json.dump(formatted, f, indent=2)

	print(f"\n{'='*60}")
	print(f"SFT Dataset Summary")
	print(f"{'='*60}")
	print(f"Total examples: {len(formatted)}")
	print(f"Output: kmp_sft_dataset.json")

	# Count by approximate category (based on keywords)
	categories = {
	'Factual': 0, 'Comparison': 0, 'Prediction': 0,
	'Critique': 0, 'Mechanistic': 0, 'Calibration': 0
	}
	for ex in examples:
	inst = ex['instruction'].lower()
	if 'how many' in inst or 'what is the' in inst or 'describe' in inst:
	categories['Factual'] += 1
	elif 'compare' in inst or 'rank' in inst or 'which' in inst:
	categories['Comparison'] += 1
	elif 'predict' in inst or 'given' in inst:
	categories['Prediction'] += 1
	elif 'critique' in inst or 'evaluate' in inst:
	categories['Critique'] += 1
	elif 'explain' in inst or 'why' in inst:
	categories['Mechanistic'] += 1
	else:
	categories['Calibration'] += 1

	print(f"\nApproximate category breakdown:")
	for cat, count in categories.items():
	print(f" - {cat}: {count}")


	if __name__ == "__main__":
	main()