#!/usr/bin/env python3 """ BioRLHF Expanded SFT Dataset Generator Creates 200+ instruction-tuning examples from KMP data """ import json import random # ============================================================================= # GROUND TRUTH DATA # ============================================================================= STRESSOR_EFFECTS = { 'Heart': {'HU': 165, 'IR': 33, 'HU_IR': 910}, 'Hippocampus': {'HU': 1555, 'IR': 5477, 'HU_IR': 5510}, 'Liver': {'HU': 4110, 'IR': 1273, 'HU_IR': 6213}, 'Soleus': {'HU': 6425, 'IR': 67, 'HU_IR': 6830}, } STRESSOR_DIRECTION = { 'Heart': {'HU': {'up': 67, 'down': 98}, 'IR': {'up': 17, 'down': 16}, 'HU_IR': {'up': 334, 'down': 576}}, 'Hippocampus': {'HU': {'up': 711, 'down': 844}, 'IR': {'up': 2554, 'down': 2923}, 'HU_IR': {'up': 2523, 'down': 2987}}, 'Liver': {'HU': {'up': 2189, 'down': 1921}, 'IR': {'up': 413, 'down': 860}, 'HU_IR': {'up': 2429, 'down': 3784}}, 'Soleus': {'HU': {'up': 3251, 'down': 3174}, 'IR': {'up': 28, 'down': 39}, 'HU_IR': {'up': 3447, 'down': 3383}}, } KMP_EFFECTS = { 'Heart': {'baseline': 112, 'in_HU': 2, 'in_IR': 2, 'in_HU_IR': 2110}, 'Hippocampus': {'baseline': 4110, 'in_HU': 1, 'in_IR': 243, 'in_HU_IR': 140}, 'Liver': {'baseline': 309, 'in_HU': 17, 'in_IR': 389, 'in_HU_IR': 3}, 'Soleus': {'baseline': 0, 'in_HU': 1, 'in_IR': 52, 'in_HU_IR': 491}, } INTERACTIONS = { 'Heart': {'HU_x_IR': 244, 'KMP_x_HU': 479, 'KMP_x_IR': 29}, 'Hippocampus': {'HU_x_IR': 93, 'KMP_x_HU': 36, 'KMP_x_IR': 1221}, 'Liver': {'HU_x_IR': 3210, 'KMP_x_HU': 3369, 'KMP_x_IR': 247}, 'Soleus': {'HU_x_IR': 211, 'KMP_x_HU': 8484, 'KMP_x_IR': 484}, } TISSUE_TYPES = { 'Heart': 'Type A (stress-activated)', 'Soleus': 'Type A (stress-activated)', 'Hippocampus': 'Type B (baseline-active)', 'Liver': 'Type C (stress-blocked)', } OXPHOS_PATTERNS = { 'Heart': {'stress_NES': -2.302, 'KMP_NES': 3.691, 'pattern': 'RESCUE'}, 'Hippocampus': {'stress_NES': 0.931, 'KMP_NES': 1.585, 'pattern': 'NS'}, 'Liver': {'stress_NES': 3.596, 'KMP_NES': -1.6, 'pattern': 'SUPPRESSION'}, 'Soleus': {'stress_NES': -2.997, 'KMP_NES': 2.46, 'pattern': 'RESCUE'}, } PATHWAY_DATA = { 'Heart': { 'OXIDATIVE_PHOSPHORYLATION': {'stress': -2.302, 'kmp': 3.691, 'pattern': 'RESCUE'}, 'FATTY_ACID_METABOLISM': {'stress': -2.371, 'kmp': 3.1, 'pattern': 'RESCUE'}, 'ADIPOGENESIS': {'stress': -1.839, 'kmp': 2.81, 'pattern': 'RESCUE'}, 'MTORC1_SIGNALING': {'stress': -1.662, 'kmp': 2.585, 'pattern': 'RESCUE'}, 'INTERFERON_ALPHA_RESPONSE': {'stress': -2.072, 'kmp': 1.581, 'pattern': 'RESCUE'}, }, 'Liver': { 'OXIDATIVE_PHOSPHORYLATION': {'stress': 3.596, 'kmp': -1.6, 'pattern': 'SUPPRESSION'}, 'MTORC1_SIGNALING': {'stress': 3.075, 'kmp': -1.678, 'pattern': 'SUPPRESSION'}, 'INTERFERON_GAMMA_RESPONSE': {'stress': 1.542, 'kmp': -2.336, 'pattern': 'SUPPRESSION'}, }, 'Soleus': { 'OXIDATIVE_PHOSPHORYLATION': {'stress': -2.997, 'kmp': 2.46, 'pattern': 'RESCUE'}, 'FATTY_ACID_METABOLISM': {'stress': -2.418, 'kmp': 1.506, 'pattern': 'RESCUE'}, } } HUB_GENES = { 'Heart': [ {'gene': 'Alb', 'lfc': 4.26, 'function': 'albumin, carrier protein'}, {'gene': 'Eda2r', 'lfc': 0.75, 'function': 'ectodysplasin receptor'}, {'gene': 'Cps1', 'lfc': 3.21, 'function': 'carbamoyl phosphate synthetase'}, {'gene': 'Cdkn1a', 'lfc': 1.12, 'function': 'p21, cell cycle inhibitor'}, {'gene': 'Arntl', 'lfc': 1.32, 'function': 'BMAL1, circadian regulator'}, {'gene': 'Npas2', 'lfc': 1.17, 'function': 'circadian clock gene'}, {'gene': 'Lcn2', 'lfc': 1.35, 'function': 'lipocalin, acute phase'}, {'gene': 'Per2', 'lfc': 1.00, 'function': 'period circadian protein'}, ], 'Soleus': [ {'gene': 'Myh4', 'lfc': 2.5, 'function': 'myosin heavy chain, fast fiber'}, {'gene': 'Mybpc2', 'lfc': 1.8, 'function': 'myosin binding protein'}, {'gene': 'Tnnt3', 'lfc': 1.6, 'function': 'troponin T, fast skeletal'}, ], } TISSUES = ['Heart', 'Hippocampus', 'Liver', 'Soleus'] # ============================================================================= # EXAMPLE GENERATORS # ============================================================================= def generate_examples(): examples = [] # ========================================================================= # CATEGORY 1: FACTUAL RECALL (~60 examples) # ========================================================================= # Basic DEG counts for each stressor × tissue for tissue in TISSUES: for stressor, count in STRESSOR_EFFECTS[tissue].items(): stressor_name = {'HU': 'hindlimb unloading', 'IR': 'ionizing radiation', 'HU_IR': 'combined HU+IR stress'}[stressor] examples.append({ "instruction": f"How many DEGs does {stressor_name} cause in {tissue.lower()}?", "input": "", "output": f"{stressor_name.capitalize()} causes {count:,} differentially expressed genes (padj < 0.05) in {tissue.lower()}." }) # DEG direction for tissue in TISSUES: dirs = STRESSOR_DIRECTION[tissue]['HU_IR'] examples.append({ "instruction": f"What is the direction of gene expression changes in {tissue.lower()} under combined stress?", "input": "", "output": f"Under combined HU+IR stress, {tissue.lower()} shows {dirs['up']:,} upregulated and {dirs['down']:,} downregulated genes. {'Downregulation predominates' if dirs['down'] > dirs['up'] else 'Upregulation predominates' if dirs['up'] > dirs['down'] else 'Changes are balanced'} ({dirs['up']/(dirs['up']+dirs['down'])*100:.1f}% upregulated)." }) # KMP effects for tissue in TISSUES: for context, count in KMP_EFFECTS[tissue].items(): context_name = {'baseline': 'at baseline', 'in_HU': 'under HU stress', 'in_IR': 'under IR stress', 'in_HU_IR': 'under combined HU+IR stress'}[context] examples.append({ "instruction": f"How many DEGs does KMP produce in {tissue.lower()} {context_name}?", "input": "", "output": f"KMP produces {count:,} DEGs in {tissue.lower()} {context_name}. {f'This indicates strong stress-activated response.' if context == 'in_HU_IR' and count > 100 else f'This indicates minimal/blocked response.' if count < 10 else ''}" }) # Tissue type classification for tissue, ttype in TISSUE_TYPES.items(): examples.append({ "instruction": f"What is the KMP response classification for {tissue.lower()}?", "input": "", "output": f"{tissue} is classified as {ttype}. This means KMP shows {'minimal baseline effect but strong stress-activated protection' if 'stress-activated' in ttype else 'strong baseline effect that diminishes under stress' if 'baseline-active' in ttype else 'effects that are blocked by stress pathways'}." }) # Interaction effects for tissue in TISSUES: for interaction, count in INTERACTIONS[tissue].items(): int_name = interaction.replace('_x_', ' × ').replace('_', ' ') examples.append({ "instruction": f"What is the {int_name} interaction effect in {tissue.lower()}?", "input": "", "output": f"The {int_name} interaction in {tissue.lower()} produces {count:,} DEGs. {'This is the largest interaction in the dataset.' if count > 5000 else 'This indicates substantial non-additive effects.' if count > 500 else 'This indicates moderate interaction.'}" }) # OXPHOS patterns for tissue, pattern in OXPHOS_PATTERNS.items(): examples.append({ "instruction": f"Describe the OXPHOS pathway response in {tissue.lower()}.", "input": "", "output": f"In {tissue.lower()}, combined stress produces OXPHOS NES = {pattern['stress_NES']:.2f}, while KMP shifts this to NES = {pattern['KMP_NES']:.2f}. Pattern: {pattern['pattern']}. {'KMP restores suppressed mitochondrial function.' if pattern['pattern'] == 'RESCUE' else 'KMP prevents metabolic over-activation (hepatoprotective braking).' if pattern['pattern'] == 'SUPPRESSION' else 'Mixed effects.'}" }) # Hub genes for tissue, genes in HUB_GENES.items(): for gene_info in genes: examples.append({ "instruction": f"What is the role of {gene_info['gene']} in {tissue.lower()} stress response?", "input": "", "output": f"{gene_info['gene']} ({gene_info['function']}) is a hub gene in {tissue.lower()} with average |log2FC| = {gene_info['lfc']:.2f}. It appears significantly changed across multiple comparisons, indicating a central role in the stress/KMP response network." }) # ========================================================================= # CATEGORY 2: COMPARISON QUESTIONS (~40 examples) # ========================================================================= # Which tissue most sensitive to each stressor examples.append({ "instruction": "Rank tissues by sensitivity to hindlimb unloading.", "input": "", "output": f"Tissues ranked by HU sensitivity (DEG count):\n1. Soleus: {STRESSOR_EFFECTS['Soleus']['HU']:,}\n2. Liver: {STRESSOR_EFFECTS['Liver']['HU']:,}\n3. Hippocampus: {STRESSOR_EFFECTS['Hippocampus']['HU']:,}\n4. Heart: {STRESSOR_EFFECTS['Heart']['HU']}\n\nSoleus dominates due to severe muscle atrophy from unloading." }) examples.append({ "instruction": "Rank tissues by sensitivity to ionizing radiation.", "input": "", "output": f"Tissues ranked by IR sensitivity (DEG count):\n1. Hippocampus: {STRESSOR_EFFECTS['Hippocampus']['IR']:,}\n2. Liver: {STRESSOR_EFFECTS['Liver']['IR']:,}\n3. Heart: {STRESSOR_EFFECTS['Heart']['IR']}\n4. Soleus: {STRESSOR_EFFECTS['Soleus']['IR']}\n\nHippocampus shows extreme IR sensitivity (166× more than soleus)." }) # Cross-tissue KMP comparisons for t1, t2 in [('Heart', 'Liver'), ('Heart', 'Soleus'), ('Hippocampus', 'Liver'), ('Soleus', 'Liver')]: examples.append({ "instruction": f"Compare KMP effects between {t1.lower()} and {t2.lower()} under stress.", "input": "", "output": f"""{t1} ({TISSUE_TYPES[t1]}): {KMP_EFFECTS[t1]['in_HU_IR']:,} DEGs under stress {t2} ({TISSUE_TYPES[t2]}): {KMP_EFFECTS[t2]['in_HU_IR']:,} DEGs under stress {'Both show stress-activated patterns.' if 'stress-activated' in TISSUE_TYPES[t1] and 'stress-activated' in TISSUE_TYPES[t2] else 'These tissues show opposite pharmacodynamic patterns, indicating tissue-specific KMP mechanisms.'}""" }) # Stressor dominance classification examples.append({ "instruction": "Classify tissues as HU-dominant or IR-dominant.", "input": "", "output": f"""**HU-dominant** (mechanical stress sensitivity): - Heart: {STRESSOR_EFFECTS['Heart']['HU']} HU vs {STRESSOR_EFFECTS['Heart']['IR']} IR DEGs (ratio: {STRESSOR_EFFECTS['Heart']['HU']/max(1,STRESSOR_EFFECTS['Heart']['IR']):.1f}×) - Soleus: {STRESSOR_EFFECTS['Soleus']['HU']:,} HU vs {STRESSOR_EFFECTS['Soleus']['IR']} IR DEGs (ratio: {STRESSOR_EFFECTS['Soleus']['HU']/max(1,STRESSOR_EFFECTS['Soleus']['IR']):.1f}×) **IR-dominant** (radiation sensitivity): - Hippocampus: {STRESSOR_EFFECTS['Hippocampus']['HU']:,} HU vs {STRESSOR_EFFECTS['Hippocampus']['IR']:,} IR DEGs (ratio: {STRESSOR_EFFECTS['Hippocampus']['IR']/max(1,STRESSOR_EFFECTS['Hippocampus']['HU']):.1f}×) **Both stressors** (synergistic): - Liver: {STRESSOR_EFFECTS['Liver']['HU']:,} HU vs {STRESSOR_EFFECTS['Liver']['IR']:,} IR DEGs""" }) # Compare interaction magnitudes examples.append({ "instruction": "Which tissue shows the largest KMP × stressor interaction?", "input": "", "output": f"""KMP × HU interactions ranked: 1. Soleus: {INTERACTIONS['Soleus']['KMP_x_HU']:,} DEGs (largest in dataset) 2. Liver: {INTERACTIONS['Liver']['KMP_x_HU']:,} 3. Heart: {INTERACTIONS['Heart']['KMP_x_HU']} 4. Hippocampus: {INTERACTIONS['Hippocampus']['KMP_x_HU']} The massive soleus interaction indicates KMP specifically engages stress pathways in mechanically unloaded muscle.""" }) # Compare OXPHOS patterns examples.append({ "instruction": "Compare OXPHOS responses across tissues.", "input": "", "output": f"""OXPHOS pathway responses: **RESCUE pattern** (Stress↓ → KMP↑): - Heart: {OXPHOS_PATTERNS['Heart']['stress_NES']:.1f} → {OXPHOS_PATTERNS['Heart']['KMP_NES']:.1f} - Soleus: {OXPHOS_PATTERNS['Soleus']['stress_NES']:.1f} → {OXPHOS_PATTERNS['Soleus']['KMP_NES']:.1f} **SUPPRESSION pattern** (Stress↑ → KMP↓): - Liver: {OXPHOS_PATTERNS['Liver']['stress_NES']:.1f} → {OXPHOS_PATTERNS['Liver']['KMP_NES']:.1f} Contractile tissues need OXPHOS restoration; liver needs metabolic braking.""" }) # More specific comparisons for pathway in ['OXIDATIVE_PHOSPHORYLATION', 'FATTY_ACID_METABOLISM']: examples.append({ "instruction": f"Compare {pathway.replace('_', ' ').lower()} pathway response between heart and liver.", "input": "", "output": f"""Heart: Stress NES = {PATHWAY_DATA['Heart'][pathway]['stress']:.2f}, KMP NES = {PATHWAY_DATA['Heart'][pathway]['kmp']:.2f} → {PATHWAY_DATA['Heart'][pathway]['pattern']} Liver: Stress NES = {PATHWAY_DATA['Liver'].get(pathway, {'stress': 'N/A', 'kmp': 'N/A', 'pattern': 'N/A'})['stress']}, KMP NES = {PATHWAY_DATA['Liver'].get(pathway, {'stress': 'N/A', 'kmp': 'N/A', 'pattern': 'N/A'})['kmp']} → {PATHWAY_DATA['Liver'].get(pathway, {'stress': 'N/A', 'kmp': 'N/A', 'pattern': 'N/A'})['pattern']} These opposite patterns reflect different metabolic requirements.""" }) # ========================================================================= # CATEGORY 3: INTERACTION PREDICTION (~50 examples) # ========================================================================= # Predict combined stress from main effects for tissue in TISSUES: hu = STRESSOR_EFFECTS[tissue]['HU'] ir = STRESSOR_EFFECTS[tissue]['IR'] combined = STRESSOR_EFFECTS[tissue]['HU_IR'] examples.append({ "instruction": f"Given HU causes {hu:,} DEGs and IR causes {ir:,} DEGs in {tissue.lower()}, predict combined effect.", "input": f"Main effects in {tissue.lower()}:\n- HU: {hu:,} DEGs\n- IR: {ir:,} DEGs", "output": f"""Prediction approach: Simple addition suggests {hu + ir:,} DEGs maximum. Actual: {combined:,} DEGs Analysis: {'HU-dominated response; IR adds minimal contribution' if ir < hu * 0.1 else 'IR-dominated response' if ir > hu * 3 else 'Both stressors contribute'}. {'Sub-additive (pathway overlap)' if combined < hu + ir * 0.8 else 'Approximately additive' if combined < (hu + ir) * 1.2 else 'Super-additive (synergy)'}.""" }) # Predict KMP stress effect from baseline for tissue in TISSUES: baseline = KMP_EFFECTS[tissue]['baseline'] stressed = KMP_EFFECTS[tissue]['in_HU_IR'] examples.append({ "instruction": f"KMP produces {baseline} DEGs at baseline in {tissue.lower()}. Predict effect under combined stress.", "input": f"KMP baseline effect: {baseline} DEGs\nTissue type: {TISSUE_TYPES[tissue]}", "output": f"""Prediction based on tissue type ({TISSUE_TYPES[tissue]}): {'Expect INCREASED effect (stress-activated)' if 'stress-activated' in TISSUE_TYPES[tissue] else 'Expect DECREASED effect (stress-blocked)' if 'stress-blocked' in TISSUE_TYPES[tissue] else 'Expect DECREASED effect (baseline-active)'} Actual: {stressed:,} DEGs Ratio: {stressed/max(1, baseline):.1f}× {'increase' if stressed > baseline else 'decrease'} This {'confirms' if (stressed > baseline and 'stress-activated' in TISSUE_TYPES[tissue]) or (stressed < baseline and 'stress-activated' not in TISSUE_TYPES[tissue]) else 'matches'} the {TISSUE_TYPES[tissue]} classification.""" }) # Cross-tissue prediction examples.append({ "instruction": "Heart shows stress-activated KMP response (112→2,110 DEGs). Predict soleus pattern.", "input": "Both are striated muscle with similar OXPHOS dependence.", "output": f"""Prediction: Soleus should also show stress-activated pattern (Type A). Reasoning: 1. Both are contractile tissues requiring ATP for function 2. Both show HU sensitivity (mechanical stress) 3. Similar mitochondrial dependence Actual: Soleus is Type A - Baseline: {KMP_EFFECTS['Soleus']['baseline']} DEGs - Stressed: {KMP_EFFECTS['Soleus']['in_HU_IR']} DEGs Confirmed: Both contractile tissues show stress-activated KMP pharmacodynamics with OXPHOS rescue.""" }) examples.append({ "instruction": "Given liver is Type C (stress-blocked), predict hippocampus response pattern.", "input": "Both are non-contractile tissues.", "output": f"""Prediction: Cannot assume same pattern—tissue function differs. Liver (metabolic organ): Type C (stress-blocked) - Baseline: {KMP_EFFECTS['Liver']['baseline']} DEGs - Stressed: {KMP_EFFECTS['Liver']['in_HU_IR']} DEGs Hippocampus (neural tissue): Actually Type B (baseline-active) - Baseline: {KMP_EFFECTS['Hippocampus']['baseline']:,} DEGs - Stressed: {KMP_EFFECTS['Hippocampus']['in_HU_IR']} DEGs Key insight: Non-contractile tissues don't share a common pattern. Must test each tissue type independently.""" }) # OXPHOS prediction examples.append({ "instruction": "Heart shows OXPHOS rescue (NES: -2.3→+3.7). Predict liver pattern.", "input": "Liver has different metabolic role (detoxification vs pumping).", "output": f"""Prediction: Opposite pattern (SUPPRESSION) Reasoning: 1. Heart stress SUPPRESSES OXPHOS → KMP restores 2. Liver may ACTIVATE OXPHOS under stress (compensatory) 3. "Rescue" in liver = prevent over-activation Actual: - Liver stress OXPHOS: NES = +3.6 (ACTIVATED, opposite to heart) - Liver KMP OXPHOS: NES = -1.6 (SUPPRESSES) - Pattern: SUPPRESSION (hepatoprotective braking) Confirmed: Metabolic context determines whether "rescue" means restoration or prevention.""" }) # Interaction magnitude prediction examples.append({ "instruction": "Heart KMP×HU interaction = 479 DEGs. Soleus has 40× larger HU main effect. Predict soleus KMP×HU.", "input": f"Heart HU effect: {STRESSOR_EFFECTS['Heart']['HU']} DEGs\nSoleus HU effect: {STRESSOR_EFFECTS['Soleus']['HU']:,} DEGs", "output": f"""Prediction: Much larger interaction, possibly 5,000-15,000 DEGs Reasoning: 1. Soleus is ~40× more HU-sensitive than heart 2. Both are Type A (stress-activated) 3. Larger main effect often → larger interaction potential 4. Soleus baseline KMP = 0, so ALL effects are stress-dependent Actual: {INTERACTIONS['Soleus']['KMP_x_HU']:,} DEGs This is {INTERACTIONS['Soleus']['KMP_x_HU']/INTERACTIONS['Heart']['KMP_x_HU']:.0f}× larger than heart—the biggest interaction in the dataset.""" }) # More interaction predictions for tissue in TISSUES: examples.append({ "instruction": f"Predict whether {tissue.lower()} will show larger KMP×HU or KMP×IR interaction.", "input": f"HU effect: {STRESSOR_EFFECTS[tissue]['HU']:,} DEGs\nIR effect: {STRESSOR_EFFECTS[tissue]['IR']:,} DEGs", "output": f"""Prediction: {'KMP×HU larger (HU-dominant tissue)' if STRESSOR_EFFECTS[tissue]['HU'] > STRESSOR_EFFECTS[tissue]['IR'] * 3 else 'KMP×IR larger (IR-dominant tissue)' if STRESSOR_EFFECTS[tissue]['IR'] > STRESSOR_EFFECTS[tissue]['HU'] * 3 else 'Comparable interactions expected'} Actual: - KMP×HU: {INTERACTIONS[tissue]['KMP_x_HU']:,} DEGs - KMP×IR: {INTERACTIONS[tissue]['KMP_x_IR']:,} DEGs - Ratio: {INTERACTIONS[tissue]['KMP_x_HU']/max(1, INTERACTIONS[tissue]['KMP_x_IR']):.1f}× {'Confirmed: Interaction follows main effect dominance.' if (INTERACTIONS[tissue]['KMP_x_HU'] > INTERACTIONS[tissue]['KMP_x_IR'] and STRESSOR_EFFECTS[tissue]['HU'] > STRESSOR_EFFECTS[tissue]['IR']) or (INTERACTIONS[tissue]['KMP_x_IR'] > INTERACTIONS[tissue]['KMP_x_HU'] and STRESSOR_EFFECTS[tissue]['IR'] > STRESSOR_EFFECTS[tissue]['HU']) else 'Note: Interaction pattern differs from main effect dominance.'}""" }) # ========================================================================= # CATEGORY 4: DESIGN CRITIQUE (~30 examples) # ========================================================================= examples.append({ "instruction": "Critique: Testing KMP only at baseline to assess spaceflight protection potential.", "input": "", "output": f"""**Major flaw**: Would miss KMP's primary protective mechanism. Evidence: - Heart: {KMP_EFFECTS['Heart']['baseline']} DEGs baseline vs {KMP_EFFECTS['Heart']['in_HU_IR']:,} stressed - Soleus: {KMP_EFFECTS['Soleus']['baseline']} DEGs baseline vs {KMP_EFFECTS['Soleus']['in_HU_IR']} stressed Baseline-only testing would conclude "KMP has no effect" when it actually shows massive stress-activated protection. **Recommendation**: Always include stress conditions in countermeasure testing.""" }) examples.append({ "instruction": "Critique: Using only heart data to predict KMP effects in all tissues.", "input": "", "output": f"""**Major flaw**: Tissue responses are fundamentally different. Heart (Type A): {KMP_EFFECTS['Heart']['in_HU_IR']:,} DEGs under stress, OXPHOS rescue Liver (Type C): {KMP_EFFECTS['Liver']['in_HU_IR']} DEGs under stress, OXPHOS suppression Hippocampus (Type B): {KMP_EFFECTS['Hippocampus']['in_HU_IR']} DEGs under stress Extrapolating heart→liver would predict RESCUE when actual pattern is SUPPRESSION. **Recommendation**: Test each target tissue independently.""" }) examples.append({ "instruction": "Critique: Using |log2FC| > 2 threshold for DEG detection.", "input": "", "output": """**Potential issue**: May miss coordinated small changes with large pathway effects. Our analysis shows: - Many hub genes have |log2FC| < 2 (e.g., Eda2r: 0.75, Cdkn1a: 1.12) - OXPHOS rescue (NES shift +6.0) occurs through many genes with modest individual changes - Pathway-level effects can be dramatic even with small per-gene changes **Recommendation**: - Use padj-only for discovery - Report at multiple thresholds - Always include pathway analysis - Don't conclude "no effect" from stringent gene-level thresholds alone.""" }) examples.append({ "instruction": "Critique: Single time-point measurement for KMP efficacy.", "input": "", "output": """**Limitation**: Cannot assess temporal dynamics. Missing information: 1. Onset timing (when does KMP effect begin?) 2. Duration (how long does protection last?) 3. Recovery trajectory 4. Optimal treatment window For spaceflight: - Missions last months - Single time-point may miss transient effects - Recovery kinetics matter for crew health **Recommendation**: Time-course studies with multiple sampling points.""" }) examples.append({ "instruction": "Critique: Concluding KMP is 'ineffective' in liver based on 3 DEGs under stress.", "input": "", "output": f"""**Flawed conclusion**: Low DEG count ≠ no effect. Observations: 1. Liver shows {KMP_EFFECTS['Liver']['in_HU_IR']} DEGs under stress (seemingly minimal) 2. BUT: KMP×HU interaction = {INTERACTIONS['Liver']['KMP_x_HU']:,} DEGs (substantial!) 3. OXPHOS pathway shows clear SUPPRESSION pattern What's happening: - KMP effect is MASKED by stress, not absent - Pathway-level modulation still occurs - Different pharmacodynamic mechanism (prevention vs activation) **Correct interpretation**: Liver shows stress-blocked KMP transcriptional response, but pathway modulation still occurs through different mechanisms.""" }) examples.append({ "instruction": "Evaluate: Proposing KMP human trials based solely on mouse data.", "input": "", "output": """**Assessment**: Premature without additional studies. What mouse data provides: ✓ Proof of concept (stress-activated protection) ✓ Tissue-specific patterns identified ✓ Mechanistic pathway data ✓ Dose-response in mice What's still needed: ✗ Human pharmacokinetics ✗ Human tissue responses ✗ True microgravity effects (HU is analog) ✗ Long-duration effects ✗ Safety profile ✗ Drug-drug interactions **Recommended path**: 1. Human cell/organoid studies 2. PK modeling 3. Ground-based human analog studies 4. Then Phase I trials Mouse data provides rationale, not sufficient evidence for direct human trials.""" }) # ========================================================================= # CATEGORY 5: MECHANISTIC REASONING (~30 examples) # ========================================================================= examples.append({ "instruction": "Explain why soleus shows 0 KMP DEGs at baseline but 491 under stress.", "input": "", "output": """This "stress-activated" pattern indicates conditional pharmacodynamics: **Mechanism hypotheses**: 1. **Target availability**: KMP receptors/targets may be expressed only under stress - Mechanical unloading activates mechanosensitive gene programs - Creates binding sites for KMP-responsive transcription factors 2. **Pathway priming**: Stress activates pathways that synergize with KMP - Oxidative stress → antioxidant response elements accessible - Inflammation → NF-κB pathway engagement 3. **Metabolic context**: Mitochondrial dysfunction creates KMP targets - Normal mitochondria: No dysfunction to rescue - Stressed mitochondria: OXPHOS suppression creates rescue opportunity **Implication**: KMP is a conditional countermeasure—active when needed, dormant otherwise. This is advantageous: minimal side effects in healthy tissue.""" }) examples.append({ "instruction": "Why does liver show OXPHOS SUPPRESSION while heart shows OXPHOS RESCUE?", "input": "", "output": f"""Opposite patterns reflect different metabolic contexts: **Heart (RESCUE)**: - Function: Continuous contraction requires ATP - Stress effect: OXPHOS suppressed (NES = {OXPHOS_PATTERNS['Heart']['stress_NES']:.1f}) - Problem: Energy deficit → contractile failure - KMP solution: Restore OXPHOS (NES → {OXPHOS_PATTERNS['Heart']['KMP_NES']:.1f}) **Liver (SUPPRESSION)**: - Function: Metabolic processing, detoxification - Stress effect: OXPHOS activated (NES = {OXPHOS_PATTERNS['Liver']['stress_NES']:.1f}) - Problem: Over-activation → ROS, oxidative damage - KMP solution: Brake metabolism (NES → {OXPHOS_PATTERNS['Liver']['KMP_NES']:.1f}) **Key insight**: "Protection" means different things: - Heart: Restore lost function - Liver: Prevent harmful over-activation This is why cross-tissue drug extrapolation is dangerous without mechanistic understanding.""" }) examples.append({ "instruction": "Explain the biological significance of the 8,484 DEG KMP×HU interaction in soleus.", "input": "", "output": f"""This massive interaction ({INTERACTIONS['Soleus']['KMP_x_HU']:,} DEGs) is the largest in the dataset. **What it means mathematically**: - KMP alone: {KMP_EFFECTS['Soleus']['baseline']} DEGs - HU alone: {STRESSOR_EFFECTS['Soleus']['HU']:,} DEGs (many genes, but different set) - Interaction: 8,484 DEGs that are NEITHER KMP-only NOR HU-only **Biological interpretation**: 1. **Emergent pharmacology**: KMP creates effects that ONLY exist in unloaded muscle 2. **Mechanosensitive activation**: Unloading activates pathways that: - Create KMP binding sites - Express KMP target genes - Enable KMP transcriptional effects 3. **Myokine signaling**: Unloaded muscle secretes different myokines that may synergize with KMP 4. **Mitochondrial crosstalk**: KMP's mitochondrial effects only manifest when organelles are stressed **Therapeutic implication**: This interaction is exactly what you want—a countermeasure that specifically engages when the tissue is stressed, not when it's healthy.""" }) examples.append({ "instruction": "Why do circadian genes (Arntl, Per2, Npas2) appear as hub genes in heart?", "input": "", "output": """Circadian genes as hub genes suggests spaceflight disrupts daily rhythms: **Biological connection**: 1. **Circadian-metabolic coupling**: - Heart metabolism is circadian-regulated - BMAL1 (Arntl) controls mitochondrial biogenesis - PER2 regulates fatty acid metabolism 2. **Spaceflight disruption**: - Altered light cycles - Shifted sleep patterns - Stress hormone rhythms disrupted 3. **KMP circadian effects**: - Flavonoids can modulate circadian machinery - May help resynchronize metabolic rhythms - OXPHOS rescue could occur through circadian normalization **Implication**: KMP's cardiac protection may involve circadian rhythm restoration, not just direct metabolic effects. This suggests timing of KMP administration might matter.""" }) # ========================================================================= # CATEGORY 6: UNCERTAINTY CALIBRATION (~25 examples) # ========================================================================= examples.append({ "instruction": "What is the effect of KMP on cardiac contractility?", "input": "", "output": """**Cannot directly answer from transcriptomic data.** What data shows: - 2,110 DEGs in heart under stress - OXPHOS rescue (NES +6.0 shift) - Circadian gene normalization What data does NOT show: - Actual force generation - Calcium handling - Protein-level changes - Ejection fraction **Reasonable inference** (with uncertainty): - OXPHOS rescue suggests improved ATP availability - ATP is required for contraction - Therefore contractility MIGHT improve **Confidence**: Low for functional claims, high for transcriptional patterns **Would need**: Echocardiography, isolated heart studies, or cardiomyocyte assays.""" }) examples.append({ "instruction": "Will KMP prevent muscle atrophy in astronauts?", "input": "", "output": """**Cannot confidently answer this question.** **Supportive mouse data**: ✓ 491 DEGs in soleus under stress (stress-activated) ✓ 8,484 DEG KMP×HU interaction ✓ OXPHOS rescue pattern ✓ 86.6% upregulated genes (anabolic signature) **Unknown factors**: ✗ Human pharmacokinetics ✗ Required human dose ✗ True microgravity vs HU analog ✗ 6+ month mission duration effects ✗ Translation of transcriptomics to muscle mass **My assessment**: - Strong mechanistic rationale: YES - Sufficient evidence for human use: NO Recommended next steps: Human muscle organoids, dose-finding, analog studies.""" }) examples.append({ "instruction": "Is the exact DEG count of 2,110 in heart reliable?", "input": "", "output": """**The number is precise but context-dependent.** 2,110 is: - Exact count at padj < 0.05 threshold - Specific to this analysis pipeline - Reproducible with same parameters Sources of variation: 1. Threshold choice (log2FC filter would change count) 2. Normalization method affects p-values 3. Multiple testing correction method 4. Biological variation between replicates **Appropriate statements**: ✓ "Approximately 2,100 DEGs" ✓ "About 2,000 significantly changed genes" ✗ "Exactly 2,110 is the true biological count" The order of magnitude and relative ranking across tissues is robust; exact number is method-dependent.""" }) examples.append({ "instruction": "Which gene is most important for KMP's protective effect?", "input": "", "output": """**Cannot definitively answer from transcriptomic data.** **Why we can't identify "most important"**: 1. Large change ≠ causal role (correlation ≠ causation) 2. Pathway redundancy (multiple genes, same function) 3. No functional validation (knockdown/knockout) 4. Tissue-specificity (different genes in different tissues) **What we CAN say**: - Hub genes show largest/most consistent effects - Heart: Cdkn1a, Arntl appear frequently - Pathway level: OXPHOS genes collectively important **To identify causal genes would need**: - CRISPR screening - Pharmacological inhibitors - Time-course (early responders) - Cross-species conservation Transcriptomics identifies candidates; functional studies identify mechanisms.""" }) examples.append({ "instruction": "What dose of KMP should humans take?", "input": "", "output": """**This data cannot inform human dosing.** **What's missing**: 1. Dose-response relationship (only one dose tested) 2. Human pharmacokinetics (ADME) 3. Therapeutic window (efficacy vs toxicity) 4. Bioavailability differences 5. Target tissue concentrations **Why mouse→human scaling is unreliable**: - Metabolic rate differences - Body surface area scaling often off by 10-100× - Species-specific drug metabolism - Individual human variation **Required studies for dosing**: 1. Mouse dose-response 2. PK modeling 3. Human cell studies 4. Phase I safety/PK 5. Then efficacy trials **Do not extrapolate**: Mechanism supports development; dosing requires dedicated PK/PD work.""" }) # ========================================================================= # ADDITIONAL EXAMPLES TO REACH 200+ # ========================================================================= # Specific pathway questions for tissue in ['Heart', 'Liver']: for pathway, data in PATHWAY_DATA.get(tissue, {}).items(): examples.append({ "instruction": f"What is the {pathway.replace('_', ' ')} NES in {tissue.lower()} under stress?", "input": "", "output": f"Under combined HU+IR stress, {tissue.lower()} shows {pathway.replace('_', ' ')} NES = {data['stress']:.2f}. After KMP treatment, NES shifts to {data['kmp']:.2f}. Pattern: {data['pattern']}." }) # Binary comparison questions comparisons = [ ("Heart", "Soleus", "HU sensitivity", lambda t: STRESSOR_EFFECTS[t]['HU']), ("Heart", "Hippocampus", "IR sensitivity", lambda t: STRESSOR_EFFECTS[t]['IR']), ("Liver", "Soleus", "KMP effect under stress", lambda t: KMP_EFFECTS[t]['in_HU_IR']), ] for t1, t2, metric, func in comparisons: v1, v2 = func(t1), func(t2) examples.append({ "instruction": f"Which has higher {metric}: {t1.lower()} or {t2.lower()}?", "input": "", "output": f"{t1 if v1 > v2 else t2} has higher {metric} ({max(v1,v2):,} vs {min(v1,v2):,} DEGs)." }) # Yes/No questions yn_questions = [ ("Is heart more sensitive to HU than IR?", STRESSOR_EFFECTS['Heart']['HU'] > STRESSOR_EFFECTS['Heart']['IR'], f"Yes. Heart shows {STRESSOR_EFFECTS['Heart']['HU']} HU DEGs vs {STRESSOR_EFFECTS['Heart']['IR']} IR DEGs."), ("Does KMP show stress-activated response in liver?", False, f"No. Liver is Type C (stress-blocked): {KMP_EFFECTS['Liver']['baseline']} DEGs at baseline → {KMP_EFFECTS['Liver']['in_HU_IR']} under stress."), ("Is the KMP×HU interaction larger than KMP×IR in soleus?", INTERACTIONS['Soleus']['KMP_x_HU'] > INTERACTIONS['Soleus']['KMP_x_IR'], f"Yes. KMP×HU = {INTERACTIONS['Soleus']['KMP_x_HU']:,} vs KMP×IR = {INTERACTIONS['Soleus']['KMP_x_IR']} DEGs."), ("Does hippocampus show OXPHOS rescue?", False, "No. Hippocampus shows minimal stress effect on OXPHOS (NES = 0.93, NS). Cannot rescue what isn't suppressed."), ] for q, answer, explanation in yn_questions: examples.append({ "instruction": q, "input": "", "output": explanation }) return examples def format_for_training(examples): """Format examples for SFT training.""" formatted = [] for ex in examples: if ex.get('input'): text = f"""### Instruction: {ex['instruction']} ### Input: {ex['input']} ### Response: {ex['output']}""" else: text = f"""### Instruction: {ex['instruction']} ### Response: {ex['output']}""" formatted.append({"text": text}) return formatted def main(): print("Generating expanded SFT dataset...") examples = generate_examples() formatted = format_for_training(examples) # Save with open('kmp_sft_dataset.json', 'w') as f: json.dump(formatted, f, indent=2) print(f"\n{'='*60}") print(f"SFT Dataset Summary") print(f"{'='*60}") print(f"Total examples: {len(formatted)}") print(f"Output: kmp_sft_dataset.json") # Count by approximate category (based on keywords) categories = { 'Factual': 0, 'Comparison': 0, 'Prediction': 0, 'Critique': 0, 'Mechanistic': 0, 'Calibration': 0 } for ex in examples: inst = ex['instruction'].lower() if 'how many' in inst or 'what is the' in inst or 'describe' in inst: categories['Factual'] += 1 elif 'compare' in inst or 'rank' in inst or 'which' in inst: categories['Comparison'] += 1 elif 'predict' in inst or 'given' in inst: categories['Prediction'] += 1 elif 'critique' in inst or 'evaluate' in inst: categories['Critique'] += 1 elif 'explain' in inst or 'why' in inst: categories['Mechanistic'] += 1 else: categories['Calibration'] += 1 print(f"\nApproximate category breakdown:") for cat, count in categories.items(): print(f" - {cat}: {count}") if __name__ == "__main__": main()