| |
| """ |
| BioRLHF Expanded SFT Dataset Generator |
Creates roughly 120 instruction-tuning examples from KMP data
| """ |
|
|
| import json |
| import random |
|
|
| |
| |
| |
|
|
# ---------------------------------------------------------------------------
# Reference data tables read by generate_examples()
# ---------------------------------------------------------------------------

# Differentially expressed gene (DEG) counts per tissue for each stressor:
# HU (hindlimb unloading), IR (ionizing radiation), HU_IR (both combined).
STRESSOR_EFFECTS = {
    'Heart': {'HU': 165, 'IR': 33, 'HU_IR': 910},
    'Hippocampus': {'HU': 1555, 'IR': 5477, 'HU_IR': 5510},
    'Liver': {'HU': 4110, 'IR': 1273, 'HU_IR': 6213},
    'Soleus': {'HU': 6425, 'IR': 67, 'HU_IR': 6830},
}


# Up-/down-regulated split of the DEG counts in STRESSOR_EFFECTS.
STRESSOR_DIRECTION = {
    'Heart': {'HU': {'up': 67, 'down': 98}, 'IR': {'up': 17, 'down': 16}, 'HU_IR': {'up': 334, 'down': 576}},
    'Hippocampus': {'HU': {'up': 711, 'down': 844}, 'IR': {'up': 2554, 'down': 2923}, 'HU_IR': {'up': 2523, 'down': 2987}},
    'Liver': {'HU': {'up': 2189, 'down': 1921}, 'IR': {'up': 413, 'down': 860}, 'HU_IR': {'up': 2429, 'down': 3784}},
    'Soleus': {'HU': {'up': 3251, 'down': 3174}, 'IR': {'up': 28, 'down': 39}, 'HU_IR': {'up': 3447, 'down': 3383}},
}


# DEG counts produced by KMP at baseline and within each stress context.
KMP_EFFECTS = {
    'Heart': {'baseline': 112, 'in_HU': 2, 'in_IR': 2, 'in_HU_IR': 2110},
    'Hippocampus': {'baseline': 4110, 'in_HU': 1, 'in_IR': 243, 'in_HU_IR': 140},
    'Liver': {'baseline': 309, 'in_HU': 17, 'in_IR': 389, 'in_HU_IR': 3},
    'Soleus': {'baseline': 0, 'in_HU': 1, 'in_IR': 52, 'in_HU_IR': 491},
}


# DEG counts for the pairwise interaction terms.
INTERACTIONS = {
    'Heart': {'HU_x_IR': 244, 'KMP_x_HU': 479, 'KMP_x_IR': 29},
    'Hippocampus': {'HU_x_IR': 93, 'KMP_x_HU': 36, 'KMP_x_IR': 1221},
    'Liver': {'HU_x_IR': 3210, 'KMP_x_HU': 3369, 'KMP_x_IR': 247},
    'Soleus': {'HU_x_IR': 211, 'KMP_x_HU': 8484, 'KMP_x_IR': 484},
}


# KMP response classification per tissue. The substring in parentheses
# ('stress-activated', 'baseline-active', 'stress-blocked') is what the
# generator code tests against.
TISSUE_TYPES = {
    'Heart': 'Type A (stress-activated)',
    'Soleus': 'Type A (stress-activated)',
    'Hippocampus': 'Type B (baseline-active)',
    'Liver': 'Type C (stress-blocked)',
}


# OXPHOS enrichment scores (NES) under stress and with KMP, plus a pattern label.
OXPHOS_PATTERNS = {
    'Heart': {'stress_NES': -2.302, 'KMP_NES': 3.691, 'pattern': 'RESCUE'},
    'Hippocampus': {'stress_NES': 0.931, 'KMP_NES': 1.585, 'pattern': 'NS'},
    'Liver': {'stress_NES': 3.596, 'KMP_NES': -1.6, 'pattern': 'SUPPRESSION'},
    'Soleus': {'stress_NES': -2.997, 'KMP_NES': 2.46, 'pattern': 'RESCUE'},
}


# Per-pathway NES values. Not every tissue/pathway combination is present.
PATHWAY_DATA = {
    'Heart': {
        'OXIDATIVE_PHOSPHORYLATION': {'stress': -2.302, 'kmp': 3.691, 'pattern': 'RESCUE'},
        'FATTY_ACID_METABOLISM': {'stress': -2.371, 'kmp': 3.1, 'pattern': 'RESCUE'},
        'ADIPOGENESIS': {'stress': -1.839, 'kmp': 2.81, 'pattern': 'RESCUE'},
        'MTORC1_SIGNALING': {'stress': -1.662, 'kmp': 2.585, 'pattern': 'RESCUE'},
        'INTERFERON_ALPHA_RESPONSE': {'stress': -2.072, 'kmp': 1.581, 'pattern': 'RESCUE'},
    },
    'Liver': {
        'OXIDATIVE_PHOSPHORYLATION': {'stress': 3.596, 'kmp': -1.6, 'pattern': 'SUPPRESSION'},
        'MTORC1_SIGNALING': {'stress': 3.075, 'kmp': -1.678, 'pattern': 'SUPPRESSION'},
        'INTERFERON_GAMMA_RESPONSE': {'stress': 1.542, 'kmp': -2.336, 'pattern': 'SUPPRESSION'},
    },
    'Soleus': {
        'OXIDATIVE_PHOSPHORYLATION': {'stress': -2.997, 'kmp': 2.46, 'pattern': 'RESCUE'},
        'FATTY_ACID_METABOLISM': {'stress': -2.418, 'kmp': 1.506, 'pattern': 'RESCUE'},
    }
}


# Hub genes per tissue; 'lfc' is reported in the generated text as the
# average |log2FC|.
HUB_GENES = {
    'Heart': [
        {'gene': 'Alb', 'lfc': 4.26, 'function': 'albumin, carrier protein'},
        {'gene': 'Eda2r', 'lfc': 0.75, 'function': 'ectodysplasin receptor'},
        {'gene': 'Cps1', 'lfc': 3.21, 'function': 'carbamoyl phosphate synthetase'},
        {'gene': 'Cdkn1a', 'lfc': 1.12, 'function': 'p21, cell cycle inhibitor'},
        {'gene': 'Arntl', 'lfc': 1.32, 'function': 'BMAL1, circadian regulator'},
        {'gene': 'Npas2', 'lfc': 1.17, 'function': 'circadian clock gene'},
        {'gene': 'Lcn2', 'lfc': 1.35, 'function': 'lipocalin, acute phase'},
        {'gene': 'Per2', 'lfc': 1.00, 'function': 'period circadian protein'},
    ],
    'Soleus': [
        {'gene': 'Myh4', 'lfc': 2.5, 'function': 'myosin heavy chain, fast fiber'},
        {'gene': 'Mybpc2', 'lfc': 1.8, 'function': 'myosin binding protein'},
        {'gene': 'Tnnt3', 'lfc': 1.6, 'function': 'troponin T, fast skeletal'},
    ],
}


# Iteration order used for all per-tissue example loops.
TISSUES = ['Heart', 'Hippocampus', 'Liver', 'Soleus']


# ---------------------------------------------------------------------------
# Example generation
# ---------------------------------------------------------------------------

# Human-readable phrases for the keys of STRESSOR_EFFECTS / KMP_EFFECTS.
_STRESSOR_NAMES = {
    'HU': 'hindlimb unloading',
    'IR': 'ionizing radiation',
    'HU_IR': 'combined HU+IR stress',
}

_KMP_CONTEXT_NAMES = {
    'baseline': 'at baseline',
    'in_HU': 'under HU stress',
    'in_IR': 'under IR stress',
    'in_HU_IR': 'under combined HU+IR stress',
}


def _example(instruction, output, input_text=""):
    """Build one example record with the instruction/input/output keys."""
    return {"instruction": instruction, "input": input_text, "output": output}


def _ranked_lines(counts):
    """Sort a {tissue: count} mapping by descending count.

    Returns the sorted (tissue, count) pairs and the numbered
    "N. Tissue: count" lines joined with newlines.
    """
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    lines = [
        f"{position}. {tissue}: {count:,}"
        for position, (tissue, count) in enumerate(ranked, start=1)
    ]
    return ranked, "\n".join(lines)


def _factual_examples():
    """Single-fact questions generated directly from the data tables."""
    examples = []

    # DEG counts per tissue and stressor.
    for tissue in TISSUES:
        name = tissue.lower()
        for stressor, count in STRESSOR_EFFECTS[tissue].items():
            stressor_name = _STRESSOR_NAMES[stressor]
            # str.capitalize() would lower-case the "HU+IR" acronym, so only
            # the first character is upper-cased.
            sentence_start = stressor_name[0].upper() + stressor_name[1:]
            examples.append(_example(
                f"How many DEGs does {stressor_name} cause in {name}?",
                f"{sentence_start} causes {count:,} differentially expressed genes (padj < 0.05) in {name}.",
            ))

    # Direction of change under combined stress.
    for tissue in TISSUES:
        name = tissue.lower()
        dirs = STRESSOR_DIRECTION[tissue]['HU_IR']
        up, down = dirs['up'], dirs['down']
        if down > up:
            trend = 'Downregulation predominates'
        elif up > down:
            trend = 'Upregulation predominates'
        else:
            trend = 'Changes are balanced'
        total = up + down
        pct_up = up / total * 100 if total else 0.0  # guard against an empty tissue
        examples.append(_example(
            f"What is the direction of gene expression changes in {name} under combined stress?",
            f"Under combined HU+IR stress, {name} shows {up:,} upregulated and {down:,} downregulated genes. {trend} ({pct_up:.1f}% upregulated).",
        ))

    # KMP DEG counts per context.
    for tissue in TISSUES:
        name = tissue.lower()
        baseline = KMP_EFFECTS[tissue]['baseline']
        for context, count in KMP_EFFECTS[tissue].items():
            answer = f"KMP produces {count:,} DEGs in {name} {_KMP_CONTEXT_NAMES[context]}."
            # A large count under combined stress only indicates stress
            # activation when it also exceeds the tissue's own baseline.
            if context == 'in_HU_IR' and count > 100 and count > baseline:
                answer += ' This indicates strong stress-activated response.'
            elif count < 10:
                answer += ' This indicates minimal/blocked response.'
            examples.append(_example(
                f"How many DEGs does KMP produce in {name} {_KMP_CONTEXT_NAMES[context]}?",
                answer,
            ))

    # Tissue classification.
    for tissue, ttype in TISSUE_TYPES.items():
        if 'stress-activated' in ttype:
            meaning = 'minimal baseline effect but strong stress-activated protection'
        elif 'baseline-active' in ttype:
            meaning = 'strong baseline effect that diminishes under stress'
        else:
            meaning = 'effects that are blocked by stress pathways'
        examples.append(_example(
            f"What is the KMP response classification for {tissue.lower()}?",
            f"{tissue} is classified as {ttype}. This means KMP shows {meaning}.",
        ))

    # Interaction terms. "Largest" is derived from the table rather than a
    # fixed threshold.
    largest_interaction = max(
        count for per_tissue in INTERACTIONS.values() for count in per_tissue.values()
    )
    for tissue in TISSUES:
        name = tissue.lower()
        for interaction, count in INTERACTIONS[tissue].items():
            int_name = interaction.replace('_x_', ' × ').replace('_', ' ')
            if count == largest_interaction:
                note = 'This is the largest interaction in the dataset.'
            elif count > 500:
                note = 'This indicates substantial non-additive effects.'
            else:
                note = 'This indicates moderate interaction.'
            examples.append(_example(
                f"What is the {int_name} interaction effect in {name}?",
                f"The {int_name} interaction in {name} produces {count:,} DEGs. {note}",
            ))

    # OXPHOS pattern per tissue.
    for tissue, pattern in OXPHOS_PATTERNS.items():
        name = tissue.lower()
        if pattern['pattern'] == 'RESCUE':
            note = 'KMP restores suppressed mitochondrial function.'
        elif pattern['pattern'] == 'SUPPRESSION':
            note = 'KMP prevents metabolic over-activation (hepatoprotective braking).'
        else:
            note = 'Mixed effects.'
        examples.append(_example(
            f"Describe the OXPHOS pathway response in {name}.",
            f"In {name}, combined stress produces OXPHOS NES = {pattern['stress_NES']:.2f}, while KMP shifts this to NES = {pattern['KMP_NES']:.2f}. Pattern: {pattern['pattern']}. {note}",
        ))

    # Hub genes.
    for tissue, genes in HUB_GENES.items():
        name = tissue.lower()
        for gene_info in genes:
            examples.append(_example(
                f"What is the role of {gene_info['gene']} in {name} stress response?",
                f"{gene_info['gene']} ({gene_info['function']}) is a hub gene in {name} with average |log2FC| = {gene_info['lfc']:.2f}. It appears significantly changed across multiple comparisons, indicating a central role in the stress/KMP response network.",
            ))

    return examples


def _comparison_examples():
    """Cross-tissue rankings and comparisons."""
    examples = []

    # Rankings are sorted from the table so the order cannot drift from the data.
    _, hu_lines = _ranked_lines({tissue: STRESSOR_EFFECTS[tissue]['HU'] for tissue in TISSUES})
    examples.append(_example(
        "Rank tissues by sensitivity to hindlimb unloading.",
        f"Tissues ranked by HU sensitivity (DEG count):\n{hu_lines}\n\nSoleus dominates due to severe muscle atrophy from unloading.",
    ))

    ir_ranked, ir_lines = _ranked_lines({tissue: STRESSOR_EFFECTS[tissue]['IR'] for tissue in TISSUES})
    top_tissue, top_count = ir_ranked[0]
    bottom_tissue, bottom_count = ir_ranked[-1]
    ir_fold = top_count / max(1, bottom_count)
    examples.append(_example(
        "Rank tissues by sensitivity to ionizing radiation.",
        f"Tissues ranked by IR sensitivity (DEG count):\n{ir_lines}\n\n{top_tissue} shows extreme IR sensitivity ({ir_fold:.0f}× more than {bottom_tissue.lower()}).",
    ))

    # Pairwise KMP comparisons under combined stress.
    for t1, t2 in [('Heart', 'Liver'), ('Heart', 'Soleus'), ('Hippocampus', 'Liver'), ('Soleus', 'Liver')]:
        activated = ['stress-activated' in TISSUE_TYPES[t] for t in (t1, t2)]
        if all(activated):
            verdict = 'Both show stress-activated patterns.'
        elif any(activated):
            verdict = 'These tissues show opposite pharmacodynamic patterns, indicating tissue-specific KMP mechanisms.'
        else:
            # e.g. Type B vs Type C: different classes, but not opposites.
            verdict = 'Neither tissue shows a stress-activated KMP pattern.'
        examples.append(_example(
            f"Compare KMP effects between {t1.lower()} and {t2.lower()} under stress.",
            f"""{t1} ({TISSUE_TYPES[t1]}): {KMP_EFFECTS[t1]['in_HU_IR']:,} DEGs under stress
{t2} ({TISSUE_TYPES[t2]}): {KMP_EFFECTS[t2]['in_HU_IR']:,} DEGs under stress

{verdict}""",
        ))

    # HU- vs IR-dominant classification.
    heart = STRESSOR_EFFECTS['Heart']
    soleus = STRESSOR_EFFECTS['Soleus']
    hippo = STRESSOR_EFFECTS['Hippocampus']
    liver = STRESSOR_EFFECTS['Liver']
    examples.append(_example(
        "Classify tissues as HU-dominant or IR-dominant.",
        f"""**HU-dominant** (mechanical stress sensitivity):
- Heart: {heart['HU']:,} HU vs {heart['IR']:,} IR DEGs (ratio: {heart['HU'] / max(1, heart['IR']):.1f}×)
- Soleus: {soleus['HU']:,} HU vs {soleus['IR']:,} IR DEGs (ratio: {soleus['HU'] / max(1, soleus['IR']):.1f}×)

**IR-dominant** (radiation sensitivity):
- Hippocampus: {hippo['HU']:,} HU vs {hippo['IR']:,} IR DEGs (ratio: {hippo['IR'] / max(1, hippo['HU']):.1f}×)

**Both stressors** (synergistic):
- Liver: {liver['HU']:,} HU vs {liver['IR']:,} IR DEGs""",
    ))

    # KMP × HU interaction ranking.
    examples.append(_example(
        "Which tissue shows the largest KMP × stressor interaction?",
        f"""KMP × HU interactions ranked:
1. Soleus: {INTERACTIONS['Soleus']['KMP_x_HU']:,} DEGs (largest in dataset)
2. Liver: {INTERACTIONS['Liver']['KMP_x_HU']:,}
3. Heart: {INTERACTIONS['Heart']['KMP_x_HU']}
4. Hippocampus: {INTERACTIONS['Hippocampus']['KMP_x_HU']}

The massive soleus interaction indicates KMP specifically engages stress pathways in mechanically unloaded muscle.""",
    ))

    # OXPHOS across tissues.
    examples.append(_example(
        "Compare OXPHOS responses across tissues.",
        f"""OXPHOS pathway responses:

**RESCUE pattern** (Stress↓ → KMP↑):
- Heart: {OXPHOS_PATTERNS['Heart']['stress_NES']:.1f} → {OXPHOS_PATTERNS['Heart']['KMP_NES']:.1f}
- Soleus: {OXPHOS_PATTERNS['Soleus']['stress_NES']:.1f} → {OXPHOS_PATTERNS['Soleus']['KMP_NES']:.1f}

**SUPPRESSION pattern** (Stress↑ → KMP↓):
- Liver: {OXPHOS_PATTERNS['Liver']['stress_NES']:.1f} → {OXPHOS_PATTERNS['Liver']['KMP_NES']:.1f}

Contractile tissues need OXPHOS restoration; liver needs metabolic braking.""",
    ))

    # Heart vs liver per pathway. Liver may have no entry for a pathway, in
    # which case no contrast can be claimed.
    for pathway in ['OXIDATIVE_PHOSPHORYLATION', 'FATTY_ACID_METABOLISM']:
        heart_nes = PATHWAY_DATA['Heart'][pathway]
        liver_nes = PATHWAY_DATA['Liver'].get(pathway)
        heart_line = f"Heart: Stress NES = {heart_nes['stress']:.2f}, KMP NES = {heart_nes['kmp']:.2f} → {heart_nes['pattern']}"
        if liver_nes is None:
            liver_line = "Liver: Stress NES = N/A, KMP NES = N/A → N/A"
            summary = "No liver data is available for this pathway, so the two tissues cannot be compared directly."
        else:
            liver_line = f"Liver: Stress NES = {liver_nes['stress']:.2f}, KMP NES = {liver_nes['kmp']:.2f} → {liver_nes['pattern']}"
            if liver_nes['pattern'] != heart_nes['pattern']:
                summary = "These opposite patterns reflect different metabolic requirements."
            else:
                summary = f"Both tissues show the same {heart_nes['pattern']} pattern."
        examples.append(_example(
            f"Compare {pathway.replace('_', ' ').lower()} pathway response between heart and liver.",
            f"{heart_line}\n{liver_line}\n\n{summary}",
        ))

    return examples


def _prediction_examples():
    """Predict-then-check examples that reveal the actual value afterwards."""
    examples = []

    # Combined-stress prediction from the two main effects.
    for tissue in TISSUES:
        name = tissue.lower()
        hu = STRESSOR_EFFECTS[tissue]['HU']
        ir = STRESSOR_EFFECTS[tissue]['IR']
        combined = STRESSOR_EFFECTS[tissue]['HU_IR']
        additive = hu + ir

        if ir < hu * 0.1:
            driver = 'HU-dominated response; IR adds minimal contribution'
        elif ir > hu * 3:
            driver = 'IR-dominated response'
        else:
            driver = 'Both stressors contribute'

        # Both thresholds are relative to the additive expectation (hu + ir).
        if combined < additive * 0.8:
            additivity = 'Sub-additive (pathway overlap)'
        elif combined < additive * 1.2:
            additivity = 'Approximately additive'
        else:
            additivity = 'Super-additive (synergy)'

        examples.append(_example(
            f"Given HU causes {hu:,} DEGs and IR causes {ir:,} DEGs in {name}, predict combined effect.",
            f"""Prediction approach: Simple addition suggests {additive:,} DEGs maximum.

Actual: {combined:,} DEGs

Analysis: {driver}.
{additivity}.""",
            input_text=f"Main effects in {name}:\n- HU: {hu:,} DEGs\n- IR: {ir:,} DEGs",
        ))

    # KMP effect under stress predicted from the tissue classification.
    for tissue in TISSUES:
        name = tissue.lower()
        ttype = TISSUE_TYPES[tissue]
        baseline = KMP_EFFECTS[tissue]['baseline']
        stressed = KMP_EFFECTS[tissue]['in_HU_IR']

        expects_increase = 'stress-activated' in ttype
        if expects_increase:
            expectation = 'Expect INCREASED effect (stress-activated)'
        elif 'stress-blocked' in ttype:
            expectation = 'Expect DECREASED effect (stress-blocked)'
        else:
            expectation = 'Expect DECREASED effect (baseline-active)'

        # Report the fold change in the direction of the change so that a
        # decrease is not rendered as "0.0×".
        if stressed > baseline:
            fold, direction = stressed / max(1, baseline), 'increase'
        elif stressed < baseline:
            fold, direction = baseline / max(1, stressed), 'decrease'
        else:
            fold, direction = 1.0, 'no change'

        confirmed = stressed > baseline if expects_increase else stressed < baseline
        verdict = 'confirms' if confirmed else 'contradicts'

        examples.append(_example(
            f"KMP produces {baseline} DEGs at baseline in {name}. Predict effect under combined stress.",
            f"""Prediction based on tissue type ({ttype}):
{expectation}

Actual: {stressed:,} DEGs

Ratio: {fold:.1f}× {direction}
This {verdict} the {ttype} classification.""",
            input_text=f"KMP baseline effect: {baseline} DEGs\nTissue type: {ttype}",
        ))

    # Cross-tissue pattern transfer.
    examples.append(_example(
        "Heart shows stress-activated KMP response (112→2,110 DEGs). Predict soleus pattern.",
        f"""Prediction: Soleus should also show stress-activated pattern (Type A).

Reasoning:
1. Both are contractile tissues requiring ATP for function
2. Both show HU sensitivity (mechanical stress)
3. Similar mitochondrial dependence

Actual: Soleus is Type A
- Baseline: {KMP_EFFECTS['Soleus']['baseline']} DEGs
- Stressed: {KMP_EFFECTS['Soleus']['in_HU_IR']} DEGs

Confirmed: Both contractile tissues show stress-activated KMP pharmacodynamics with OXPHOS rescue.""",
        input_text="Both are striated muscle with similar OXPHOS dependence.",
    ))

    examples.append(_example(
        "Given liver is Type C (stress-blocked), predict hippocampus response pattern.",
        f"""Prediction: Cannot assume same pattern—tissue function differs.

Liver (metabolic organ): Type C (stress-blocked)
- Baseline: {KMP_EFFECTS['Liver']['baseline']} DEGs
- Stressed: {KMP_EFFECTS['Liver']['in_HU_IR']} DEGs

Hippocampus (neural tissue): Actually Type B (baseline-active)
- Baseline: {KMP_EFFECTS['Hippocampus']['baseline']:,} DEGs
- Stressed: {KMP_EFFECTS['Hippocampus']['in_HU_IR']} DEGs

Key insight: Non-contractile tissues don't share a common pattern. Must test each tissue type independently.""",
        input_text="Both are non-contractile tissues.",
    ))

    # Pathway pattern transfer.
    examples.append(_example(
        "Heart shows OXPHOS rescue (NES: -2.3→+3.7). Predict liver pattern.",
        """Prediction: Opposite pattern (SUPPRESSION)

Reasoning:
1. Heart stress SUPPRESSES OXPHOS → KMP restores
2. Liver may ACTIVATE OXPHOS under stress (compensatory)
3. "Rescue" in liver = prevent over-activation

Actual:
- Liver stress OXPHOS: NES = +3.6 (ACTIVATED, opposite to heart)
- Liver KMP OXPHOS: NES = -1.6 (SUPPRESSES)
- Pattern: SUPPRESSION (hepatoprotective braking)

Confirmed: Metabolic context determines whether "rescue" means restoration or prevention.""",
        input_text="Liver has different metabolic role (detoxification vs pumping).",
    ))

    # Interaction magnitude.
    soleus_kmp_hu = INTERACTIONS['Soleus']['KMP_x_HU']
    heart_kmp_hu = INTERACTIONS['Heart']['KMP_x_HU']
    examples.append(_example(
        "Heart KMP×HU interaction = 479 DEGs. Soleus has 40× larger HU main effect. Predict soleus KMP×HU.",
        f"""Prediction: Much larger interaction, possibly 5,000-15,000 DEGs

Reasoning:
1. Soleus is ~40× more HU-sensitive than heart
2. Both are Type A (stress-activated)
3. Larger main effect often → larger interaction potential
4. Soleus baseline KMP = 0, so ALL effects are stress-dependent

Actual: {soleus_kmp_hu:,} DEGs

This is {soleus_kmp_hu / heart_kmp_hu:.0f}× larger than heart—the biggest interaction in the dataset.""",
        input_text=f"Heart HU effect: {STRESSOR_EFFECTS['Heart']['HU']} DEGs\nSoleus HU effect: {STRESSOR_EFFECTS['Soleus']['HU']:,} DEGs",
    ))

    # Which KMP interaction dominates per tissue.
    for tissue in TISSUES:
        hu = STRESSOR_EFFECTS[tissue]['HU']
        ir = STRESSOR_EFFECTS[tissue]['IR']
        kmp_hu = INTERACTIONS[tissue]['KMP_x_HU']
        kmp_ir = INTERACTIONS[tissue]['KMP_x_IR']

        if hu > ir * 3:
            prediction = 'KMP×HU larger (HU-dominant tissue)'
        elif ir > hu * 3:
            prediction = 'KMP×IR larger (IR-dominant tissue)'
        else:
            prediction = 'Comparable interactions expected'

        # Express the ratio as larger/smaller and say which side is larger;
        # KMP×HU / KMP×IR alone rounds to "0.0×" when KMP×IR dominates.
        if kmp_hu > kmp_ir:
            ratio = f"{kmp_hu / max(1, kmp_ir):.1f}× (KMP×HU larger)"
        elif kmp_ir > kmp_hu:
            ratio = f"{kmp_ir / max(1, kmp_hu):.1f}× (KMP×IR larger)"
        else:
            ratio = "1.0× (equal)"

        follows_main_effect = (kmp_hu > kmp_ir and hu > ir) or (kmp_ir > kmp_hu and ir > hu)
        if follows_main_effect:
            verdict = 'Confirmed: Interaction follows main effect dominance.'
        else:
            verdict = 'Note: Interaction pattern differs from main effect dominance.'

        examples.append(_example(
            f"Predict whether {tissue.lower()} will show larger KMP×HU or KMP×IR interaction.",
            f"""Prediction: {prediction}

Actual:
- KMP×HU: {kmp_hu:,} DEGs
- KMP×IR: {kmp_ir:,} DEGs
- Ratio: {ratio}

{verdict}""",
            input_text=f"HU effect: {hu:,} DEGs\nIR effect: {ir:,} DEGs",
        ))

    return examples


def _critique_examples():
    """Study-design critiques."""
    examples = []

    examples.append(_example(
        "Critique: Testing KMP only at baseline to assess spaceflight protection potential.",
        f"""**Major flaw**: Would miss KMP's primary protective mechanism.

Evidence:
- Heart: {KMP_EFFECTS['Heart']['baseline']} DEGs baseline vs {KMP_EFFECTS['Heart']['in_HU_IR']:,} stressed
- Soleus: {KMP_EFFECTS['Soleus']['baseline']} DEGs baseline vs {KMP_EFFECTS['Soleus']['in_HU_IR']} stressed

Baseline-only testing would conclude "KMP has no effect" when it actually shows massive stress-activated protection.

**Recommendation**: Always include stress conditions in countermeasure testing.""",
    ))

    examples.append(_example(
        "Critique: Using only heart data to predict KMP effects in all tissues.",
        f"""**Major flaw**: Tissue responses are fundamentally different.

Heart (Type A): {KMP_EFFECTS['Heart']['in_HU_IR']:,} DEGs under stress, OXPHOS rescue
Liver (Type C): {KMP_EFFECTS['Liver']['in_HU_IR']} DEGs under stress, OXPHOS suppression
Hippocampus (Type B): {KMP_EFFECTS['Hippocampus']['in_HU_IR']} DEGs under stress

Extrapolating heart→liver would predict RESCUE when actual pattern is SUPPRESSION.

**Recommendation**: Test each target tissue independently.""",
    ))

    examples.append(_example(
        "Critique: Using |log2FC| > 2 threshold for DEG detection.",
        """**Potential issue**: May miss coordinated small changes with large pathway effects.

Our analysis shows:
- Many hub genes have |log2FC| < 2 (e.g., Eda2r: 0.75, Cdkn1a: 1.12)
- OXPHOS rescue (NES shift +6.0) occurs through many genes with modest individual changes
- Pathway-level effects can be dramatic even with small per-gene changes

**Recommendation**:
- Use padj-only for discovery
- Report at multiple thresholds
- Always include pathway analysis
- Don't conclude "no effect" from stringent gene-level thresholds alone.""",
    ))

    examples.append(_example(
        "Critique: Single time-point measurement for KMP efficacy.",
        """**Limitation**: Cannot assess temporal dynamics.

Missing information:
1. Onset timing (when does KMP effect begin?)
2. Duration (how long does protection last?)
3. Recovery trajectory
4. Optimal treatment window

For spaceflight:
- Missions last months
- Single time-point may miss transient effects
- Recovery kinetics matter for crew health

**Recommendation**: Time-course studies with multiple sampling points.""",
    ))

    examples.append(_example(
        "Critique: Concluding KMP is 'ineffective' in liver based on 3 DEGs under stress.",
        f"""**Flawed conclusion**: Low DEG count ≠ no effect.

Observations:
1. Liver shows {KMP_EFFECTS['Liver']['in_HU_IR']} DEGs under stress (seemingly minimal)
2. BUT: KMP×HU interaction = {INTERACTIONS['Liver']['KMP_x_HU']:,} DEGs (substantial!)
3. OXPHOS pathway shows clear SUPPRESSION pattern

What's happening:
- KMP effect is MASKED by stress, not absent
- Pathway-level modulation still occurs
- Different pharmacodynamic mechanism (prevention vs activation)

**Correct interpretation**: Liver shows stress-blocked KMP transcriptional response, but pathway modulation still occurs through different mechanisms.""",
    ))

    # Dose-response is listed as still needed so that this example agrees
    # with the dosing calibration example ("only one dose tested").
    examples.append(_example(
        "Evaluate: Proposing KMP human trials based solely on mouse data.",
        """**Assessment**: Premature without additional studies.

What mouse data provides:
✓ Proof of concept (stress-activated protection)
✓ Tissue-specific patterns identified
✓ Mechanistic pathway data

What's still needed:
✗ Dose-response relationship (only one dose tested)
✗ Human pharmacokinetics
✗ Human tissue responses
✗ True microgravity effects (HU is analog)
✗ Long-duration effects
✗ Safety profile
✗ Drug-drug interactions

**Recommended path**:
1. Human cell/organoid studies
2. PK modeling
3. Ground-based human analog studies
4. Then Phase I trials

Mouse data provides rationale, not sufficient evidence for direct human trials.""",
    ))

    return examples


def _mechanistic_examples():
    """Mechanism-hypothesis explanations."""
    examples = []

    examples.append(_example(
        "Explain why soleus shows 0 KMP DEGs at baseline but 491 under stress.",
        """This "stress-activated" pattern indicates conditional pharmacodynamics:

**Mechanism hypotheses**:

1. **Target availability**: KMP receptors/targets may be expressed only under stress
- Mechanical unloading activates mechanosensitive gene programs
- Creates binding sites for KMP-responsive transcription factors

2. **Pathway priming**: Stress activates pathways that synergize with KMP
- Oxidative stress → antioxidant response elements accessible
- Inflammation → NF-κB pathway engagement

3. **Metabolic context**: Mitochondrial dysfunction creates KMP targets
- Normal mitochondria: No dysfunction to rescue
- Stressed mitochondria: OXPHOS suppression creates rescue opportunity

**Implication**: KMP is a conditional countermeasure—active when needed, dormant otherwise. This is advantageous: minimal side effects in healthy tissue.""",
    ))

    examples.append(_example(
        "Why does liver show OXPHOS SUPPRESSION while heart shows OXPHOS RESCUE?",
        f"""Opposite patterns reflect different metabolic contexts:

**Heart (RESCUE)**:
- Function: Continuous contraction requires ATP
- Stress effect: OXPHOS suppressed (NES = {OXPHOS_PATTERNS['Heart']['stress_NES']:.1f})
- Problem: Energy deficit → contractile failure
- KMP solution: Restore OXPHOS (NES → {OXPHOS_PATTERNS['Heart']['KMP_NES']:.1f})

**Liver (SUPPRESSION)**:
- Function: Metabolic processing, detoxification
- Stress effect: OXPHOS activated (NES = {OXPHOS_PATTERNS['Liver']['stress_NES']:.1f})
- Problem: Over-activation → ROS, oxidative damage
- KMP solution: Brake metabolism (NES → {OXPHOS_PATTERNS['Liver']['KMP_NES']:.1f})

**Key insight**: "Protection" means different things:
- Heart: Restore lost function
- Liver: Prevent harmful over-activation

This is why cross-tissue drug extrapolation is dangerous without mechanistic understanding.""",
    ))

    soleus_kmp_hu = INTERACTIONS['Soleus']['KMP_x_HU']
    examples.append(_example(
        "Explain the biological significance of the 8,484 DEG KMP×HU interaction in soleus.",
        f"""This massive interaction ({soleus_kmp_hu:,} DEGs) is the largest in the dataset.

**What it means mathematically**:
- KMP alone: {KMP_EFFECTS['Soleus']['baseline']} DEGs
- HU alone: {STRESSOR_EFFECTS['Soleus']['HU']:,} DEGs (many genes, but different set)
- Interaction: {soleus_kmp_hu:,} DEGs that are NEITHER KMP-only NOR HU-only

**Biological interpretation**:

1. **Emergent pharmacology**: KMP creates effects that ONLY exist in unloaded muscle

2. **Mechanosensitive activation**: Unloading activates pathways that:
- Create KMP binding sites
- Express KMP target genes
- Enable KMP transcriptional effects

3. **Myokine signaling**: Unloaded muscle secretes different myokines that may synergize with KMP

4. **Mitochondrial crosstalk**: KMP's mitochondrial effects only manifest when organelles are stressed

**Therapeutic implication**: This interaction is exactly what you want—a countermeasure that specifically engages when the tissue is stressed, not when it's healthy.""",
    ))

    examples.append(_example(
        "Why do circadian genes (Arntl, Per2, Npas2) appear as hub genes in heart?",
        """Circadian genes as hub genes suggests spaceflight disrupts daily rhythms:

**Biological connection**:

1. **Circadian-metabolic coupling**:
- Heart metabolism is circadian-regulated
- BMAL1 (Arntl) controls mitochondrial biogenesis
- PER2 regulates fatty acid metabolism

2. **Spaceflight disruption**:
- Altered light cycles
- Shifted sleep patterns
- Stress hormone rhythms disrupted

3. **KMP circadian effects**:
- Flavonoids can modulate circadian machinery
- May help resynchronize metabolic rhythms
- OXPHOS rescue could occur through circadian normalization

**Implication**: KMP's cardiac protection may involve circadian rhythm restoration, not just direct metabolic effects. This suggests timing of KMP administration might matter.""",
    ))

    return examples


def _calibration_examples():
    """Questions whose honest answer states the limits of the data."""
    examples = []

    examples.append(_example(
        "What is the effect of KMP on cardiac contractility?",
        """**Cannot directly answer from transcriptomic data.**

What data shows:
- 2,110 DEGs in heart under stress
- OXPHOS rescue (NES +6.0 shift)
- Circadian gene normalization

What data does NOT show:
- Actual force generation
- Calcium handling
- Protein-level changes
- Ejection fraction

**Reasonable inference** (with uncertainty):
- OXPHOS rescue suggests improved ATP availability
- ATP is required for contraction
- Therefore contractility MIGHT improve

**Confidence**: Low for functional claims, high for transcriptional patterns

**Would need**: Echocardiography, isolated heart studies, or cardiomyocyte assays.""",
    ))

    # NOTE(review): the "86.6% upregulated" figure below cannot be derived
    # from the tables in this file (STRESSOR_DIRECTION gives 50.5% up for
    # soleus under HU_IR) — confirm its source.
    examples.append(_example(
        "Will KMP prevent muscle atrophy in astronauts?",
        """**Cannot confidently answer this question.**

**Supportive mouse data**:
✓ 491 DEGs in soleus under stress (stress-activated)
✓ 8,484 DEG KMP×HU interaction
✓ OXPHOS rescue pattern
✓ 86.6% upregulated genes (anabolic signature)

**Unknown factors**:
✗ Human pharmacokinetics
✗ Required human dose
✗ True microgravity vs HU analog
✗ 6+ month mission duration effects
✗ Translation of transcriptomics to muscle mass

**My assessment**:
- Strong mechanistic rationale: YES
- Sufficient evidence for human use: NO

Recommended next steps: Human muscle organoids, dose-finding, analog studies.""",
    ))

    examples.append(_example(
        "Is the exact DEG count of 2,110 in heart reliable?",
        """**The number is precise but context-dependent.**

2,110 is:
- Exact count at padj < 0.05 threshold
- Specific to this analysis pipeline
- Reproducible with same parameters

Sources of variation:
1. Threshold choice (log2FC filter would change count)
2. Normalization method affects p-values
3. Multiple testing correction method
4. Biological variation between replicates

**Appropriate statements**:
✓ "Approximately 2,100 DEGs"
✓ "About 2,000 significantly changed genes"
✗ "Exactly 2,110 is the true biological count"

The order of magnitude and relative ranking across tissues is robust; exact number is method-dependent.""",
    ))

    examples.append(_example(
        "Which gene is most important for KMP's protective effect?",
        """**Cannot definitively answer from transcriptomic data.**

**Why we can't identify "most important"**:
1. Large change ≠ causal role (correlation ≠ causation)
2. Pathway redundancy (multiple genes, same function)
3. No functional validation (knockdown/knockout)
4. Tissue-specificity (different genes in different tissues)

**What we CAN say**:
- Hub genes show largest/most consistent effects
- Heart: Cdkn1a, Arntl appear frequently
- Pathway level: OXPHOS genes collectively important

**To identify causal genes would need**:
- CRISPR screening
- Pharmacological inhibitors
- Time-course (early responders)
- Cross-species conservation

Transcriptomics identifies candidates; functional studies identify mechanisms.""",
    ))

    examples.append(_example(
        "What dose of KMP should humans take?",
        """**This data cannot inform human dosing.**

**What's missing**:
1. Dose-response relationship (only one dose tested)
2. Human pharmacokinetics (ADME)
3. Therapeutic window (efficacy vs toxicity)
4. Bioavailability differences
5. Target tissue concentrations

**Why mouse→human scaling is unreliable**:
- Metabolic rate differences
- Body surface area scaling often off by 10-100×
- Species-specific drug metabolism
- Individual human variation

**Required studies for dosing**:
1. Mouse dose-response
2. PK modeling
3. Human cell studies
4. Phase I safety/PK
5. Then efficacy trials

**Do not extrapolate**: Mechanism supports development; dosing requires dedicated PK/PD work.""",
    ))

    return examples


def _supplementary_examples():
    """Extra pathway, head-to-head and yes/no factual questions."""
    examples = []

    # Pathway NES values for the tissues with heart/liver pathway tables.
    for tissue in ['Heart', 'Liver']:
        name = tissue.lower()
        for pathway, data in PATHWAY_DATA.get(tissue, {}).items():
            label = pathway.replace('_', ' ')
            examples.append(_example(
                f"What is the {label} NES in {name} under stress?",
                f"Under combined HU+IR stress, {name} shows {label} NES = {data['stress']:.2f}. After KMP treatment, NES shifts to {data['kmp']:.2f}. Pattern: {data['pattern']}.",
            ))

    # Head-to-head comparisons: (tissue 1, tissue 2, metric label, table, key).
    comparisons = [
        ("Heart", "Soleus", "HU sensitivity", STRESSOR_EFFECTS, 'HU'),
        ("Heart", "Hippocampus", "IR sensitivity", STRESSOR_EFFECTS, 'IR'),
        ("Liver", "Soleus", "KMP effect under stress", KMP_EFFECTS, 'in_HU_IR'),
    ]
    for t1, t2, metric, table, key in comparisons:
        v1, v2 = table[t1][key], table[t2][key]
        winner = t1 if v1 > v2 else t2
        examples.append(_example(
            f"Which has higher {metric}: {t1.lower()} or {t2.lower()}?",
            f"{winner} has higher {metric} ({max(v1, v2):,} vs {min(v1, v2):,} DEGs).",
        ))

    # Yes/no questions: (question, answer, supporting detail). The leading
    # "Yes."/"No." is derived from the answer so it cannot disagree with it.
    yn_questions = [
        (
            "Is heart more sensitive to HU than IR?",
            STRESSOR_EFFECTS['Heart']['HU'] > STRESSOR_EFFECTS['Heart']['IR'],
            f"Heart shows {STRESSOR_EFFECTS['Heart']['HU']} HU DEGs vs {STRESSOR_EFFECTS['Heart']['IR']} IR DEGs.",
        ),
        (
            "Does KMP show stress-activated response in liver?",
            False,
            f"Liver is Type C (stress-blocked): {KMP_EFFECTS['Liver']['baseline']} DEGs at baseline → {KMP_EFFECTS['Liver']['in_HU_IR']} under stress.",
        ),
        (
            "Is the KMP×HU interaction larger than KMP×IR in soleus?",
            INTERACTIONS['Soleus']['KMP_x_HU'] > INTERACTIONS['Soleus']['KMP_x_IR'],
            f"KMP×HU = {INTERACTIONS['Soleus']['KMP_x_HU']:,} vs KMP×IR = {INTERACTIONS['Soleus']['KMP_x_IR']} DEGs.",
        ),
        (
            "Does hippocampus show OXPHOS rescue?",
            False,
            "Hippocampus shows minimal stress effect on OXPHOS (NES = 0.93, NS). Cannot rescue what isn't suppressed.",
        ),
    ]
    for question, answer, detail in yn_questions:
        examples.append(_example(question, f"{'Yes' if answer else 'No'}. {detail}"))

    return examples


def generate_examples():
    """Generate the full list of instruction-tuning examples.

    Returns:
        A list of dicts, each with the keys "instruction", "input" and
        "output" (all strings; "input" is "" when there is no extra
        context). Examples are emitted in a fixed category order: factual,
        comparison, prediction, critique, mechanistic, calibration, then
        supplementary factual questions.
    """
    examples = []
    for build in (
        _factual_examples,
        _comparison_examples,
        _prediction_examples,
        _critique_examples,
        _mechanistic_examples,
        _calibration_examples,
        _supplementary_examples,
    ):
        examples.extend(build())
    return examples
|
|
|
|
def format_for_training(examples):
    """Render example dicts into single-string SFT records.

    Each example becomes ``{"text": ...}`` where the text is made of
    "### Instruction:", an optional "### Input:" (only when the example has
    a truthy 'input' value) and "### Response:" sections separated by blank
    lines.

    Args:
        examples: iterable of dicts with 'instruction' and 'output' keys and
            an optional 'input' key.

    Returns:
        A list of ``{"text": str}`` dicts in the same order as ``examples``.
    """
    records = []
    for example in examples:
        sections = [f"### Instruction:\n{example['instruction']}"]
        # The input section is omitted entirely when missing or empty.
        if example.get('input'):
            sections.append(f"### Input:\n{example['input']}")
        sections.append(f"### Response:\n{example['output']}")
        records.append({"text": "\n\n".join(sections)})
    return records
|
|
|
|
# Keyword rules for the approximate category tally, checked in order; the
# first rule with a matching substring wins. Critique is checked first so
# that instructions such as "Critique: ... to predict ..." are not counted
# as Prediction.
_CATEGORY_RULES = (
    ('Critique', ('critique', 'evaluate')),
    ('Factual', ('how many', 'what is the', 'describe')),
    ('Comparison', ('compare', 'rank', 'which')),
    ('Prediction', ('predict', 'given')),
    ('Mechanistic', ('explain', 'why')),
)


def _categorize_instruction(instruction):
    """Return the approximate category name for an instruction string."""
    lowered = instruction.lower()
    for category, keywords in _CATEGORY_RULES:
        if any(keyword in lowered for keyword in keywords):
            return category
    return 'Calibration'


def main():
    """Generate the dataset, write it to kmp_sft_dataset.json and print a summary."""
    print("Generating expanded SFT dataset...")
    examples = generate_examples()
    formatted = format_for_training(examples)

    # Explicit encoding keeps the output independent of the platform default.
    with open('kmp_sft_dataset.json', 'w', encoding='utf-8') as f:
        json.dump(formatted, f, indent=2)

    separator = '=' * 60
    print(f"\n{separator}")
    print("SFT Dataset Summary")
    print(separator)
    print(f"Total examples: {len(formatted)}")
    print("Output: kmp_sft_dataset.json")

    # Keyword-based tally; approximate because examples carry no category tag.
    categories = {
        'Factual': 0, 'Comparison': 0, 'Prediction': 0,
        'Critique': 0, 'Mechanistic': 0, 'Calibration': 0
    }
    for ex in examples:
        categories[_categorize_instruction(ex['instruction'])] += 1

    print("\nApproximate category breakdown:")
    for cat, count in categories.items():
        print(f"  - {cat}: {count}")


if __name__ == "__main__":
    main()
|
|