| |
| """ |
| BioRLHF SFT Dataset Generator |
| Creates instruction-tuning dataset from KMP 2x2x2 factorial mouse data |
| |
| Usage: |
| python create_sft_dataset.py --output kmp_sft_dataset.json |
| """ |
|
|
| import json |
| import argparse |
| from typing import List, Dict |
|
|
| |
| |
| |
|
|
| STRESSOR_EFFECTS = { |
| 'Heart': {'HU': 165, 'IR': 33, 'HU_IR': 910}, |
| 'Hippocampus': {'HU': 1555, 'IR': 5477, 'HU_IR': 5510}, |
| 'Liver': {'HU': 4110, 'IR': 1273, 'HU_IR': 6213}, |
| 'Soleus': {'HU': 6425, 'IR': 67, 'HU_IR': 6830}, |
| } |
|
|
| KMP_EFFECTS = { |
| 'Heart': {'baseline': 112, 'in_HU': 2, 'in_IR': 2, 'in_HU_IR': 2110}, |
| 'Hippocampus': {'baseline': 4110, 'in_HU': 1, 'in_IR': 243, 'in_HU_IR': 140}, |
| 'Liver': {'baseline': 309, 'in_HU': 17, 'in_IR': 389, 'in_HU_IR': 3}, |
| 'Soleus': {'baseline': 0, 'in_HU': 1, 'in_IR': 52, 'in_HU_IR': 491}, |
| } |
|
|
| INTERACTIONS = { |
| 'Heart': {'HU_x_IR': 244, 'KMP_x_HU': 479, 'KMP_x_IR': 29}, |
| 'Hippocampus': {'HU_x_IR': 93, 'KMP_x_HU': 36, 'KMP_x_IR': 1221}, |
| 'Liver': {'HU_x_IR': 3210, 'KMP_x_HU': 3369, 'KMP_x_IR': 247}, |
| 'Soleus': {'HU_x_IR': 211, 'KMP_x_HU': 8484, 'KMP_x_IR': 484}, |
| } |
|
|
| TISSUE_TYPES = { |
| 'Heart': 'Type A (stress-activated)', |
| 'Soleus': 'Type A (stress-activated)', |
| 'Hippocampus': 'Type B (baseline-active)', |
| 'Liver': 'Type C (stress-blocked)', |
| } |
|
|
| OXPHOS_PATTERNS = { |
| 'Heart': {'stress_NES': -2.302, 'KMP_NES': 3.691, 'pattern': 'RESCUE'}, |
| 'Hippocampus': {'stress_NES': 0.931, 'KMP_NES': 1.585, 'pattern': 'KMP Only'}, |
| 'Liver': {'stress_NES': 3.596, 'KMP_NES': -1.6, 'pattern': 'SUPPRESSION'}, |
| 'Soleus': {'stress_NES': -2.997, 'KMP_NES': 2.46, 'pattern': 'RESCUE'}, |
| } |
|
|
|
|
| def generate_factual_examples() -> List[Dict]: |
| examples = [] |
| |
| for tissue, effects in STRESSOR_EFFECTS.items(): |
| if tissue == 'Soleus': |
| hu_desc = "This reflects the severe muscle atrophy that occurs during simulated microgravity." |
| else: |
| hu_desc = f"This indicates {tissue.lower()} sensitivity to mechanical unloading." |
| |
| examples.append({ |
| "instruction": f"How many differentially expressed genes does hindlimb unloading cause in {tissue.lower()}?", |
| "input": "", |
| "output": f"Hindlimb unloading (HU) causes {effects['HU']:,} differentially expressed genes (padj < 0.05) in {tissue.lower()}. {hu_desc}" |
| }) |
| |
| if tissue == 'Hippocampus': |
| ir_desc = "This high number reflects the brain's particular sensitivity to radiation-induced damage." |
| elif effects['IR'] < 100: |
| ir_desc = "This minimal response indicates low radiation sensitivity in this tissue." |
| else: |
| ir_desc = "This moderate response indicates radiation sensitivity in this tissue." |
| |
| examples.append({ |
| "instruction": f"How many DEGs does ionizing radiation induce in {tissue.lower()}?", |
| "input": "", |
| "output": f"Ionizing radiation (IR) induces {effects['IR']:,} DEGs in {tissue.lower()}. {ir_desc}" |
| }) |
| |
| examples.append({ |
| "instruction": f"What is the combined effect of HU and IR stress on {tissue.lower()} transcriptome?", |
| "input": "", |
| "output": f"Combined HU+IR stress produces {effects['HU_IR']:,} DEGs in {tissue.lower()}." |
| }) |
| |
| for tissue, effects in KMP_EFFECTS.items(): |
| pattern = "stress-activated" if effects['in_HU_IR'] > effects['baseline'] else "stress-blocked" |
| examples.append({ |
| "instruction": f"How does kaempferol (KMP) affect gene expression in {tissue.lower()} under combined stress?", |
| "input": "", |
| "output": f"In {tissue.lower()}, KMP produces {effects['in_HU_IR']:,} DEGs under combined HU+IR stress, compared to {effects['baseline']:,} DEGs at baseline. This {pattern} pattern indicates KMP's pharmacodynamics are highly context-dependent." |
| }) |
| |
| for tissue, ints in INTERACTIONS.items(): |
| examples.append({ |
| "instruction": f"What is the KMP x HU interaction effect in {tissue.lower()}?", |
| "input": "", |
| "output": f"The KMP x HU interaction in {tissue.lower()} produces {ints['KMP_x_HU']:,} DEGs, indicating non-additive pharmacodynamics." |
| }) |
| |
| for tissue, pattern in OXPHOS_PATTERNS.items(): |
| examples.append({ |
| "instruction": f"What is the OXPHOS pathway pattern in {tissue.lower()} under stress and KMP treatment?", |
| "input": "", |
| "output": f"In {tissue.lower()}, combined stress produces OXPHOS NES = {pattern['stress_NES']:.2f}, while KMP treatment shifts this to NES = {pattern['KMP_NES']:.2f}. This represents a {pattern['pattern']} pattern." |
| }) |
| |
| for tissue, ttype in TISSUE_TYPES.items(): |
| examples.append({ |
| "instruction": f"What type of KMP response pattern does {tissue.lower()} show?", |
| "input": "", |
| "output": f"{tissue} is classified as {ttype}." |
| }) |
| |
| return examples |
|
|
|
|
| def generate_comparison_examples() -> List[Dict]: |
| examples = [] |
| |
| examples.append({ |
| "instruction": "Which tissue is most sensitive to hindlimb unloading?", |
| "input": "", |
| "output": f"Soleus muscle is most sensitive to hindlimb unloading with {STRESSOR_EFFECTS['Soleus']['HU']:,} DEGs, followed by Liver ({STRESSOR_EFFECTS['Liver']['HU']:,}), Hippocampus ({STRESSOR_EFFECTS['Hippocampus']['HU']:,}), and Heart ({STRESSOR_EFFECTS['Heart']['HU']})." |
| }) |
| |
| examples.append({ |
| "instruction": "Which tissue is most sensitive to ionizing radiation?", |
| "input": "", |
| "output": f"Hippocampus is most sensitive to ionizing radiation with {STRESSOR_EFFECTS['Hippocampus']['IR']:,} DEGs, dramatically higher than other tissues." |
| }) |
| |
| examples.append({ |
| "instruction": "Compare KMP's transcriptional effects across all four tissues under combined stress.", |
| "input": "", |
| "output": f"KMP shows dramatically different effects across tissues under HU+IR stress: Heart ({KMP_EFFECTS['Heart']['in_HU_IR']:,} DEGs, stress-activated), Soleus ({KMP_EFFECTS['Soleus']['in_HU_IR']} DEGs, stress-activated), Hippocampus ({KMP_EFFECTS['Hippocampus']['in_HU_IR']} DEGs, stress-blocked), and Liver ({KMP_EFFECTS['Liver']['in_HU_IR']} DEGs, stress-blocked)." |
| }) |
| |
| examples.append({ |
| "instruction": "Which tissue shows the largest KMP x HU interaction effect?", |
| "input": "", |
| "output": f"Soleus shows the largest KMP x HU interaction with {INTERACTIONS['Soleus']['KMP_x_HU']:,} DEGs, the largest interaction in the entire dataset." |
| }) |
| |
| examples.append({ |
| "instruction": "Classify tissues by their dominant stressor sensitivity.", |
| "input": "", |
| "output": f"HU-dominant: Heart ({STRESSOR_EFFECTS['Heart']['HU']} vs {STRESSOR_EFFECTS['Heart']['IR']}), Soleus ({STRESSOR_EFFECTS['Soleus']['HU']:,} vs {STRESSOR_EFFECTS['Soleus']['IR']}). IR-dominant: Hippocampus ({STRESSOR_EFFECTS['Hippocampus']['HU']:,} vs {STRESSOR_EFFECTS['Hippocampus']['IR']:,}). Both: Liver." |
| }) |
| |
| tissue_pairs = [('Heart', 'Soleus'), ('Heart', 'Liver'), ('Hippocampus', 'Liver')] |
| for t1, t2 in tissue_pairs: |
| examples.append({ |
| "instruction": f"Compare KMP context-dependency between {t1.lower()} and {t2.lower()}.", |
| "input": "", |
| "output": f"{t1} ({TISSUE_TYPES[t1]}): baseline {KMP_EFFECTS[t1]['baseline']} DEGs, stressed {KMP_EFFECTS[t1]['in_HU_IR']:,} DEGs. {t2} ({TISSUE_TYPES[t2]}): baseline {KMP_EFFECTS[t2]['baseline']} DEGs, stressed {KMP_EFFECTS[t2]['in_HU_IR']} DEGs." |
| }) |
| |
| return examples |
|
|
|
|
| def generate_interaction_examples() -> List[Dict]: |
| examples = [] |
| |
| for tissue in ['Heart', 'Hippocampus', 'Liver', 'Soleus']: |
| hu = STRESSOR_EFFECTS[tissue]['HU'] |
| ir = STRESSOR_EFFECTS[tissue]['IR'] |
| combined = STRESSOR_EFFECTS[tissue]['HU_IR'] |
| |
| examples.append({ |
| "instruction": f"Given the main effects of HU and IR separately in {tissue.lower()}, predict the combined HU+IR effect.", |
| "input": f"{tissue} main effects: HU alone: {hu:,} DEGs, IR alone: {ir:,} DEGs. Predict the combined HU+IR effect.", |
| "output": f"If additive, expect up to {hu + ir:,} DEGs. Actual result: {combined:,} DEGs. The combined effect reflects tissue-specific integration of stress responses." |
| }) |
| |
| for tissue in ['Heart', 'Soleus', 'Liver', 'Hippocampus']: |
| baseline = KMP_EFFECTS[tissue]['baseline'] |
| stressed = KMP_EFFECTS[tissue]['in_HU_IR'] |
| ttype = TISSUE_TYPES[tissue] |
| |
| examples.append({ |
| "instruction": f"KMP shows {baseline} DEGs at baseline in {tissue.lower()}. Predict KMP effect under combined HU+IR stress.", |
| "input": f"KMP at baseline in {tissue.lower()}: {baseline} DEGs. {tissue} stress response (HU+IR): {STRESSOR_EFFECTS[tissue]['HU_IR']:,} DEGs.", |
| "output": f"Actual result: {stressed:,} DEGs ({ttype}). {'Stress activates KMP response.' if stressed > baseline else 'Stress blocks KMP response.'}" |
| }) |
| |
| examples.append({ |
| "instruction": "The KMP x HU interaction in heart produces 479 DEGs. Predict the magnitude in soleus.", |
| "input": "Heart KMP x HU: 479 DEGs. Both are striated muscle. Soleus has larger HU response.", |
| "output": f"Prediction: Larger than heart. Actual: {INTERACTIONS['Soleus']['KMP_x_HU']:,} DEGs, the largest interaction in the dataset." |
| }) |
| |
| examples.append({ |
| "instruction": "Given OXPHOS RESCUE in heart, predict liver OXPHOS response to KMP.", |
| "input": f"Heart: Stress suppresses OXPHOS (NES={OXPHOS_PATTERNS['Heart']['stress_NES']}), KMP rescues (NES={OXPHOS_PATTERNS['Heart']['KMP_NES']}). Liver is Type C.", |
| "output": f"Prediction: Opposite pattern. Actual: Liver stress NES={OXPHOS_PATTERNS['Liver']['stress_NES']:.2f}, KMP NES={OXPHOS_PATTERNS['Liver']['KMP_NES']:.2f}. SUPPRESSION pattern (hepatoprotective braking)." |
| }) |
| |
| return examples |
|
|
|
|
| def generate_design_critique_examples() -> List[Dict]: |
| examples = [] |
| |
| examples.append({ |
| "instruction": "A researcher proposes testing KMP efficacy by comparing KMP-treated vs vehicle-treated mice under normal conditions. Critique this design.", |
| "input": "Proposed: Compare KMP_vs_Ctrl under normal conditions.", |
| "output": f"This design would miss KMP's primary mechanism. Heart: {KMP_EFFECTS['Heart']['baseline']} DEGs at baseline vs {KMP_EFFECTS['Heart']['in_HU_IR']:,} under stress. Soleus: {KMP_EFFECTS['Soleus']['baseline']} vs {KMP_EFFECTS['Soleus']['in_HU_IR']} DEGs. Recommendation: Use factorial design including stress conditions." |
| }) |
| |
| examples.append({ |
| "instruction": "Evaluate whether results from heart can predict liver response to KMP under stress.", |
| "input": "Heart shows KMP rescues OXPHOS under stress. Can we extrapolate to liver?", |
| "output": f"Direct extrapolation would be INCORRECT. Heart is Type A, Liver is Type C. OXPHOS patterns are opposite: Heart stress suppresses then KMP rescues; Liver stress activates then KMP suppresses. Each tissue requires independent testing." |
| }) |
| |
| examples.append({ |
| "instruction": "A study finds KMP reduces oxidative stress markers in cell culture. How confident can we be this translates to in vivo protection?", |
| "input": "", |
| "output": "Confidence: LOW to MODERATE. Cell culture shows mechanism plausibility but cannot capture context-dependency (in vivo effects range 0-2,110 DEGs), tissue specificity (opposite OXPHOS patterns), or interaction effects (KMP x HU = 8,484 DEGs in soleus)." |
| }) |
| |
| examples.append({ |
| "instruction": "Critique a study using padj < 0.05 AND |log2FC| > 1 threshold for DEG detection.", |
| "input": "Study reports 'no effect' of KMP in heart under stress using stringent thresholds.", |
| "output": f"Overly stringent thresholds miss meaningful effects. With padj < 0.05 only: KMP produces {KMP_EFFECTS['Heart']['in_HU_IR']:,} DEGs in heart. Many coordinated small changes drive pathway-level effects. Use padj-only for discovery, complement with pathway analysis." |
| }) |
| |
| examples.append({ |
| "instruction": "A grant proposal plans to test KMP in astronauts based solely on the mouse data. Evaluate the translational gap.", |
| "input": "", |
| "output": "SUBSTANTIAL gaps remain. Mouse data provides: proof of concept, tissue patterns, OXPHOS mechanism. Missing: human PK, true microgravity effects (HU is analog), long-duration effects, safety profile. Needed: human organoid studies, dose-finding, analog studies before human trials." |
| }) |
| |
| return examples |
|
|
|
|
| def generate_mechanistic_examples() -> List[Dict]: |
| examples = [] |
| |
| examples.append({ |
| "instruction": "Explain why hippocampus shows strong KMP effect at baseline (4,110 DEGs) but minimal effect under combined stress (140 DEGs).", |
| "input": "", |
| "output": "This 'baseline-active, stress-blocked' pattern suggests: (1) Receptor saturation - stress signals already engage KMP targets, (2) Pathway competition - IR-induced DNA damage response overrides KMP effects, (3) Altered cellular state - radiation changes transcriptional machinery. Implications: may need pre-treatment before stress." |
| }) |
| |
| examples.append({ |
| "instruction": "The KMP x HU interaction in soleus produces 8,484 DEGs. Explain this super-additive interaction.", |
| "input": "", |
| "output": "This massive interaction indicates: (1) Mechanosensitive activation - unloading creates binding sites for KMP-responsive factors, (2) Stress-primed responsiveness - HU upregulates KMP targets normally not expressed, (3) Mitochondrial crosstalk - KMP effects on mitochondria only manifest when organelles are stressed. Key insight: KMP specifically protects stressed tissue." |
| }) |
| |
| examples.append({ |
| "instruction": "Why does liver show OXPHOS suppression by KMP when heart and soleus show OXPHOS rescue?", |
| "input": "", |
| "output": "Different metabolic contexts: Heart/Soleus (RESCUE) - stress suppresses OXPHOS, insufficient ATP, KMP restores function. Liver (SUPPRESSION) - stress activates compensatory metabolism, over-activation causes damage, KMP provides 'hepatoprotective braking'. 'Rescue' is tissue-specific: restore suppressed function vs prevent harmful over-activation." |
| }) |
| |
| examples.append({ |
| "instruction": "Explain the tissue-type classification system (Type A/B/C).", |
| "input": "", |
| "output": f"Type A (stress-activated): Heart, Soleus - minimal baseline effect, strong under stress. Type B (baseline-active): Hippocampus - strong baseline ({KMP_EFFECTS['Hippocampus']['baseline']:,} DEGs), minimal under stress ({KMP_EFFECTS['Hippocampus']['in_HU_IR']}). Type C (stress-blocked): Liver - moderate baseline ({KMP_EFFECTS['Liver']['baseline']}), nearly zero under stress ({KMP_EFFECTS['Liver']['in_HU_IR']}). Clinical implications: tissue-specific dosing, timing, and endpoints needed." |
| }) |
| |
| return examples |
|
|
|
|
| def generate_calibration_examples() -> List[Dict]: |
| examples = [] |
| |
| examples.append({ |
| "instruction": "What is the effect of KMP on cardiac contractility?", |
| "input": "", |
| "output": "I cannot directly answer this. The data shows transcriptomic changes (2,110 DEGs, OXPHOS rescue) but NOT functional contractility. OXPHOS rescue suggests improved ATP availability, which might improve contractility, but transcription-to-function is not guaranteed. Confidence: low for functional claims." |
| }) |
| |
| examples.append({ |
| "instruction": "Will KMP prevent muscle atrophy in human astronauts?", |
| "input": "", |
| "output": "I cannot confidently answer this. Mouse data supports: stress-activated protection, OXPHOS rescue, KMP x HU interaction. Mouse data cannot tell us: human PK, dose requirements, true microgravity effects, long-duration effects. Strong mechanistic rationale but insufficient for human predictions without additional studies." |
| }) |
| |
| examples.append({ |
| "instruction": "Which gene is most important for KMP's protective effect?", |
| "input": "", |
| "output": "Cannot be definitively answered from transcriptomic data. Problems: correlation is not causation, pathway redundancy, no functional validation, tissue-specificity. Better questions: 'Which genes show largest effects?' (answerable), 'Which pathways are enriched?' (answerable), 'Which gene is necessary?' (requires CRISPR studies)." |
| }) |
| |
| examples.append({ |
| "instruction": "Based on this data, what dose of KMP should be used for spaceflight countermeasures?", |
| "input": "", |
| "output": "This data CANNOT inform dosing. Missing: dose-response relationship (only one dose tested), human PK, therapeutic window, bioavailability differences. Mouse-to-human scaling is unreliable (10-100x errors possible). Needed: mouse dose-response, PK modeling, human Phase I studies. Do not extrapolate dosing from this study." |
| }) |
| |
| examples.append({ |
| "instruction": "Is the KMP x HU interaction (8,484 DEGs) statistically significant?", |
| "input": "", |
| "output": "Yes, with context. Each gene has padj < 0.05 for the interaction term. However: (1) exact number is threshold-dependent, (2) effect sizes vary across genes later. Appropriate statement: 'approximately 8,500 significantly changed genes (padj < 0.05), indicating substantial non-additive pharmacological effect.'" |
| }) |
| |
| return examples |
|
|
|
|
| def compile_sft_dataset(output_file: str = 'kmp_sft_dataset.json'): |
| all_examples = [] |
| |
| print("Generating factual examples...") |
| all_examples.extend(generate_factual_examples()) |
| |
| print("Generating comparison examples...") |
| all_examples.extend(generate_comparison_examples()) |
| |
| print("Generating interaction prediction examples...") |
| all_examples.extend(generate_interaction_examples()) |
| |
| print("Generating design critique examples...") |
| all_examples.extend(generate_design_critique_examples()) |
| |
| print("Generating mechanistic reasoning examples...") |
| all_examples.extend(generate_mechanistic_examples()) |
| |
| print("Generating calibration examples...") |
| all_examples.extend(generate_calibration_examples()) |
| |
| formatted = [] |
| for ex in all_examples: |
| if ex.get('input'): |
| text = f"### Instruction:\n{ex['instruction']}\n\n### Input:\n{ex['input']}\n\n### Response:\n{ex['output']}" |
| else: |
| text = f"### Instruction:\n{ex['instruction']}\n\n### Response:\n{ex['output']}" |
| formatted.append({"text": text}) |
| |
| with open(output_file, 'w') as f: |
| json.dump(formatted, f, indent=2) |
| |
| print(f"\n{'='*60}") |
| print(f"SFT Dataset Summary") |
| print(f"{'='*60}") |
| print(f"Total examples: {len(formatted)}") |
| print(f"Output file: {output_file}") |
| |
| return formatted |
|
|
|
|
| if __name__ == "__main__": |
| parser = argparse.ArgumentParser() |
| parser.add_argument('--output', default='kmp_sft_dataset.json') |
| args = parser.parse_args() |
| compile_sft_dataset(args.output) |
|
|