| |
| """ |
| BioRLHF Quickstart Example |
| |
| This script demonstrates the basic workflow for using BioRLHF: |
| 1. Loading ground truth biological data |
| 2. Creating an SFT dataset |
| 3. Exploring the generated examples |
| |
| Note: This example does not require a GPU and is safe to run locally. |
| """ |
|
|
| import json |
| import tempfile |
| from pathlib import Path |
|
|
| |
| from biorlhf.data.ground_truth import ( |
| STRESSOR_EFFECTS, |
| KMP_EFFECTS, |
| TISSUE_TYPES, |
| OXPHOS_PATTERNS, |
| ) |
|
|
| |
| from biorlhf.data.dataset import create_sft_dataset |
|
|
|
|
def explore_ground_truth():
    """Print a four-part report of the imported ground-truth tables.

    Walks ``STRESSOR_EFFECTS``, ``KMP_EFFECTS``, ``TISSUE_TYPES`` and
    ``OXPHOS_PATTERNS`` in that order and writes one numbered section per
    table to stdout. Each table is iterated with ``.items()``; the first,
    second and fourth sections additionally index every entry by fixed
    string keys, so a missing key raises ``KeyError`` part-way through
    the report.

    Returns:
        None. The function only prints.
    """
    banner = "=" * 60
    divider = "-" * 40

    def open_section(heading):
        # Every numbered section starts with its title and a thin rule.
        print(heading)
        print(divider)

    print(banner)
    print("BioRLHF Ground Truth Data Explorer")
    print(banner)

    # Section 1: counts per stressor condition. The ``:,`` format spec
    # inserts thousands separators, so the values must be numeric.
    open_section("\n1. STRESSOR EFFECTS (DEG counts by tissue)")
    for tissue_name, deg_counts in STRESSOR_EFFECTS.items():
        print(f"\n{tissue_name}:")
        print(f" Hindlimb Unloading (HU): {deg_counts['HU']:,} DEGs")
        print(f" Ionizing Radiation (IR): {deg_counts['IR']:,} DEGs")
        print(f" Combined HU+IR: {deg_counts['HU_IR']:,} DEGs")

    # Section 2: KMP counts at baseline and under each stressor condition.
    open_section("\n\n2. KMP EFFECTS UNDER DIFFERENT CONDITIONS")
    for tissue_name, kmp_counts in KMP_EFFECTS.items():
        print(f"\n{tissue_name}:")
        print(f" Baseline: {kmp_counts['baseline']:,} DEGs")
        print(f" Under HU: {kmp_counts['in_HU']:,} DEGs")
        print(f" Under IR: {kmp_counts['in_IR']:,} DEGs")
        print(f" Under HU+IR: {kmp_counts['in_HU_IR']:,} DEGs")

    # Section 3: one line per tissue with its classification value.
    open_section("\n\n3. TISSUE CLASSIFICATIONS")
    for tissue_name, classification in TISSUE_TYPES.items():
        print(f" {tissue_name}: {classification}")

    # Section 4: the three OXPHOS fields stored for each tissue.
    open_section("\n\n4. OXPHOS PATHWAY PATTERNS")
    for tissue_name, oxphos in OXPHOS_PATTERNS.items():
        print(f"\n{tissue_name}:")
        print(f" Stress NES: {oxphos['stress_NES']}")
        print(f" KMP NES: {oxphos['KMP_NES']}")
        print(f" Pattern: {oxphos['pattern']}")
|
|
|
|
def create_example_dataset():
    """Generate a sample SFT dataset in a scratch directory and preview it.

    Calls ``create_sft_dataset`` with calibration and chain-of-thought
    examples enabled, writing to ``example_dataset.json`` inside a
    temporary directory. It then prints the number of examples, the output
    path, and the ``"text"`` field of the first three examples, each cut
    to 500 characters followed by ``"..."`` when longer.

    Returns:
        None. The function only prints.
    """
    banner = "=" * 60
    print("\n\n" + banner)
    print("Creating Example SFT Dataset")
    print(banner)

    # NOTE: TemporaryDirectory removes the directory when the with-block
    # exits, so the path printed below does not outlive this function.
    with tempfile.TemporaryDirectory() as scratch_dir:
        dataset_file = Path(scratch_dir) / "example_dataset.json"

        examples = create_sft_dataset(
            output_path=dataset_file,
            include_calibration=True,
            include_chain_of_thought=True,
        )

        print(f"\nDataset created with {len(examples)} examples")
        print(f"Saved to: {dataset_file}")

        print("\n\nSample Examples by Category:")
        print("-" * 40)

        # Preview only the first three entries, numbered from 1.
        for number, example in enumerate(examples[:3], start=1):
            print(f"\n--- Example {number} ---")
            body = example["text"]
            # Keep the console output short: cap each preview at 500 chars.
            preview = body[:500] + "..." if len(body) > 500 else body
            print(preview)
|
|
|
|
def main():
    """Run the quickstart: welcome text, both demos, then next-step hints.

    Prints a welcome banner and feature overview, calls
    ``explore_ground_truth()`` followed by ``create_example_dataset()``,
    and finishes with a "Next Steps" section that points at the training
    and evaluation example scripts.

    Returns:
        None. The function only prints and delegates.
    """
    banner = "=" * 60

    # Adjacent literals are concatenated at compile time; the text matches
    # the multi-line message line for line, including the leading and
    # trailing newline.
    overview = (
        "\n"
        "This quickstart demonstrates the BioRLHF framework for fine-tuning\n"
        "LLMs on biological reasoning tasks.\n"
        "\n"
        "Key features:\n"
        "- Ground truth data from KMP 2x2x2 factorial transcriptomic study\n"
        "- Automated SFT dataset generation\n"
        "- Support for factual, reasoning, and calibration examples\n"
    )
    next_steps = (
        "\n"
        "To train a model, see the full training examples:\n"
        "- examples/train_sft.py - Supervised fine-tuning\n"
        "- examples/evaluate_model.py - Model evaluation\n"
        "\n"
        "For GPU training, ensure you have:\n"
        "- CUDA-compatible GPU\n"
        "- torch with CUDA support\n"
        "- Sufficient VRAM (16GB+ recommended)\n"
    )

    print("\n" + banner)
    print("Welcome to BioRLHF!")
    print(banner)
    print(overview)

    # The two demos run back to back, in this order.
    explore_ground_truth()
    create_example_dataset()

    print("\n\n" + banner)
    print("Next Steps")
    print(banner)
    print(next_steps)
|
|
|
|
# Run the demonstration only when this file is executed as a script;
# importing it as a module defines the functions without printing anything.
if __name__ == "__main__":
    main()
|
|