BioRLHF / examples /quickstart.py
jang1563's picture
Initial commit: BioRLHF v0.1.0
c7ebaa1
#!/usr/bin/env python3
"""
BioRLHF Quickstart Example
This script demonstrates the basic workflow for using BioRLHF:
1. Loading ground truth biological data
2. Creating an SFT dataset
3. Exploring the generated examples
Note: This example does not require a GPU and is safe to run locally.
"""
import json
import tempfile
from pathlib import Path
# Import ground truth data
from biorlhf.data.ground_truth import (
STRESSOR_EFFECTS,
KMP_EFFECTS,
TISSUE_TYPES,
OXPHOS_PATTERNS,
)
# Import dataset creation utilities
from biorlhf.data.dataset import create_sft_dataset
def explore_ground_truth():
    """Print a four-part report of the imported ground truth tables.

    The sections appear in this order: stressor DEG counts
    (``STRESSOR_EFFECTS``), KMP DEG counts per condition (``KMP_EFFECTS``),
    tissue classifications (``TISSUE_TYPES``) and OXPHOS pathway patterns
    (``OXPHOS_PATTERNS``). All output goes to stdout; nothing is returned.
    """
    banner = "=" * 60
    rule = "-" * 40

    def start_section(heading):
        # Each numbered section opens with its heading and a dashed rule.
        print(heading)
        print(rule)

    print(banner)
    print("BioRLHF Ground Truth Data Explorer")
    print(banner)

    start_section("\n1. STRESSOR EFFECTS (DEG counts by tissue)")
    for name, degs in STRESSOR_EFFECTS.items():
        print(f"\n{name}:")
        # The ":," format spec adds thousands separators, so values must be numeric.
        print(f" Hindlimb Unloading (HU): {degs['HU']:,} DEGs")
        print(f" Ionizing Radiation (IR): {degs['IR']:,} DEGs")
        print(f" Combined HU+IR: {degs['HU_IR']:,} DEGs")

    start_section("\n\n2. KMP EFFECTS UNDER DIFFERENT CONDITIONS")
    for name, degs in KMP_EFFECTS.items():
        print(f"\n{name}:")
        print(f" Baseline: {degs['baseline']:,} DEGs")
        print(f" Under HU: {degs['in_HU']:,} DEGs")
        print(f" Under IR: {degs['in_IR']:,} DEGs")
        print(f" Under HU+IR: {degs['in_HU_IR']:,} DEGs")

    start_section("\n\n3. TISSUE CLASSIFICATIONS")
    for name, classification in TISSUE_TYPES.items():
        print(f" {name}: {classification}")

    start_section("\n\n4. OXPHOS PATHWAY PATTERNS")
    for name, entry in OXPHOS_PATTERNS.items():
        print(f"\n{name}:")
        print(f" Stress NES: {entry['stress_NES']}")
        print(f" KMP NES: {entry['KMP_NES']}")
        print(f" Pattern: {entry['pattern']}")
def create_example_dataset():
    """Build a demo SFT dataset in a scratch directory and preview it.

    Calls ``create_sft_dataset`` with calibration and chain-of-thought
    examples enabled, reports how many examples came back, and prints the
    ``"text"`` field of the first three (cut to 500 characters each).
    Output goes to stdout; nothing is returned.
    """
    banner = "=" * 60
    print("\n\n" + banner)
    print("Creating Example SFT Dataset")
    print(banner)

    # The scratch directory (and the JSON file in it) is removed as soon as
    # the ``with`` block exits, so the printed path is informational only.
    with tempfile.TemporaryDirectory() as scratch_dir:
        output_path = Path(scratch_dir) / "example_dataset.json"

        examples = create_sft_dataset(
            output_path=output_path,
            include_calibration=True,
            include_chain_of_thought=True,
        )

        print(f"\nDataset created with {len(examples)} examples")
        print(f"Saved to: {output_path}")

        print("\n\nSample Examples by Category:")
        print("-" * 40)

        # Preview only the first three examples, numbered from 1.
        for number, example in enumerate(examples[:3], start=1):
            print(f"\n--- Example {number} ---")
            full_text = example["text"]
            # Keep the console readable: cap each preview at 500 characters.
            preview = full_text if len(full_text) <= 500 else full_text[:500] + "..."
            print(preview)
def main():
    """Run the whole quickstart: welcome text, both demos, then next steps.

    Prints an introduction, calls ``explore_ground_truth()`` followed by
    ``create_example_dataset()``, and finishes with pointers to the training
    examples. Output goes to stdout; nothing is returned.
    """
    banner = "=" * 60

    intro = """
This quickstart demonstrates the BioRLHF framework for fine-tuning
LLMs on biological reasoning tasks.
Key features:
- Ground truth data from KMP 2x2x2 factorial transcriptomic study
- Automated SFT dataset generation
- Support for factual, reasoning, and calibration examples
"""
    next_steps = """
To train a model, see the full training examples:
- examples/train_sft.py - Supervised fine-tuning
- examples/evaluate_model.py - Model evaluation
For GPU training, ensure you have:
- CUDA-compatible GPU
- torch with CUDA support
- Sufficient VRAM (16GB+ recommended)
"""

    print("\n" + banner)
    print("Welcome to BioRLHF!")
    print(banner)
    print(intro)

    # Run the two demonstrations in order.
    explore_ground_truth()
    create_example_dataset()

    print("\n\n" + banner)
    print("Next Steps")
    print(banner)
    print(next_steps)
# Run the demonstration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()