File size: 3,920 Bytes

c7ebaa1

#!/usr/bin/env python3
"""
BioRLHF Quickstart Example

This script demonstrates the basic workflow for using BioRLHF:
1. Loading ground truth biological data
2. Creating an SFT dataset
3. Exploring the generated examples

Note: This example does not require a GPU and is safe to run locally.
"""

import json
import tempfile
from pathlib import Path

# Import ground truth data
from biorlhf.data.ground_truth import (
    STRESSOR_EFFECTS,
    KMP_EFFECTS,
    TISSUE_TYPES,
    OXPHOS_PATTERNS,
)

# Import dataset creation utilities
from biorlhf.data.dataset import create_sft_dataset


def explore_ground_truth():
    """Print the imported ground truth tables to stdout, section by section.

    Four numbered sections are emitted in order: stressor DEG counts,
    KMP DEG counts, tissue classifications, and OXPHOS pathway patterns.
    The function only prints; it returns nothing.
    """
    heavy_rule = "=" * 60
    light_rule = "-" * 40

    print(heavy_rule)
    print("BioRLHF Ground Truth Data Explorer")
    print(heavy_rule)

    # Section 1: one line per stressor condition, with thousands separators.
    stressor_rows = (
        ("Hindlimb Unloading (HU)", "HU"),
        ("Ionizing Radiation (IR)", "IR"),
        ("Combined HU+IR", "HU_IR"),
    )
    print("\n1. STRESSOR EFFECTS (DEG counts by tissue)")
    print(light_rule)
    for tissue_name, counts in STRESSOR_EFFECTS.items():
        print(f"\n{tissue_name}:")
        for label, key in stressor_rows:
            print(f"  {label}: {counts[key]:,} DEGs")

    # Section 2: same layout, keyed by the condition KMP was measured under.
    kmp_rows = (
        ("Baseline", "baseline"),
        ("Under HU", "in_HU"),
        ("Under IR", "in_IR"),
        ("Under HU+IR", "in_HU_IR"),
    )
    print("\n\n2. KMP EFFECTS UNDER DIFFERENT CONDITIONS")
    print(light_rule)
    for tissue_name, counts in KMP_EFFECTS.items():
        print(f"\n{tissue_name}:")
        for label, key in kmp_rows:
            print(f"  {label}: {counts[key]:,} DEGs")

    # Section 3: flat tissue -> classification listing.
    print("\n\n3. TISSUE CLASSIFICATIONS")
    print(light_rule)
    for tissue_name, classification in TISSUE_TYPES.items():
        print(f"  {tissue_name}: {classification}")

    # Section 4: values are printed as-is (no numeric formatting applied).
    oxphos_rows = (
        ("Stress NES", "stress_NES"),
        ("KMP NES", "KMP_NES"),
        ("Pattern", "pattern"),
    )
    print("\n\n4. OXPHOS PATHWAY PATTERNS")
    print(light_rule)
    for tissue_name, entry in OXPHOS_PATTERNS.items():
        print(f"\n{tissue_name}:")
        for label, key in oxphos_rows:
            print(f"  {label}: {entry[key]}")


def create_example_dataset():
    """Build an SFT dataset in a scratch directory and preview its first items.

    Calls ``create_sft_dataset`` with calibration and chain-of-thought
    examples enabled, reports how many examples came back, and prints the
    ``"text"`` field of the first three (cut to 500 characters each).
    The function only prints; it returns nothing.
    """
    rule = "=" * 60
    print("\n\n" + rule)
    print("Creating Example SFT Dataset")
    print(rule)

    # The scratch directory (and anything written into it) is removed when
    # the ``with`` block exits, so the printed path is only valid inside it.
    with tempfile.TemporaryDirectory() as scratch_dir:
        output_path = Path(scratch_dir) / "example_dataset.json"

        examples = create_sft_dataset(
            output_path=output_path,
            include_calibration=True,
            include_chain_of_thought=True,
        )

        print(f"\nDataset created with {len(examples)} examples")
        print(f"Saved to: {output_path}")

        print("\n\nSample Examples by Category:")
        print("-" * 40)

        # Preview only the leading three entries, numbered from 1.
        for number, example in enumerate(examples[:3], start=1):
            print(f"\n--- Example {number} ---")
            body = example["text"]
            # Keep the console readable: clip anything past 500 characters.
            preview = body[:500] + "..." if len(body) > 500 else body
            print(preview)


def main():
    """Entry point: print a welcome banner, run both demos, then list next steps."""
    rule = "=" * 60

    intro = """
This quickstart demonstrates the BioRLHF framework for fine-tuning
LLMs on biological reasoning tasks.

Key features:
- Ground truth data from KMP 2x2x2 factorial transcriptomic study
- Automated SFT dataset generation
- Support for factual, reasoning, and calibration examples
"""

    next_steps = """
To train a model, see the full training examples:
- examples/train_sft.py - Supervised fine-tuning
- examples/evaluate_model.py - Model evaluation

For GPU training, ensure you have:
- CUDA-compatible GPU
- torch with CUDA support
- Sufficient VRAM (16GB+ recommended)
"""

    # Welcome banner and overview.
    print("\n" + rule)
    print("Welcome to BioRLHF!")
    print(rule)
    print(intro)

    # The two demonstrations, in the order the module docstring describes.
    explore_ground_truth()
    create_example_dataset()

    # Closing pointers to the training and evaluation examples.
    print("\n\n" + rule)
    print("Next Steps")
    print(rule)
    print(next_steps)


# Run the demo only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()