# File size: 3,920 Bytes
# c7ebaa1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python3
"""
BioRLHF Quickstart Example

This script demonstrates the basic workflow for using BioRLHF:
1. Loading ground truth biological data
2. Creating an SFT dataset
3. Exploring the generated examples

Note: This example does not require a GPU and is safe to run locally.
"""

import json
import tempfile
from pathlib import Path

# Import ground truth data
from biorlhf.data.ground_truth import (
    STRESSOR_EFFECTS,
    KMP_EFFECTS,
    TISSUE_TYPES,
    OXPHOS_PATTERNS,
)

# Import dataset creation utilities
from biorlhf.data.dataset import create_sft_dataset


def explore_ground_truth():
    """Print a four-section tour of the imported ground truth tables.

    The sections are written to stdout in this order:

    1. ``STRESSOR_EFFECTS`` - per tissue, the ``HU``, ``IR`` and ``HU_IR``
       entries, formatted with thousands separators.
    2. ``KMP_EFFECTS`` - per tissue, the ``baseline``, ``in_HU``, ``in_IR``
       and ``in_HU_IR`` entries, formatted with thousands separators.
    3. ``TISSUE_TYPES`` - one ``tissue: type`` line per entry.
    4. ``OXPHOS_PATTERNS`` - per tissue, the ``stress_NES``, ``KMP_NES``
       and ``pattern`` entries.

    Each table is assumed to support ``.items()`` and to map a tissue name
    to a record holding the keys listed above (confirm against
    ``biorlhf.data.ground_truth``). Nothing is returned.
    """
    heavy_rule = "=" * 60
    light_rule = "-" * 40

    def open_section(heading):
        # Every numbered section starts with its heading and a light rule.
        print(heading)
        print(light_rule)

    # Report title.
    print(heavy_rule)
    print("BioRLHF Ground Truth Data Explorer")
    print(heavy_rule)

    open_section("\n1. STRESSOR EFFECTS (DEG counts by tissue)")
    for tissue_name, deg_counts in STRESSOR_EFFECTS.items():
        print(f"\n{tissue_name}:")
        print(f"  Hindlimb Unloading (HU): {deg_counts['HU']:,} DEGs")
        print(f"  Ionizing Radiation (IR): {deg_counts['IR']:,} DEGs")
        print(f"  Combined HU+IR: {deg_counts['HU_IR']:,} DEGs")

    open_section("\n\n2. KMP EFFECTS UNDER DIFFERENT CONDITIONS")
    for tissue_name, kmp_counts in KMP_EFFECTS.items():
        print(f"\n{tissue_name}:")
        print(f"  Baseline: {kmp_counts['baseline']:,} DEGs")
        print(f"  Under HU: {kmp_counts['in_HU']:,} DEGs")
        print(f"  Under IR: {kmp_counts['in_IR']:,} DEGs")
        print(f"  Under HU+IR: {kmp_counts['in_HU_IR']:,} DEGs")

    open_section("\n\n3. TISSUE CLASSIFICATIONS")
    for tissue_name, classification in TISSUE_TYPES.items():
        print(f"  {tissue_name}: {classification}")

    open_section("\n\n4. OXPHOS PATHWAY PATTERNS")
    for tissue_name, oxphos in OXPHOS_PATTERNS.items():
        print(f"\n{tissue_name}:")
        print(f"  Stress NES: {oxphos['stress_NES']}")
        print(f"  KMP NES: {oxphos['KMP_NES']}")
        print(f"  Pattern: {oxphos['pattern']}")


def create_example_dataset():
    """Build an SFT dataset in a scratch directory and preview it.

    Calls ``create_sft_dataset`` with ``include_calibration`` and
    ``include_chain_of_thought`` both enabled, targeting
    ``example_dataset.json`` inside a ``tempfile.TemporaryDirectory``.
    It then prints the number of examples, the output path, and the
    ``"text"`` field of the first three examples, each cut to 500
    characters with a trailing ``"..."`` when longer.

    The scratch directory is deleted when the ``with`` block exits, so the
    path printed as "Saved to" no longer exists once this function returns.
    Nothing is returned.
    """
    heavy_rule = "=" * 60

    print("\n\n" + heavy_rule)
    print("Creating Example SFT Dataset")
    print(heavy_rule)

    # Everything below runs while the scratch directory still exists.
    with tempfile.TemporaryDirectory() as scratch_dir:
        dataset_file = Path(scratch_dir) / "example_dataset.json"

        # Generate the dataset with every optional example type switched on.
        examples = create_sft_dataset(
            output_path=dataset_file,
            include_calibration=True,
            include_chain_of_thought=True,
        )

        print(f"\nDataset created with {len(examples)} examples")
        print(f"Saved to: {dataset_file}")

        print("\n\nSample Examples by Category:")
        print("-" * 40)

        # Preview only the leading three examples, numbered from 1.
        for number, example in enumerate(examples[:3], start=1):
            print(f"\n--- Example {number} ---")
            body = example["text"]
            # Keep the console readable: long texts are clipped for display.
            preview = body[:500] + "..." if len(body) > 500 else body
            print(preview)

def main():
    """Run the whole quickstart: greeting, both demos, then next steps.

    Prints a welcome banner and feature overview, calls
    ``explore_ground_truth()`` followed by ``create_example_dataset()``,
    and finishes with pointers to the training and evaluation examples.
    Takes no arguments and returns nothing.
    """
    heavy_rule = "=" * 60

    overview = """
This quickstart demonstrates the BioRLHF framework for fine-tuning
LLMs on biological reasoning tasks.

Key features:
- Ground truth data from KMP 2x2x2 factorial transcriptomic study
- Automated SFT dataset generation
- Support for factual, reasoning, and calibration examples
"""

    next_steps = """
To train a model, see the full training examples:
- examples/train_sft.py - Supervised fine-tuning
- examples/evaluate_model.py - Model evaluation

For GPU training, ensure you have:
- CUDA-compatible GPU
- torch with CUDA support
- Sufficient VRAM (16GB+ recommended)
"""

    # Greeting.
    print("\n" + heavy_rule)
    print("Welcome to BioRLHF!")
    print(heavy_rule)
    print(overview)

    # The two demonstrations, in the order the module docstring lists them.
    explore_ground_truth()
    create_example_dataset()

    # Closing pointers.
    print("\n\n" + heavy_rule)
    print("Next Steps")
    print(heavy_rule)
    print(next_steps)


if __name__ == "__main__":
    main()