jang1563
/

BioRLHF

Model card Files Files and versions

BioRLHF / examples /quickstart.py

jang1563's picture

Initial commit: BioRLHF v0.1.0

c7ebaa1 3 months ago

history blame contribute delete

3.92 kB

	#!/usr/bin/env python3
	"""
	BioRLHF Quickstart Example

	This script demonstrates the basic workflow for using BioRLHF:
	1. Loading ground truth biological data
	2. Creating an SFT dataset
	3. Exploring the generated examples

	Note: This example does not require a GPU and is safe to run locally.
	"""

	import json
	import tempfile
	from pathlib import Path

	# Import ground truth data
	from biorlhf.data.ground_truth import (
	STRESSOR_EFFECTS,
	KMP_EFFECTS,
	TISSUE_TYPES,
	OXPHOS_PATTERNS,
	)

	# Import dataset creation utilities
	from biorlhf.data.dataset import create_sft_dataset


	def explore_ground_truth():
	    """Print a console report of the imported ground truth tables.

	    Emits four numbered sections, in order: STRESSOR_EFFECTS, KMP_EFFECTS,
	    TISSUE_TYPES and OXPHOS_PATTERNS. Each table is iterated with
	    ``.items()``; the DEG values in the first two sections are rendered
	    with a thousands separator. Nothing is returned.
	    """
	    banner = "=" * 60
	    rule = "-" * 40

	    def heading(title):
	        # Every section opens with its title followed by a dashed rule.
	        print(title)
	        print(rule)

	    # (display label, lookup key) pairs, listed in print order.
	    stressor_rows = (
	        ("Hindlimb Unloading (HU)", "HU"),
	        ("Ionizing Radiation (IR)", "IR"),
	        ("Combined HU+IR", "HU_IR"),
	    )
	    kmp_rows = (
	        ("Baseline", "baseline"),
	        ("Under HU", "in_HU"),
	        ("Under IR", "in_IR"),
	        ("Under HU+IR", "in_HU_IR"),
	    )
	    oxphos_rows = (
	        ("Stress NES", "stress_NES"),
	        ("KMP NES", "KMP_NES"),
	        ("Pattern", "pattern"),
	    )

	    print(banner)
	    print("BioRLHF Ground Truth Data Explorer")
	    print(banner)

	    heading("\n1. STRESSOR EFFECTS (DEG counts by tissue)")
	    for name, counts in STRESSOR_EFFECTS.items():
	        print(f"\n{name}:")
	        for label, key in stressor_rows:
	            print(f" {label}: {counts[key]:,} DEGs")

	    heading("\n\n2. KMP EFFECTS UNDER DIFFERENT CONDITIONS")
	    for name, counts in KMP_EFFECTS.items():
	        print(f"\n{name}:")
	        for label, key in kmp_rows:
	            print(f" {label}: {counts[key]:,} DEGs")

	    heading("\n\n3. TISSUE CLASSIFICATIONS")
	    for name, category in TISSUE_TYPES.items():
	        print(f" {name}: {category}")

	    heading("\n\n4. OXPHOS PATHWAY PATTERNS")
	    for name, entry in OXPHOS_PATTERNS.items():
	        print(f"\n{name}:")
	        for label, key in oxphos_rows:
	            print(f" {label}: {entry[key]}")


	def create_example_dataset():
	    """Generate an SFT dataset in a scratch directory and preview it.

	    Calls ``create_sft_dataset`` with calibration and chain-of-thought
	    examples enabled, writing to ``example_dataset.json`` inside a
	    temporary directory. Reports the number of examples returned and
	    prints the ``"text"`` field of the first three, each clipped to 500
	    characters. The temporary directory is removed when the ``with``
	    block exits, so the printed path does not outlive this call.
	    Nothing is returned.
	    """
	    banner = "=" * 60
	    preview_limit = 500

	    print("\n\n" + banner)
	    print("Creating Example SFT Dataset")
	    print(banner)

	    # A scratch directory keeps the demo from leaving files behind.
	    with tempfile.TemporaryDirectory() as scratch:
	        dataset_file = Path(scratch) / "example_dataset.json"

	        records = create_sft_dataset(
	            output_path=dataset_file,
	            include_calibration=True,
	            include_chain_of_thought=True,
	        )

	        print(f"\nDataset created with {len(records)} examples")
	        print(f"Saved to: {dataset_file}")

	        print("\n\nSample Examples by Category:")
	        print("-" * 40)

	        # Only the first three records are previewed.
	        for number, record in enumerate(records[:3], start=1):
	            print(f"\n--- Example {number} ---")
	            body = record["text"]
	            # Clip long text so the console output stays readable.
	            preview = (
	                body[:preview_limit] + "..."
	                if len(body) > preview_limit
	                else body
	            )
	            print(preview)


	def main():
	    """Drive the quickstart: welcome text, both demos, then next steps.

	    Prints a welcome banner and feature overview, calls
	    ``explore_ground_truth()`` followed by ``create_example_dataset()``,
	    and finishes with pointers to the training examples. Nothing is
	    returned.
	    """
	    banner = "=" * 60

	    overview = """
	This quickstart demonstrates the BioRLHF framework for fine-tuning
	LLMs on biological reasoning tasks.

	Key features:
	- Ground truth data from KMP 2x2x2 factorial transcriptomic study
	- Automated SFT dataset generation
	- Support for factual, reasoning, and calibration examples
	"""

	    next_steps = """
	To train a model, see the full training examples:
	- examples/train_sft.py - Supervised fine-tuning
	- examples/evaluate_model.py - Model evaluation

	For GPU training, ensure you have:
	- CUDA-compatible GPU
	- torch with CUDA support
	- Sufficient VRAM (16GB+ recommended)
	"""

	    print("\n" + banner)
	    print("Welcome to BioRLHF!")
	    print(banner)
	    print(overview)

	    # The two demonstrations run back to back, data tour first.
	    explore_ground_truth()
	    create_example_dataset()

	    print("\n\n" + banner)
	    print("Next Steps")
	    print(banner)
	    print(next_steps)


	# Script entry point: run the demo only when this file is executed directly,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()