jang1563 and Claude Opus 4.6 committed on
Commit bff2f94 · 1 Parent(s): c7ebaa1

Add BioGRPO training pipeline with composable biological verifiers

Implements GRPO (Group Relative Policy Optimization) with four composable
biological verifiers (V1-V4) as reward functions:
- V1: Pathway direction verification against GeneLab fGSEA ground truth
- V2: Biological fact checking with keyword/entity validation
- V3: Cross-study consistency verification
- V4: Uncertainty calibration with ECE/Brier scoring

Key components:
- Verifier stack with weighted composition and per-question-type routing
- GRPO dataset builder merging GeneLab, BioEval, and SpaceOmicsBench sources
- GeneLab data loader with pathway enrichment score integration
- Calibration evaluation metrics (ECE, MCE, Brier, reliability diagrams)
- CLI entry point (biorlhf-grpo) with JSON config support
- SLURM scripts for Cayuga HPC (MVE and full experiment configs)
- Post-training evaluation script with SFT baseline comparison

Bug fixes applied during deployment:
- Tokenizer loading from base model (not adapter directory)
- LoRA adapter detection and SFT merge before GRPO training
- QLoRA 4-bit quantization support
- Lazy imports to avoid circular torch dependencies

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
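The weighted composition described above can be sketched as a GRPO-style reward function: each verifier scores a sampled completion in [0, 1] and the reward is the weight-normalized sum. This is a minimal illustration under assumed names and signatures, not the repository's actual `VerifierComposer` API.

```python
from typing import Callable, Dict, List

# Hypothetical verifier signature: (prompt, completion, metadata) -> score in [0, 1]
Verifier = Callable[[str, str, dict], float]

def compose_rewards(
    verifiers: Dict[str, Verifier],
    weights: Dict[str, float],
    prompt: str,
    completions: List[str],
    meta: dict,
) -> List[float]:
    """Weighted sum of per-verifier scores for each sampled completion."""
    total = sum(weights[name] for name in verifiers)
    rewards = []
    for completion in completions:
        score = sum(
            weights[name] * fn(prompt, completion, meta)
            for name, fn in verifiers.items()
        )
        rewards.append(score / total)  # normalize so rewards stay in [0, 1]
    return rewards
```

With the MVE weights below (`V1: 0.6, V4: 0.4`), a completion scoring 1.0 on V1 and 0.5 on V4 would receive reward 0.8.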

configs/grpo_full.json ADDED
@@ -0,0 +1,45 @@
+ {
+   "model_name": "mistralai/Mistral-7B-v0.3",
+   "sft_model_path": "./kmp_sft_model_final",
+   "output_dir": "./biogrpo_full_model",
+
+   "num_generations": 8,
+   "beta": 0.04,
+   "num_iterations": 1,
+   "scale_rewards": "group",
+   "loss_type": "grpo",
+
+   "num_epochs": 2,
+   "batch_size": 1,
+   "gradient_accumulation_steps": 8,
+   "learning_rate": 5e-7,
+   "max_completion_length": 1024,
+   "max_prompt_length": 512,
+   "warmup_ratio": 0.1,
+
+   "lora_r": 32,
+   "lora_alpha": 64,
+   "lora_dropout": 0.05,
+
+   "active_verifiers": ["V1", "V2", "V3", "V4"],
+   "verifier_weights": {"V1": 0.35, "V2": 0.30, "V3": 0.15, "V4": 0.20},
+
+   "pathway_db": "hallmark",
+   "hold_out_tissues": ["eye", "thymus"],
+   "seed": 42,
+
+   "use_4bit": true,
+
+   "wandb_project": "biogrpo",
+   "wandb_run_name": "grpo_full_all_verifiers",
+   "use_wandb": true,
+   "logging_steps": 10,
+   "save_steps": 50,
+   "eval_steps": 50,
+   "save_total_limit": 3,
+   "log_completions": true,
+
+   "use_vllm": false,
+   "gradient_checkpointing": true,
+   "bf16": true
+ }
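The V4 reward scores uncertainty calibration with expected calibration error (ECE). A minimal binned-ECE sketch for intuition — a standalone illustration, not the `biorlhf.evaluation.calibration` implementation:

```python
import numpy as np

def expected_calibration_error(confidences, correct, n_bins=10):
    """Binned ECE: bin-weight-averaged |accuracy - mean confidence| over equal-width bins."""
    confidences = np.asarray(confidences, dtype=float)
    correct = np.asarray(correct, dtype=float)
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(edges[:-1], edges[1:]):
        # include the left edge only for the first bin
        mask = (confidences > lo) & (confidences <= hi)
        if lo == 0.0:
            mask |= confidences == 0.0
        if mask.any():
            acc = correct[mask].mean()
            conf = confidences[mask].mean()
            ece += mask.mean() * abs(acc - conf)
    return ece
```

A model that claims 80% confidence and is right 4 times out of 5 has ECE 0; one that claims 90% and is always wrong has ECE 0.9.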
configs/grpo_mve.json ADDED
@@ -0,0 +1,45 @@
+ {
+   "model_name": "mistralai/Mistral-7B-v0.3",
+   "sft_model_path": "./kmp_sft_model_final",
+   "output_dir": "./biogrpo_mve_model",
+
+   "num_generations": 4,
+   "beta": 0.04,
+   "num_iterations": 1,
+   "scale_rewards": "group",
+   "loss_type": "grpo",
+
+   "num_epochs": 3,
+   "batch_size": 2,
+   "gradient_accumulation_steps": 4,
+   "learning_rate": 1e-6,
+   "max_completion_length": 512,
+   "max_prompt_length": 384,
+   "warmup_ratio": 0.1,
+
+   "lora_r": 32,
+   "lora_alpha": 64,
+   "lora_dropout": 0.05,
+
+   "active_verifiers": ["V1", "V4"],
+   "verifier_weights": {"V1": 0.6, "V4": 0.4},
+
+   "pathway_db": "hallmark",
+   "hold_out_tissues": ["eye"],
+   "seed": 42,
+
+   "use_4bit": true,
+
+   "wandb_project": "biogrpo",
+   "wandb_run_name": "grpo_mve_v1v4",
+   "use_wandb": true,
+   "logging_steps": 5,
+   "save_steps": 25,
+   "eval_steps": 25,
+   "save_total_limit": 3,
+   "log_completions": true,
+
+   "use_vllm": false,
+   "gradient_checkpointing": true,
+   "bf16": true
+ }
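The MVE and full configs trade per-step batch size against accumulation but keep the same effective batch size. A quick sanity check, using the values from the two JSON configs above:

```python
# Values copied from configs/grpo_mve.json and configs/grpo_full.json above.
mve = {"batch_size": 2, "gradient_accumulation_steps": 4}
full = {"batch_size": 1, "gradient_accumulation_steps": 8}

def effective_batch(cfg: dict) -> int:
    """Effective per-device batch size per optimizer step."""
    return cfg["batch_size"] * cfg["gradient_accumulation_steps"]

print(effective_batch(mve), effective_batch(full))  # 8 8
```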
pyproject.toml CHANGED
@@ -43,7 +43,7 @@ dependencies = [
    "datasets>=2.14.0",
    "accelerate>=0.24.0",
    "peft>=0.6.0",
-   "trl>=0.7.0",
+   "trl>=0.14.0",
    "bitsandbytes>=0.41.0",
    "wandb>=0.15.0",
    "pandas>=2.0.0",
@@ -76,6 +76,7 @@ Issues = "https://github.com/jang1563/BioRLHF/issues"
  [project.scripts]
  biorlhf-train = "biorlhf.cli:train"
  biorlhf-evaluate = "biorlhf.cli:evaluate"
+ biorlhf-grpo = "biorlhf.cli:grpo_train"

  [tool.hatch.build.targets.sdist]
  include = [
scripts/HPC_TRAINING_GUIDE.md ADDED
@@ -0,0 +1,253 @@
+ # BioRLHF Training on Cayuga HPC (Interactive Session)
+
+ **Cluster:** Cornell Cayuga HPC
+ **Target:** GPU training with Mistral-7B + LoRA
+
+ ---
+
+ ## Quick Start (Copy-Paste Commands)
+
+ ```bash
+ # 1. Start interactive GPU session (A100 recommended, 80GB VRAM)
+ srun -p scu-gpu --gres=gpu:a100:1 --mem=48G -c 8 --time=4:00:00 --pty bash
+
+ # 2. Set up environment (first time only - see Step 2 below)
+
+ # 3. Run training
+ cd /athena/cayuga_XXXX/scratch/$USER/BioRLHF/biorlhf
+ ./scripts/train_ecosystem_improved.sh
+ ```
+
+ ---
+
+ ## Step 1: Transfer Files to HPC
+
+ From your local Mac:
+
+ ```bash
+ # Replace with your actual paths and CWID
+ rsync -avz --progress \
+     /Users/jak4013/Dropbox/Bioinformatics/Claude/BioRLHF \
+     YOUR_CWID@cayuga.cac.cornell.edu:/athena/cayuga_XXXX/scratch/$USER/
+ ```
+
+ Or use scp:
+
+ ```bash
+ scp -r /Users/jak4013/Dropbox/Bioinformatics/Claude/BioRLHF \
+     YOUR_CWID@cayuga.cac.cornell.edu:/athena/cayuga_XXXX/scratch/$USER/
+ ```
+
+ ---
+
+ ## Step 2: Set Up Conda Environment (First Time Only)
+
+ ### 2a. Start Interactive Session
+
+ ```bash
+ # SSH to Cayuga
+ ssh YOUR_CWID@cayuga.cac.cornell.edu
+
+ # Request interactive GPU session
+ srun -p scu-gpu --gres=gpu:a100:1 --mem=48G -c 8 --time=2:00:00 --pty bash
+ ```
+
+ ### 2b. Install Miniconda (if not already installed)
+
+ ```bash
+ # Create directory in scratch space
+ mkdir -p /athena/cayuga_XXXX/scratch/$USER/miniconda3
+
+ # Download and install
+ wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
+ bash miniconda.sh -b -u -p /athena/cayuga_XXXX/scratch/$USER/miniconda3
+ rm miniconda.sh
+
+ # Initialize conda
+ source /athena/cayuga_XXXX/scratch/$USER/miniconda3/bin/activate
+ conda init bash
+ source ~/.bashrc
+ ```
+
+ ### 2c. Create BioRLHF Environment
+
+ ```bash
+ # Create environment with Python 3.10 (best compatibility)
+ conda create -n biorlhf python=3.10 -y
+ conda activate biorlhf
+
+ # Install PyTorch with CUDA support
+ conda install pytorch pytorch-cuda=12.1 -c pytorch -c nvidia -y
+
+ # Install training dependencies (quote the version specs so bash
+ # does not parse ">=" as a redirect)
+ pip install "transformers>=4.36.0"
+ pip install "peft>=0.7.0"
+ pip install "trl>=0.7.0"
+ pip install "bitsandbytes>=0.41.0"
+ pip install "accelerate>=0.25.0"
+ pip install "datasets>=2.14.0"
+ pip install wandb
+ pip install scipy
+ pip install sentencepiece
+
+ # Verify GPU access
+ python -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}'); print(f'GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else None}')"
+ ```
+
+ ---
+
+ ## Step 3: Run Training (Interactive)
+
+ ### 3a. Start GPU Session
+
+ ```bash
+ # Request A100 GPU (80GB - best for Mistral-7B)
+ srun -p scu-gpu --gres=gpu:a100:1 --mem=48G -c 8 --time=4:00:00 --pty bash
+
+ # Or use A40 (48GB - also works with 4-bit quantization)
+ srun -p scu-gpu --gres=gpu:a40:1 --mem=48G -c 8 --time=4:00:00 --pty bash
+ ```
+
+ ### 3b. Activate Environment and Run
+
+ ```bash
+ # Activate conda
+ source /athena/cayuga_XXXX/scratch/$USER/miniconda3/bin/activate
+ conda activate biorlhf
+
+ # Navigate to BioRLHF
+ cd /athena/cayuga_XXXX/scratch/$USER/BioRLHF/biorlhf
+
+ # Check GPU is available
+ nvidia-smi
+
+ # Set HuggingFace cache (optional - saves space)
+ export HF_HOME=/athena/cayuga_XXXX/scratch/$USER/.cache/huggingface
+
+ # Run training
+ ./scripts/train_ecosystem_improved.sh
+ ```
+
+ ---
+
+ ## Step 4: Monitor Training
+
+ In a separate terminal (or use tmux/screen):
+
+ ```bash
+ # Watch GPU usage
+ watch -n 1 nvidia-smi
+
+ # Tail training logs
+ tail -f logs/biorlhf_ecosystem_*.out
+ ```
+
+ ### Using WandB (Optional)
+
+ ```bash
+ # Login to Weights & Biases
+ wandb login
+
+ # Training will automatically log to: https://wandb.ai/YOUR_USERNAME/biorlhf
+ ```
+
+ ---
+
+ ## GPU Options on Cayuga
+
+ | GPU Type | VRAM | Recommended For | Command |
+ |----------|------|-----------------|---------|
+ | A100 | 80GB | Full training, larger batches | `--gres=gpu:a100:1` |
+ | A40 | 48GB | Standard training with 4-bit | `--gres=gpu:a40:1` |
+ | H100 | 80GB | Fastest (if available) | `--gres=gpu:h100:1` |
+
+ ---
+
+ ## Troubleshooting
+
+ ### "CUDA out of memory"
+
+ Reduce the batch size in the training script:
+
+ ```bash
+ # Edit train_ecosystem_improved.sh
+ BATCH_SIZE=2   # Reduce from 4
+ GRAD_ACCUM=8   # Increase to maintain effective batch size
+ ```
+
+ ### "No GPU available"
+
+ ```bash
+ # Check GPU allocation
+ nvidia-smi
+
+ # Verify CUDA installation
+ python -c "import torch; print(torch.cuda.is_available())"
+
+ # Check if you're on a GPU node
+ squeue -u $USER
+ ```
+
+ ### "Module not found"
+
+ ```bash
+ # Ensure conda environment is activated
+ conda activate biorlhf
+
+ # Reinstall missing package
+ pip install <missing_package>
+ ```
+
+ ### Interactive session times out
+
+ Use `tmux` or `screen` to persist sessions:
+
+ ```bash
+ # Start tmux before srun
+ tmux new -s training
+
+ # Then request GPU
+ srun -p scu-gpu --gres=gpu:a100:1 --mem=48G -c 8 --time=4:00:00 --pty bash
+
+ # Detach: Ctrl+B, then D
+ # Reattach: tmux attach -t training
+ ```
+
+ ---
+
+ ## Expected Training Time
+
+ | Configuration | Dataset Size | Estimated Time |
+ |--------------|--------------|----------------|
+ | A100 + 4-bit | 378 examples, 10 epochs | ~45-60 min |
+ | A40 + 4-bit | 378 examples, 10 epochs | ~60-90 min |
+ | A100 (full) | 378 examples, 10 epochs | ~90-120 min |
+
+ ---
+
+ ## After Training
+
+ ### Copy model back to local machine
+
+ ```bash
+ # From your Mac
+ scp -r YOUR_CWID@cayuga.cac.cornell.edu:/athena/cayuga_XXXX/scratch/$USER/BioRLHF/biorlhf/ecosystem_improved_model \
+     /Users/jak4013/Dropbox/Bioinformatics/Claude/BioRLHF/biorlhf/
+ ```
+
+ ### Run evaluation
+
+ ```bash
+ python evaluate_model.py --model ecosystem_improved_model
+ ```
+
+ ---
+
+ ## Complete Interactive Session Example
+
+ ```bash
+ # SSH to Cayuga
+ ssh jk2042@cayuga.cac.cornell.edu
+
+ # Start tmux (optional but recommended)
+ tmux new -s biorlhf
+
+ # Request GPU
+ srun -p scu-gpu --gres=gpu:a100:1 --mem=48G -c 8 --time=4:00:00 --pty bash
+
+ # Set up environment
+ source ~/miniconda3/bin/activate
+ conda activate biorlhf
+
+ # Navigate and run
+ cd /athena/cayuga_XXXX/scratch/$USER/BioRLHF/biorlhf
+ ./scripts/train_ecosystem_improved.sh
+
+ # Watch progress (in another terminal, or after Ctrl+B, c for a new window)
+ watch -n 5 nvidia-smi
+ ```
scripts/deploy_to_cayuga.sh ADDED
@@ -0,0 +1,108 @@
+ #!/bin/bash
+ # ============================================================
+ # Deploy BioRLHF code + data to Cayuga HPC
+ # Run from local Mac
+ # ============================================================
+
+ set -e
+
+ REMOTE="cayuga-login1"
+ SCRATCH="/athena/cayuga_0003/scratch/users/jak4013/otsuka"
+ LOCAL_BASE="$HOME/Dropbox/Bioinformatics/Claude"
+
+ echo "============================================================"
+ echo "BioRLHF Cayuga Deployment"
+ echo "============================================================"
+
+ # Step 1: Create directories on Cayuga
+ echo ""
+ echo "[1/4] Creating directories on Cayuga..."
+ ssh ${REMOTE} "mkdir -p ${SCRATCH}/training/BioRLHF ${SCRATCH}/data/GeneLab_benchmark ${SCRATCH}/data/BioEval ${SCRATCH}/data/SpaceOmicsBench/v3/evaluation"
+
+ # Step 2: Transfer BioRLHF code (only essential files)
+ echo ""
+ echo "[2/4] Transferring BioRLHF code..."
+ LOCAL_BIORLHF="${LOCAL_BASE}/BioRLHF/biorlhf"
+ DEST="${REMOTE}:${SCRATCH}/training/BioRLHF"
+
+ # Transfer only the package structure needed for GRPO
+ rsync -avz --progress \
+     "${LOCAL_BIORLHF}/src/" \
+     ${DEST}/src/
+
+ rsync -avz --progress \
+     "${LOCAL_BIORLHF}/configs/" \
+     ${DEST}/configs/
+
+ rsync -avz --progress \
+     "${LOCAL_BIORLHF}/scripts/" \
+     ${DEST}/scripts/
+
+ rsync -avz --progress \
+     "${LOCAL_BIORLHF}/tests/" \
+     ${DEST}/tests/
+
+ rsync -avz --progress \
+     "${LOCAL_BIORLHF}/pyproject.toml" \
+     "${LOCAL_BIORLHF}/README.md" \
+     ${DEST}/
+
+ # Step 3: Transfer data (only what GRPO training needs)
+ echo ""
+ echo "[3/4] Transferring data..."
+
+ echo "  GeneLab fgsea (pathway enrichment scores - required)..."
+ rsync -avz --progress \
+     "${LOCAL_BASE}/GeneLab_benchmark/processed/fgsea/" \
+     ${REMOTE}:${SCRATCH}/data/GeneLab_benchmark/processed/fgsea/
+
+ echo "  GeneLab evaluation (NES conservation - for conservation questions)..."
+ rsync -avz --progress \
+     "${LOCAL_BASE}/GeneLab_benchmark/evaluation/" \
+     ${REMOTE}:${SCRATCH}/data/GeneLab_benchmark/evaluation/
+
+ echo "  BioEval data..."
+ rsync -avz --progress \
+     "${LOCAL_BASE}/Evaluation_model/BioEval/data/" \
+     ${REMOTE}:${SCRATCH}/data/BioEval/data/
+
+ echo "  BioEval scoring (for calibration imports)..."
+ rsync -avz --progress \
+     "${LOCAL_BASE}/Evaluation_model/BioEval/bioeval/" \
+     ${REMOTE}:${SCRATCH}/data/BioEval/bioeval/
+
+ echo "  SpaceOmicsBench..."
+ rsync -avz --progress \
+     "${LOCAL_BASE}/SpaceOmicsBench/v3/evaluation/llm/" \
+     ${REMOTE}:${SCRATCH}/data/SpaceOmicsBench/v3/evaluation/llm/
+
+ # Step 4: Verify
+ echo ""
+ echo "[4/4] Verifying deployment..."
+ ssh ${REMOTE} "
+     echo 'Directory structure:'
+     echo '  BioRLHF code:'
+     ls ${SCRATCH}/training/BioRLHF/pyproject.toml 2>/dev/null && echo '    pyproject.toml: OK' || echo '    pyproject.toml: MISSING'
+     ls ${SCRATCH}/training/BioRLHF/configs/grpo_mve.json 2>/dev/null && echo '    configs/grpo_mve.json: OK' || echo '    configs/grpo_mve.json: MISSING'
+     ls -d ${SCRATCH}/training/BioRLHF/src/biorlhf/ 2>/dev/null && echo '    src/biorlhf/: OK' || echo '    src/biorlhf/: MISSING'
+
+     echo '  SFT checkpoint:'
+     ls -d ${SCRATCH}/training/biorlhf/kmp_sft_model_final/ 2>/dev/null && echo '    kmp_sft_model_final: OK' || echo '    kmp_sft_model_final: MISSING'
+
+     echo '  Data:'
+     ls ${SCRATCH}/data/GeneLab_benchmark/processed/fgsea/ 2>/dev/null | head -3 && echo '    GeneLab fgsea: OK' || echo '    GeneLab fgsea: MISSING'
+     ls ${SCRATCH}/data/GeneLab_benchmark/evaluation/ 2>/dev/null | head -3 && echo '    GeneLab evaluation: OK' || echo '    GeneLab evaluation: MISSING'
+     ls ${SCRATCH}/data/BioEval/data/ 2>/dev/null | head -3 && echo '    BioEval: OK' || echo '    BioEval: MISSING'
+     ls ${SCRATCH}/data/SpaceOmicsBench/v3/evaluation/llm/ 2>/dev/null | head -3 && echo '    SpaceOmicsBench: OK' || echo '    SpaceOmicsBench: MISSING'
+ "
+
+ echo ""
+ echo "============================================================"
+ echo "Deployment complete!"
+ echo ""
+ echo "Next steps on Cayuga:"
+ echo "  ssh ${REMOTE}"
+ echo "  cd ${SCRATCH}/training/BioRLHF"
+ echo "  bash scripts/setup_cayuga_grpo.sh"
+ echo "  sbatch scripts/run_grpo_mve.sh"
+ echo "============================================================"
scripts/evaluate_ecosystem_model.py ADDED
@@ -0,0 +1,412 @@
+ #!/usr/bin/env python3
+ """
+ Evaluate Ecosystem-Improved Model on Failure Patterns
+
+ This script evaluates the fine-tuned model specifically on the patterns
+ it was trained to improve:
+ - Calibration (uncertainty expression)
+ - Adversarial resistance
+ - Protocol completeness
+ - Fact recall
+
+ Usage (on HPC with GPU):
+     python scripts/evaluate_ecosystem_model.py --model ./ecosystem_improved_model
+
+ Requirements:
+     - CUDA GPU
+     - transformers, peft, bitsandbytes, torch
+ """
+
+ import argparse
+ import json
+ import torch
+ from pathlib import Path
+ from datetime import datetime
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+ from peft import PeftModel
+
+
+ def load_model(model_path: str, base_model: str = "mistralai/Mistral-7B-v0.3", use_4bit: bool = True):
+     """Load the fine-tuned model with LoRA adapters."""
+     print(f"Loading base model: {base_model}")
+
+     if use_4bit:
+         bnb_config = BitsAndBytesConfig(
+             load_in_4bit=True,
+             bnb_4bit_quant_type="nf4",
+             bnb_4bit_compute_dtype=torch.float16,
+             bnb_4bit_use_double_quant=True,
+         )
+         model = AutoModelForCausalLM.from_pretrained(
+             base_model,
+             quantization_config=bnb_config,
+             device_map="auto",
+             trust_remote_code=True,
+         )
+     else:
+         model = AutoModelForCausalLM.from_pretrained(
+             base_model,
+             device_map="auto",
+             torch_dtype=torch.float16,
+             trust_remote_code=True,
+         )
+
+     print(f"Loading LoRA adapters from: {model_path}")
+     model = PeftModel.from_pretrained(model, model_path)
+
+     tokenizer = AutoTokenizer.from_pretrained(model_path)
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     return model, tokenizer
+
+
+ def generate_response(model, tokenizer, prompt: str, max_new_tokens: int = 512) -> str:
+     """Generate a response from the model."""
+     formatted_prompt = f"### Instruction:\n{prompt}\n\n### Response:\n"
+
+     inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
+
+     with torch.no_grad():
+         outputs = model.generate(
+             **inputs,
+             max_new_tokens=max_new_tokens,
+             temperature=0.1,
+             top_p=0.9,
+             do_sample=True,
+             pad_token_id=tokenizer.pad_token_id,
+         )
+
+     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+     # Extract just the response part
+     if "### Response:" in response:
+         response = response.split("### Response:")[-1].strip()
+
+     return response
+
+
+ def evaluate_calibration(model, tokenizer, test_cases: list) -> dict:
+     """Evaluate uncertainty calibration - does model admit when it doesn't know?"""
+     print("\n" + "=" * 60)
+     print("EVALUATING: Calibration (Uncertainty Expression)")
+     print("=" * 60)
+
+     results = []
+     calibration_phrases = [
+         "cannot determine",
+         "not provided",
+         "not included",
+         "would need",
+         "don't have",
+         "doesn't include",
+         "cannot say",
+         "uncertain",
+         "not specified",
+         "additional information",
+         "not in the data",
+     ]
+
+     for i, case in enumerate(test_cases):
+         question = case["conversations"][0]["content"]
+         expected = case["conversations"][1]["content"]
+
+         print(f"\n[{i+1}] Testing: {question[:60]}...")
+         response = generate_response(model, tokenizer, question)
+
+         # Check if response expresses appropriate uncertainty
+         expresses_uncertainty = any(phrase.lower() in response.lower() for phrase in calibration_phrases)
+
+         result = {
+             "id": case["id"],
+             "question": question,
+             "expected_behavior": "Express uncertainty about unavailable data",
+             "response": response[:500],
+             "expresses_uncertainty": expresses_uncertainty,
+             "passed": expresses_uncertainty,
+         }
+         results.append(result)
+
+         status = "PASS" if expresses_uncertainty else "FAIL"
+         print(f"  [{status}] Uncertainty expressed: {expresses_uncertainty}")
+
+     passed = sum(1 for r in results if r["passed"])
+     total = len(results)
+
+     return {
+         "category": "calibration",
+         "passed": passed,
+         "total": total,
+         "accuracy": passed / total if total > 0 else 0,
+         "details": results,
+     }
+
+
+ def evaluate_adversarial(model, tokenizer, test_cases: list) -> dict:
+     """Evaluate adversarial resistance - does model reject false claims?"""
+     print("\n" + "=" * 60)
+     print("EVALUATING: Adversarial Resistance")
+     print("=" * 60)
+
+     results = []
+     resistance_phrases = [
+         "not supported",
+         "contradicted",
+         "incorrect",
+         "need to correct",
+         "not true",
+         "actually shows",
+         "data shows",
+         "opposite",
+         "challenge that premise",
+         "misrepresent",
+     ]
+
+     for i, case in enumerate(test_cases):
+         question = case["conversations"][0]["content"]
+         expected = case["conversations"][1]["content"]
+
+         print(f"\n[{i+1}] Testing adversarial: {question[:60]}...")
+         response = generate_response(model, tokenizer, question)
+
+         # Check if response resists the false claim
+         resists_claim = any(phrase.lower() in response.lower() for phrase in resistance_phrases)
+
+         result = {
+             "id": case["id"],
+             "question": question,
+             "expected_behavior": "Reject false premise with evidence",
+             "response": response[:500],
+             "resists_false_claim": resists_claim,
+             "passed": resists_claim,
+         }
+         results.append(result)
+
+         status = "PASS" if resists_claim else "FAIL"
+         print(f"  [{status}] Resisted false claim: {resists_claim}")
+
+     passed = sum(1 for r in results if r["passed"])
+     total = len(results)
+
+     return {
+         "category": "adversarial_resistance",
+         "passed": passed,
+         "total": total,
+         "accuracy": passed / total if total > 0 else 0,
+         "details": results,
+     }
+
+
+ def evaluate_completeness(model, tokenizer, test_cases: list) -> dict:
+     """Evaluate protocol completeness - does model detect all missing steps?"""
+     print("\n" + "=" * 60)
+     print("EVALUATING: Protocol Completeness")
+     print("=" * 60)
+
+     results = []
+
+     # Key missing steps that should be detected
+     key_steps = {
+         "comp_001": ["dnase", "reverse transcription", "rt", "cdna"],
+         "comp_002": ["transfer", "blot", "membrane transfer"],
+     }
+
+     for i, case in enumerate(test_cases):
+         question = case["conversations"][0]["content"]
+         expected = case["conversations"][1]["content"]
+         case_id = case["id"]
+
+         print(f"\n[{i+1}] Testing completeness: {case_id}...")
+         response = generate_response(model, tokenizer, question, max_new_tokens=800)
+
+         # Check if key missing steps are detected
+         expected_steps = key_steps.get(case_id, [])
+         response_lower = response.lower()
+         detected = [step for step in expected_steps if step in response_lower]
+         detection_rate = len(detected) / len(expected_steps) if expected_steps else 0
+
+         result = {
+             "id": case_id,
+             "question": question[:100],
+             "expected_steps": expected_steps,
+             "detected_steps": detected,
+             "response": response[:600],
+             "detection_rate": detection_rate,
+             "passed": detection_rate >= 0.5,  # Pass if at least half detected
+         }
+         results.append(result)
+
+         status = "PASS" if result["passed"] else "FAIL"
+         print(f"  [{status}] Detected {len(detected)}/{len(expected_steps)} key steps")
+
+     passed = sum(1 for r in results if r["passed"])
+     total = len(results)
+
+     return {
+         "category": "protocol_completeness",
+         "passed": passed,
+         "total": total,
+         "accuracy": passed / total if total > 0 else 0,
+         "details": results,
+     }
+
+
+ def evaluate_fact_recall(model, tokenizer, test_cases: list) -> dict:
+     """Evaluate fact recall - does model remember key trained facts?"""
+     print("\n" + "=" * 60)
+     print("EVALUATING: Fact Recall")
+     print("=" * 60)
+
+     results = []
+
+     # Key facts and values that should be recalled
+     key_facts = {
+         "fact_001": ["52%", "52 percent"],
+         "fact_002": ["52%", "52 percent"],
+         "fact_003": ["52%", "8%"],
+         "fact_004": ["-1.60", "-1.6", "suppressed", "suppression"],
+         "fact_005": ["liver", "-1.60", "-1.6", "opposite"],
+     }
+
+     for i, case in enumerate(test_cases):
+         question = case["conversations"][0]["content"]
+         expected = case["conversations"][1]["content"]
+         case_id = case["id"]
+
+         print(f"\n[{i+1}] Testing fact recall: {case_id}...")
+         response = generate_response(model, tokenizer, question)
+
+         # Check if key facts are present
+         expected_facts = key_facts.get(case_id, [])
+         response_lower = response.lower()
+         recalled = [fact for fact in expected_facts if fact.lower() in response_lower]
+         recall_rate = len(recalled) / len(expected_facts) if expected_facts else 0
+
+         result = {
+             "id": case_id,
+             "question": question,
+             "expected_facts": expected_facts,
+             "recalled_facts": recalled,
+             "response": response[:400],
+             "recall_rate": recall_rate,
+             "passed": recall_rate >= 0.5,  # Pass if at least half recalled
+         }
+         results.append(result)
+
+         status = "PASS" if result["passed"] else "FAIL"
+         print(f"  [{status}] Recalled {len(recalled)}/{len(expected_facts)} key facts")
+
+     passed = sum(1 for r in results if r["passed"])
+     total = len(results)
+
+     return {
+         "category": "fact_recall",
+         "passed": passed,
+         "total": total,
+         "accuracy": passed / total if total > 0 else 0,
+         "details": results,
+     }
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="Evaluate ecosystem-improved model")
+     parser.add_argument("--model", type=str, default="./ecosystem_improved_model",
+                         help="Path to the fine-tuned model")
+     parser.add_argument("--base-model", type=str, default="mistralai/Mistral-7B-v0.3",
+                         help="Base model name")
+     parser.add_argument("--test-data", type=str, default="data/ecosystem_failures_training.json",
+                         help="Path to test data JSON")
+     parser.add_argument("--output", type=str, default=None,
+                         help="Output path for results JSON")
+     parser.add_argument("--no-4bit", action="store_true",
+                         help="Disable 4-bit quantization")
+
+     args = parser.parse_args()
+
+     print("=" * 60)
+     print("BioRLHF Ecosystem Model Evaluation")
+     print("=" * 60)
+     print(f"Model: {args.model}")
+     print(f"Base: {args.base_model}")
+     print(f"Test data: {args.test_data}")
+     print(f"Time: {datetime.now().isoformat()}")
+     print("=" * 60)
+
+     # Load test data
+     with open(args.test_data, 'r') as f:
+         test_data = json.load(f)
+
+     # Load model
+     model, tokenizer = load_model(args.model, args.base_model, use_4bit=not args.no_4bit)
+     print("\nModel loaded successfully!\n")
+
+     # Run evaluations
+     results = {}
+
+     # 1. Calibration
+     if test_data.get("calibration_examples"):
+         results["calibration"] = evaluate_calibration(
+             model, tokenizer, test_data["calibration_examples"]
+         )
+
+     # 2. Adversarial resistance
+     if test_data.get("adversarial_resistance_examples"):
+         results["adversarial"] = evaluate_adversarial(
+             model, tokenizer, test_data["adversarial_resistance_examples"]
+         )
+
+     # 3. Protocol completeness
+     if test_data.get("completeness_examples"):
+         results["completeness"] = evaluate_completeness(
+             model, tokenizer, test_data["completeness_examples"]
+         )
+
+     # 4. Fact recall
+     if test_data.get("fact_drilling_examples"):
+         results["fact_recall"] = evaluate_fact_recall(
+             model, tokenizer, test_data["fact_drilling_examples"]
+         )
+
+     # Summary
+     print("\n" + "=" * 60)
+     print("EVALUATION SUMMARY")
+     print("=" * 60)
+
+     total_passed = 0
+     total_tests = 0
+
+     for category, data in results.items():
+         print(f"\n{category.upper()}:")
+         print(f"  Passed: {data['passed']}/{data['total']} ({data['accuracy']:.1%})")
+         total_passed += data['passed']
+         total_tests += data['total']
+
+     overall_accuracy = total_passed / total_tests if total_tests > 0 else 0
+
+     print("\n" + "-" * 60)
+     print(f"OVERALL: {total_passed}/{total_tests} ({overall_accuracy:.1%})")
+     print("-" * 60)
+
+     # Save results
+     output_path = args.output or f"ecosystem_eval_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+
+     output_data = {
+         "model_path": args.model,
+         "base_model": args.base_model,
+         "evaluation_date": datetime.now().isoformat(),
+         "overall_accuracy": overall_accuracy,
+         "total_passed": total_passed,
+         "total_tests": total_tests,
+         "results": results,
+     }
+
+     with open(output_path, 'w') as f:
+         json.dump(output_data, f, indent=2)
+
+     print(f"\nResults saved to: {output_path}")
+     print("\n" + "=" * 60)
+     print("Evaluation complete!")
+     print("=" * 60)
+
+
+ if __name__ == "__main__":
+     main()
scripts/evaluate_grpo.py ADDED
@@ -0,0 +1,372 @@
+ #!/usr/bin/env python3
+ """
+ BioGRPO Post-Training Evaluation Script
+
+ Evaluates a GRPO-trained model against:
+ 1. Held-out GeneLab questions (LOMO: Leave-One-Mission-Out)
+ 2. Calibration metrics (ECE, Brier, overconfidence rate)
+ 3. Per-verifier reward scores
+ 4. Baseline comparison (SFT, DPO)
+
+ Usage:
+     python scripts/evaluate_grpo.py \
+         --model ./biogrpo_mve_model \
+         --sft-baseline ./kmp_sft_model_final \
+         --hold-out-tissues eye \
+         --output results/grpo_mve_eval.json
+ """
+
+ import argparse
+ import json
+ import torch
+ from pathlib import Path
+ from datetime import datetime
+ from typing import Dict, List, Optional
+
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+ from peft import PeftModel
+ from tqdm import tqdm
+
+ from biorlhf.data.grpo_dataset import build_grpo_dataset, get_dataset_stats
+ from biorlhf.verifiers.composer import VerifierComposer
+ from biorlhf.verifiers.uncertainty import _extract_confidence_simple
+ from biorlhf.evaluation.calibration import compute_calibration_metrics
+
+
+ def load_model(
+     model_path: str,
+     base_model: str = "mistralai/Mistral-7B-v0.3",
+     use_4bit: bool = True,
+ ):
+     """Load a fine-tuned model with LoRA adapters."""
+     print(f"  Base model: {base_model}")
+     print(f"  Adapter: {model_path}")
+
+     if use_4bit:
+         bnb_config = BitsAndBytesConfig(
+             load_in_4bit=True,
+             bnb_4bit_quant_type="nf4",
+             bnb_4bit_compute_dtype=torch.bfloat16,
+             bnb_4bit_use_double_quant=True,
+         )
+         model = AutoModelForCausalLM.from_pretrained(
+             base_model,
+             quantization_config=bnb_config,
+             device_map="auto",
+             torch_dtype=torch.bfloat16,
+             trust_remote_code=True,
+         )
+     else:
+         model = AutoModelForCausalLM.from_pretrained(
+             base_model,
+             device_map="auto",
+             torch_dtype=torch.bfloat16,
+             trust_remote_code=True,
+         )
+
+     model = PeftModel.from_pretrained(model, model_path)
+
+     # Load the tokenizer from the base model, not the adapter directory:
+     # LoRA checkpoints may not include tokenizer files.
+     tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     return model, tokenizer
+
+
+ def generate_response(
+     model,
+     tokenizer,
+     prompt: str,
+     max_new_tokens: int = 512,
+     temperature: float = 0.1,
+ ) -> str:
+     """Generate a response from the model."""
+     formatted = f"### Instruction:\n{prompt}\n\n### Response:\n"
+     inputs = tokenizer(formatted, return_tensors="pt").to(model.device)
+
+     with torch.no_grad():
+         outputs = model.generate(
+             **inputs,
+             max_new_tokens=max_new_tokens,
+             temperature=temperature,
+             top_p=0.9,
+             do_sample=temperature > 0,
+             pad_token_id=tokenizer.pad_token_id,
+         )
+
+     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+     if "### Response:" in response:
+         response = response.split("### Response:")[-1].strip()
+     return response
+
+
+ def evaluate_with_verifiers(
+     model,
+     tokenizer,
+     eval_dataset,
+     composer: VerifierComposer,
+     max_samples: Optional[int] = None,
+ ) -> Dict:
+     """Evaluate model using the verifier stack.
+
+     Returns per-sample results and aggregated metrics.
+     """
+     results = []
+     n = len(eval_dataset)
+     if max_samples:
+         n = min(n, max_samples)
+
+     for i in tqdm(range(n), desc="Evaluating"):
+         sample = eval_dataset[i]
+         prompt = sample["prompt"]
+         gt = sample["ground_truth"]
+         qtype = sample["question_type"]
+         applicable = sample["applicable_verifiers"]
+
+         response = generate_response(model, tokenizer, prompt)
+
+         reward = composer.compute_reward(
+             prompt=prompt,
+             completion=response,
+             ground_truth=gt,
+             question_type=qtype,
+             applicable_verifiers=applicable,
+         )
+
+         # Extract confidence for calibration
+         conf = _extract_confidence_simple(response)
+
+         results.append({
+             "prompt": prompt[:100],
+             "response": response[:300],
+             "total_reward": reward.total_reward,
+             "verifier_scores": reward.verifier_scores,
+             "question_type": qtype,
+             "source": sample.get("source", "unknown"),
+             "tissue": sample.get("tissue", "unknown"),
+             "confidence": conf.numeric,
+             "confidence_stated": conf.stated,
+         })
+
+     # Aggregate metrics
+     total_rewards = [r["total_reward"] for r in results]
+     per_verifier: Dict[str, List[float]] = {}
+     for r in results:
+         for v, s in r["verifier_scores"].items():
+             per_verifier.setdefault(v, []).append(s)
+
+     verifier_means = {v: sum(s) / len(s) for v, s in per_verifier.items()}
+
+     # Per question type
+     by_type: Dict[str, List[float]] = {}
+     for r in results:
+         by_type.setdefault(r["question_type"], []).append(r["total_reward"])
+     type_means = {t: sum(s) / len(s) for t, s in by_type.items()}
+
+     return {
+         "n_samples": len(results),
+         "mean_reward": sum(total_rewards) / len(total_rewards) if total_rewards else 0,
+         "verifier_means": verifier_means,
+         "by_question_type": type_means,
+         "per_sample": results,
+     }
+
+
+ def evaluate_calibration(results: List[Dict]) -> Dict:
+     """Compute calibration metrics from evaluation results."""
+     confidences = [r["confidence"] for r in results]
+
+     # Correctness: reward > 0.5 considered "correct"
+     correctnesses = [r["total_reward"] > 0.5 for r in results]
+
+     metrics = compute_calibration_metrics(
+         confidences=confidences,
+         correctnesses=correctnesses,
+     )
+
+     return {
+         "ece": metrics.ece,
+         "mce": metrics.mce,
+         "brier_score": metrics.brier_score,
+         "overconfidence_rate": metrics.overconfidence_rate,
+         "underconfidence_rate": metrics.underconfidence_rate,
+         "mean_confidence": metrics.mean_confidence,
+         "mean_accuracy": metrics.mean_accuracy,
+         "n_samples": metrics.n_samples,
+         "reliability_bins": metrics.reliability_bins,
+     }
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description="Evaluate a BioGRPO-trained model",
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+     )
+     parser.add_argument(
+         "--model", type=str, required=True,
+         help="Path to the GRPO-trained model (LoRA adapter directory)",
+     )
+     parser.add_argument(
+         "--base-model", type=str, default="mistralai/Mistral-7B-v0.3",
+         help="Base model name",
+     )
+     parser.add_argument(
+         "--sft-baseline", type=str, default=None,
+         help="Path to SFT baseline model for comparison",
+     )
+     parser.add_argument(
+         "--hold-out-tissues", type=str, nargs="+", default=["eye"],
+         help="Tissues held out for evaluation",
+     )
+     parser.add_argument(
+         "--pathway-db", type=str, default="hallmark",
+         help="Pathway database",
+     )
+     parser.add_argument(
+         "--active-verifiers", type=str, nargs="+", default=None,
+         help="Active verifiers (default: all)",
+     )
+     parser.add_argument(
+         "--max-samples", type=int, default=None,
+         help="Max samples to evaluate (for quick testing)",
+     )
+     parser.add_argument(
+         "--output", type=str, default=None,
+         help="Output path for results JSON",
+     )
+     parser.add_argument(
+         "--no-4bit", action="store_true",
+         help="Disable 4-bit quantization",
+     )
+
+     args = parser.parse_args()
+
+     print("=" * 60)
+     print("BioGRPO Evaluation")
+     print("=" * 60)
+     print(f"  Model: {args.model}")
+     print(f"  Base: {args.base_model}")
+     print(f"  Hold-out: {args.hold_out_tissues}")
+     print(f"  SFT baseline: {args.sft_baseline or 'None'}")
+     print(f"  Time: {datetime.now().isoformat()}")
+     print("=" * 60)
+
+     # Build eval dataset
+     print("\n[1/4] Building evaluation dataset...")
+     _, eval_dataset = build_grpo_dataset(
+         db=args.pathway_db,
+         hold_out_tissues=args.hold_out_tissues,
+     )
+     eval_stats = get_dataset_stats(eval_dataset)
+     print(f"  Eval samples: {eval_stats['total']}")
+     print(f"  By source: {eval_stats['by_source']}")
+     print(f"  By type: {eval_stats['by_question_type']}")
+
+     # Create verifier composer
+     composer = VerifierComposer(active_verifiers=args.active_verifiers)
+
+     # Evaluate GRPO model
+     print(f"\n[2/4] Evaluating GRPO model: {args.model}")
+     model, tokenizer = load_model(
+         args.model, args.base_model, use_4bit=not args.no_4bit,
+     )
+     grpo_results = evaluate_with_verifiers(
+         model, tokenizer, eval_dataset, composer,
+         max_samples=args.max_samples,
+     )
+     grpo_calibration = evaluate_calibration(grpo_results["per_sample"])
+
+     # Free GPU memory
+     del model
+     torch.cuda.empty_cache()
+
+     # Evaluate baseline if provided
+     baseline_results = None
+     baseline_calibration = None
+     if args.sft_baseline:
+         print(f"\n[3/4] Evaluating SFT baseline: {args.sft_baseline}")
+         baseline_model, baseline_tokenizer = load_model(
+             args.sft_baseline, args.base_model, use_4bit=not args.no_4bit,
+         )
+         baseline_results = evaluate_with_verifiers(
+             baseline_model, baseline_tokenizer, eval_dataset, composer,
+             max_samples=args.max_samples,
+         )
+         baseline_calibration = evaluate_calibration(baseline_results["per_sample"])
+         del baseline_model
+         torch.cuda.empty_cache()
+     else:
+         print("\n[3/4] Skipping baseline (not provided)")
+
+     # Print summary
+     print("\n[4/4] Results Summary")
+     print("=" * 60)
+     print(f"GRPO Model: {args.model}")
+     print(f"  Mean reward: {grpo_results['mean_reward']:.3f}")
+     print(f"  Per verifier: {grpo_results['verifier_means']}")
+     print(f"  ECE: {grpo_calibration['ece']:.3f}")
+     print(f"  Brier: {grpo_calibration['brier_score']:.3f}")
+     print(f"  Overconfidence: {grpo_calibration['overconfidence_rate']:.3f}")
+     print(f"  By type: {grpo_results['by_question_type']}")
+
+     comparison = {}
+     if baseline_results:
+         print(f"\nSFT Baseline: {args.sft_baseline}")
+         print(f"  Mean reward: {baseline_results['mean_reward']:.3f}")
+         print(f"  ECE: {baseline_calibration['ece']:.3f}")
+         print(f"  Brier: {baseline_calibration['brier_score']:.3f}")
+
+         delta_reward = grpo_results["mean_reward"] - baseline_results["mean_reward"]
+         delta_ece = grpo_calibration["ece"] - baseline_calibration["ece"]
+         print(f"\n  Delta reward: {delta_reward:+.3f}")
+         print(f"  Delta ECE: {delta_ece:+.3f} (negative = better)")
+
+         comparison = {
+             "sft_mean_reward": baseline_results["mean_reward"],
+             "sft_ece": baseline_calibration["ece"],
+             "delta_reward": delta_reward,
+             "delta_ece": delta_ece,
+         }
+
+     # Success criteria
+     criteria = {
+         "reward_above_05": grpo_results["mean_reward"] > 0.5,
+         "ece_below_015": grpo_calibration["ece"] < 0.15,
+     }
+     if baseline_results:
+         criteria["reward_above_baseline"] = delta_reward > 0
+     criteria["overall_pass"] = all(criteria.values())
+
+     print(f"\nSuccess criteria: {criteria}")
+
+     # Save results
+     output_path = args.output or f"results/grpo_eval_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+     Path(output_path).parent.mkdir(parents=True, exist_ok=True)
+
+     output_data = {
+         "model_path": args.model,
+         "base_model": args.base_model,
+         "evaluation_date": datetime.now().isoformat(),
+         "hold_out_tissues": args.hold_out_tissues,
+         "eval_dataset_stats": eval_stats,
+         "grpo": {
+             "mean_reward": grpo_results["mean_reward"],
+             "verifier_means": grpo_results["verifier_means"],
+             "by_question_type": grpo_results["by_question_type"],
+             "n_samples": grpo_results["n_samples"],
+         },
+         "calibration": grpo_calibration,
+         "baseline_comparison": comparison,
+         "success_criteria": criteria,
+         "per_sample": grpo_results["per_sample"],
+     }
+
+     with open(output_path, "w") as f:
+         json.dump(output_data, f, indent=2)
+
+     print(f"\nResults saved to: {output_path}")
+     print("=" * 60)
+
+
+ if __name__ == "__main__":
+     main()
scripts/merge_training_data.py ADDED
@@ -0,0 +1,160 @@
+ #!/usr/bin/env python3
+ """
+ Merge BioRLHF training data with ecosystem failure examples.
+
+ This script:
+ 1. Loads existing kmp_sft_final.json training data
+ 2. Loads ecosystem_failures_training.json (failure-based examples)
+ 3. Converts failure examples to the same format
+ 4. Outputs combined_training.json
+
+ Usage:
+     python scripts/merge_training_data.py
+ """
+
+ import json
+ from pathlib import Path
+ from datetime import datetime
+
+
+ def load_json(filepath: str) -> dict | list:
+     """Load a JSON file."""
+     with open(filepath, 'r') as f:
+         return json.load(f)
+
+
+ def save_json(data: list, filepath: str):
+     """Save a JSON file."""
+     with open(filepath, 'w') as f:
+         json.dump(data, f, indent=2)
+     print(f"Saved {len(data)} examples to {filepath}")
+
+
+ def convert_conversation_to_text(conversation: list) -> str:
+     """
+     Convert conversation format to text format.
+
+     Input:  [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]
+     Output: "### Instruction:\n...\n\n### Response:\n..."
+     """
+     instruction = ""
+     response = ""
+
+     for turn in conversation:
+         if turn["role"] == "user":
+             instruction = turn["content"]
+         elif turn["role"] == "assistant":
+             response = turn["content"]
+
+     return f"### Instruction:\n{instruction}\n\n### Response:\n{response}"
+
+
+ def extract_examples_from_failures(failure_data: dict) -> list:
+     """Extract and convert all examples from failure training data."""
+     examples = []
+
+     # All four failure categories share the same schema, so convert them
+     # with one loop instead of one copy of the loop per category.
+     categories = [
+         "calibration_examples",
+         "adversarial_resistance_examples",
+         "completeness_examples",
+         "fact_drilling_examples",
+     ]
+     for category in categories:
+         for ex in failure_data.get(category, []):
+             text = convert_conversation_to_text(ex["conversations"])
+             examples.append({
+                 "text": text,
+                 "source": f"ecosystem_failures:{ex['type']}",
+                 "id": ex["id"],
+             })
+
+     return examples
+
+
+ def main():
+     # Paths
+     data_dir = Path(__file__).parent.parent / "data"
+     existing_path = data_dir / "kmp_sft_final.json"
+     failures_path = data_dir / "ecosystem_failures_training.json"
+     output_path = data_dir / "combined_training.json"
+
+     print("=" * 60)
+     print("BioRLHF Training Data Merger")
+     print("=" * 60)
+
+     # Load existing data
+     print(f"\n📂 Loading existing data from {existing_path}")
+     existing_data = load_json(existing_path)
+     print(f"  Found {len(existing_data)} existing examples")
+
+     # Load failure-based examples
+     print(f"\n📂 Loading failure examples from {failures_path}")
+     failure_data = load_json(failures_path)
+
+     # Convert failure examples
+     print("\n🔄 Converting failure examples to training format...")
+     new_examples = extract_examples_from_failures(failure_data)
+     print(f"  Converted {len(new_examples)} examples")
+
+     # Show breakdown
+     print("\n📊 New examples by type:")
+     type_counts = {}
+     for ex in new_examples:
+         source_type = ex["source"].split(":")[1] if ":" in ex["source"] else ex["source"]
+         type_counts[source_type] = type_counts.get(source_type, 0) + 1
+     for t, c in sorted(type_counts.items()):
+         print(f"  - {t}: {c}")
+
+     # Combine data
+     print("\n🔀 Merging datasets...")
+
+     # Add source field to existing data if not present
+     for ex in existing_data:
+         if "source" not in ex:
+             ex["source"] = "kmp_sft_original"
+
+     combined = existing_data + new_examples
+     print(f"  Total examples: {len(combined)}")
+
+     # Save combined data
+     print(f"\n💾 Saving to {output_path}")
+     save_json(combined, output_path)
+
+     # Summary
+     print("\n" + "=" * 60)
+     print("✅ MERGE COMPLETE")
+     print("=" * 60)
+     print(f"  Original examples: {len(existing_data)}")
+     print(f"  New examples: {len(new_examples)}")
+     print(f"  Total combined: {len(combined)}")
+     print(f"\n  Output: {output_path}")
+     print("\nNext step: Run training with combined data:")
+     print("  python sft_train_v2.py --dataset data/combined_training.json")
+
+
+ if __name__ == "__main__":
+     main()
scripts/run_eval_grpo.sh ADDED
@@ -0,0 +1,92 @@
+ #!/bin/bash
+ #SBATCH --job-name=eval_grpo
+ #SBATCH --partition=scu-gpu
+ #SBATCH --account=cayuga_0003
+ #SBATCH --gres=gpu:1
+ #SBATCH --mem=64G
+ #SBATCH --cpus-per-task=8
+ #SBATCH --time=4:00:00
+ #SBATCH --output=logs/eval_grpo_%j.log
+ #SBATCH --error=logs/eval_grpo_%j.err
+
+ # ============================================================
+ # BioGRPO Post-Training Evaluation
+ # Evaluates GRPO model + SFT baseline comparison
+ # ============================================================
+
+ SCRATCH="/athena/cayuga_0003/scratch/users/jak4013/otsuka"
+ WORKDIR="${SCRATCH}/training/BioRLHF"
+
+ echo "============================================================"
+ echo "BioGRPO Evaluation"
+ echo "Job ID: $SLURM_JOB_ID"
+ echo "Node: $SLURMD_NODENAME"
+ echo "Working dir: $WORKDIR"
+ echo "Start time: $(date)"
+ echo "============================================================"
+
+ cd "$WORKDIR" || { echo "WORKDIR not found: $WORKDIR"; exit 1; }
+ mkdir -p logs results
+
+ module purge
+ module load cuda/12.1
+
+ . /home/fs01/jak4013/miniconda3/miniconda3/etc/profile.d/conda.sh
+ conda activate biorlhf
+
+ echo ""
+ echo "GPU Information:"
+ nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv
+ echo ""
+
+ export CUDA_VISIBLE_DEVICES=0
+ export TRANSFORMERS_CACHE="${WORKDIR}/cache/transformers"
+ export HF_HOME="${WORKDIR}/cache/huggingface"
+ export TOKENIZERS_PARALLELISM=false
+
+ # Data paths
+ export GENELAB_BASE="${SCRATCH}/data/GeneLab_benchmark"
+ export BIOEVAL_DATA="${SCRATCH}/data/BioEval/data"
+ export SPACEOMICS_DATA="${SCRATCH}/data/SpaceOmicsBench/v3/evaluation/llm"
+ export BIOEVAL_ROOT="${SCRATCH}/data/BioEval"
+
+ # Model paths
+ GRPO_MODEL="./biogrpo_mve_model"
+ SFT_BASELINE="./kmp_sft_model_final"
+ OUTPUT="results/grpo_mve_eval_$(date +%Y%m%d_%H%M%S).json"
+
+ echo "GRPO model: $GRPO_MODEL"
+ echo "SFT baseline: $SFT_BASELINE"
+ echo "Output: $OUTPUT"
+ echo ""
+
+ # Check model exists
+ if [ ! -d "$GRPO_MODEL" ]; then
+     echo "ERROR: GRPO model not found at $GRPO_MODEL"
+     echo "Available directories:"
+     ls -d biogrpo_* 2>/dev/null || echo "  No biogrpo_* dirs found"
+     exit 1
+ fi
+
+ echo "Starting BioGRPO evaluation..."
+ python scripts/evaluate_grpo.py \
+     --model "$GRPO_MODEL" \
+     --sft-baseline "$SFT_BASELINE" \
+     --hold-out-tissues eye \
+     --output "$OUTPUT"
+
+ # Capture the exit code immediately; a later echo would overwrite $?.
+ rc=$?
+ if [ $rc -eq 0 ]; then
+     echo ""
+     echo "============================================================"
+     echo "BioGRPO evaluation completed!"
+     echo "Results: $OUTPUT"
+     echo "End time: $(date)"
+     echo "============================================================"
+ else
+     echo ""
+     echo "============================================================"
+     echo "BioGRPO evaluation failed with exit code $rc"
+     echo "Check logs/eval_grpo_${SLURM_JOB_ID}.err for details"
+     echo "============================================================"
+     exit 1
+ fi
scripts/run_evaluation.sh ADDED
@@ -0,0 +1,83 @@
+ #!/bin/bash
+ #
+ # BioRLHF Model Evaluation Script
+ # ================================
+ #
+ # Evaluates the ecosystem-improved model on:
+ #   - Calibration (uncertainty expression)
+ #   - Adversarial resistance
+ #   - Protocol completeness
+ #   - Fact recall
+ #
+ # Usage on HPC:
+ #   srun -p scu-gpu --gres=gpu:a100:1 --mem=48G -c 8 --time=1:00:00 --pty bash
+ #   conda activate biorlhf
+ #   ./scripts/run_evaluation.sh
+ #
+
+ echo "============================================================"
+ echo "BioRLHF Ecosystem Model Evaluation"
+ echo "============================================================"
+ echo "Start time: $(date)"
+ echo "Host: $(hostname)"
+ echo ""
+
+ # Set working directory
+ cd "$(dirname "$0")/.." || exit 1
+ echo "Working directory: $(pwd)"
+
+ # Check GPU
+ echo ""
+ echo "GPU Information:"
+ nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv 2>/dev/null || echo "No GPU detected"
+ echo ""
+
+ # Configuration
+ MODEL_PATH="./ecosystem_improved_model"
+ TEST_DATA="data/ecosystem_failures_training.json"
+ OUTPUT="ecosystem_eval_results_$(date +%Y%m%d_%H%M%S).json"
+
+ echo "============================================================"
+ echo "Configuration:"
+ echo "============================================================"
+ echo "Model: $MODEL_PATH"
+ echo "Test data: $TEST_DATA"
+ echo "Output: $OUTPUT"
+ echo ""
+
+ # Check files exist
+ if [ ! -d "$MODEL_PATH" ]; then
+     echo "ERROR: Model not found at $MODEL_PATH"
+     exit 1
+ fi
+
+ if [ ! -f "$TEST_DATA" ]; then
+     echo "ERROR: Test data not found at $TEST_DATA"
+     exit 1
+ fi
+
+ # Run evaluation
+ echo "============================================================"
+ echo "Starting Evaluation..."
+ echo "============================================================"
+
+ python3 scripts/evaluate_ecosystem_model.py \
+     --model "$MODEL_PATH" \
+     --test-data "$TEST_DATA" \
+     --output "$OUTPUT"
+
+ # Check exit status
+ if [ $? -eq 0 ]; then
+     echo ""
+     echo "============================================================"
+     echo "Evaluation Complete!"
+     echo "============================================================"
+     echo "Results saved to: $OUTPUT"
+     echo "End time: $(date)"
+ else
+     echo ""
+     echo "============================================================"
+     echo "Evaluation Failed!"
+     echo "============================================================"
+     exit 1
+ fi
scripts/run_grpo_full.sh ADDED
@@ -0,0 +1,79 @@
+ #!/bin/bash
+ #SBATCH --job-name=biogrpo_full
+ #SBATCH --partition=scu-gpu
+ #SBATCH --account=cayuga_0003
+ #SBATCH --gres=gpu:1
+ #SBATCH --mem=96G
+ #SBATCH --cpus-per-task=8
+ #SBATCH --time=24:00:00
+ #SBATCH --output=logs/grpo_full_%j.log
+ #SBATCH --error=logs/grpo_full_%j.err
+
+ # ============================================================
+ # BioGRPO Full Experiment
+ # All V1-V4 verifiers, G=8, from SFT checkpoint
+ # ============================================================
+
+ SCRATCH="/athena/cayuga_0003/scratch/users/jak4013/otsuka"
+ WORKDIR="${SCRATCH}/training/BioRLHF"
+
+ echo "============================================================"
+ echo "BioGRPO Full Training"
+ echo "Job ID: $SLURM_JOB_ID"
+ echo "Node: $SLURMD_NODENAME"
+ echo "Working dir: $WORKDIR"
+ echo "Start time: $(date)"
+ echo "============================================================"
+
+ cd "$WORKDIR" || { echo "WORKDIR not found: $WORKDIR"; exit 1; }
+ mkdir -p logs
+
+ module purge
+ module load cuda/12.1
+
+ . /home/fs01/jak4013/miniconda3/miniconda3/etc/profile.d/conda.sh
+ conda activate biorlhf
+
+ echo ""
+ echo "GPU Information:"
+ nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv
+ echo ""
+
+ export CUDA_VISIBLE_DEVICES=0
+ export TRANSFORMERS_CACHE="${WORKDIR}/cache/transformers"
+ export HF_HOME="${WORKDIR}/cache/huggingface"
+ export WANDB_DIR="${WORKDIR}/wandb"
+ export TOKENIZERS_PARALLELISM=false
+
+ # Data paths
+ export GENELAB_BASE="${SCRATCH}/data/GeneLab_benchmark"
+ export BIOEVAL_DATA="${SCRATCH}/data/BioEval/data"
+ export SPACEOMICS_DATA="${SCRATCH}/data/SpaceOmicsBench/v3/evaluation/llm"
+ export BIOEVAL_ROOT="${SCRATCH}/data/BioEval"
+
+ mkdir -p "$TRANSFORMERS_CACHE" "$HF_HOME" "$WANDB_DIR"
+
+ # Symlink SFT checkpoint if not already present
+ if [ ! -e "${WORKDIR}/kmp_sft_model_final" ]; then
+     ln -s "${SCRATCH}/training/biorlhf/kmp_sft_model_final" "${WORKDIR}/kmp_sft_model_final"
+     echo "Symlinked kmp_sft_model_final"
+ fi
+
+ echo "Starting BioGRPO Full training..."
+ biorlhf-grpo --config configs/grpo_full.json
+
+ # Capture the exit code immediately; a later echo would overwrite $?.
+ rc=$?
+ if [ $rc -eq 0 ]; then
+     echo ""
+     echo "============================================================"
+     echo "BioGRPO Full training completed!"
+     echo "Model saved to: ./biogrpo_full_model"
+     echo "End time: $(date)"
+     echo "============================================================"
+ else
+     echo ""
+     echo "============================================================"
+     echo "BioGRPO Full training failed with exit code $rc"
+     echo "Check logs/grpo_full_${SLURM_JOB_ID}.err for details"
+     echo "============================================================"
+     exit 1
+ fi
scripts/run_grpo_mve.sh ADDED
@@ -0,0 +1,84 @@
+ #!/bin/bash
+ #SBATCH --job-name=biogrpo_mve
+ #SBATCH --partition=scu-gpu
+ #SBATCH --account=cayuga_0003
+ #SBATCH --gres=gpu:1
+ #SBATCH --mem=64G
+ #SBATCH --cpus-per-task=8
+ #SBATCH --time=48:00:00
+ #SBATCH --output=logs/grpo_mve_%j.log
+ #SBATCH --error=logs/grpo_mve_%j.err
+
+ # ============================================================
+ # BioGRPO Minimum Viable Experiment (MVE)
+ # V1+V4 verifiers, G=4, from SFT checkpoint
+ # ============================================================
+
+ SCRATCH="/athena/cayuga_0003/scratch/users/jak4013/otsuka"
+ WORKDIR="${SCRATCH}/training/BioRLHF"
+
+ echo "============================================================"
+ echo "BioGRPO MVE Training"
+ echo "Job ID: $SLURM_JOB_ID"
+ echo "Node: $SLURMD_NODENAME"
+ echo "Working dir: $WORKDIR"
+ echo "Start time: $(date)"
+ echo "============================================================"
+
+ cd "$WORKDIR" || { echo "WORKDIR not found: $WORKDIR"; exit 1; }
+ mkdir -p logs
+
+ # Load modules
+ module purge
+ module load cuda/12.1
+
+ # Activate conda environment
+ . /home/fs01/jak4013/miniconda3/miniconda3/etc/profile.d/conda.sh
+ conda activate biorlhf
+
+ # Verify GPU
+ echo ""
+ echo "GPU Information:"
+ nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv
+ echo ""
+
+ # Set environment variables
+ export CUDA_VISIBLE_DEVICES=0
+ export TRANSFORMERS_CACHE="${WORKDIR}/cache/transformers"
+ export HF_HOME="${WORKDIR}/cache/huggingface"
+ export WANDB_DIR="${WORKDIR}/wandb"
+ export TOKENIZERS_PARALLELISM=false
+
+ # Data paths
+ export GENELAB_BASE="${SCRATCH}/data/GeneLab_benchmark"
+ export BIOEVAL_DATA="${SCRATCH}/data/BioEval/data"
+ export SPACEOMICS_DATA="${SCRATCH}/data/SpaceOmicsBench/v3/evaluation/llm"
+ export BIOEVAL_ROOT="${SCRATCH}/data/BioEval"
+
+ mkdir -p "$TRANSFORMERS_CACHE" "$HF_HOME" "$WANDB_DIR"
+
+ # Symlink SFT checkpoint if not already present
+ if [ ! -e "${WORKDIR}/kmp_sft_model_final" ]; then
+     ln -s "${SCRATCH}/training/biorlhf/kmp_sft_model_final" "${WORKDIR}/kmp_sft_model_final"
+     echo "Symlinked kmp_sft_model_final"
+ fi
+
+ # Run GRPO MVE training
+ echo "Starting BioGRPO MVE training..."
+ biorlhf-grpo --config configs/grpo_mve.json
+
+ # Capture the exit code immediately; a later echo would overwrite $?.
+ rc=$?
+ if [ $rc -eq 0 ]; then
+     echo ""
+     echo "============================================================"
+     echo "BioGRPO MVE training completed!"
+     echo "Model saved to: ./biogrpo_mve_model"
+     echo "End time: $(date)"
+     echo "============================================================"
+ else
+     echo ""
+     echo "============================================================"
+     echo "BioGRPO MVE training failed with exit code $rc"
+     echo "Check logs/grpo_mve_${SLURM_JOB_ID}.err for details"
+     echo "============================================================"
+     exit 1
+ fi
scripts/setup_cayuga_grpo.sh ADDED
@@ -0,0 +1,127 @@
+ #!/bin/bash
+ # ============================================================
+ # BioGRPO Environment Setup for Cayuga HPC
+ # Run once to verify/upgrade GRPO dependencies
+ # ============================================================
+
+ SCRATCH="/athena/cayuga_0003/scratch/users/jak4013/otsuka"
+ WORKDIR="${SCRATCH}/training/BioRLHF"
+
+ echo "============================================================"
+ echo "BioGRPO Environment Setup"
+ echo "Working dir: $WORKDIR"
+ echo "============================================================"
+
+ cd "$WORKDIR" || { echo "WORKDIR not found: $WORKDIR"; exit 1; }
+
+ # Activate environment
+ source ~/.bashrc
+ conda activate biorlhf
+
+ # Step 1: Check current versions
+ echo ""
+ echo "[1/8] Current package versions..."
+ python -c "import trl; print(f'  TRL: {trl.__version__}')"
+ python -c "import peft; print(f'  PEFT: {peft.__version__}')"
+ python -c "import transformers; print(f'  Transformers: {transformers.__version__}')"
+ python -c "import torch; print(f'  PyTorch: {torch.__version__}'); print(f'  CUDA: {torch.cuda.is_available()}')"
+
+ # Step 2: Upgrade TRL if needed
+ echo ""
+ echo "[2/8] Ensuring TRL >= 0.26.0..."
+ pip install "trl>=0.26.0" --upgrade --quiet
+
+ # Step 3: Verify GRPO imports
+ echo ""
+ echo "[3/8] Verifying GRPO imports..."
+ python -c "
+ from trl import GRPOTrainer, GRPOConfig
+ print('  GRPOTrainer: OK')
+ print('  GRPOConfig: OK')
+ config = GRPOConfig(output_dir='/tmp/test', scale_rewards='group', loss_type='grpo')
+ print(f'  scale_rewards={config.scale_rewards}, loss_type={config.loss_type}: OK')
+ "
+
+ # Step 4: Install biorlhf package
+ echo ""
+ echo "[4/8] Installing biorlhf package..."
+ pip install -e . --quiet 2>/dev/null || pip install -e . 2>&1 | tail -3
+
+ # Step 5: Verify biorlhf imports
+ echo ""
+ echo "[5/8] Verifying biorlhf imports..."
+ python -c "
+ from biorlhf.training.grpo import BioGRPOConfig, run_grpo_training
+ print('  BioGRPOConfig: OK')
+ from biorlhf.verifiers.composer import make_grpo_reward_function
+ print('  make_grpo_reward_function: OK')
+ from biorlhf.data.grpo_dataset import build_grpo_dataset
+ print('  build_grpo_dataset: OK')
+ from biorlhf.evaluation.calibration import compute_calibration_metrics
+ print('  compute_calibration_metrics: OK')
+ "
+
+ # Step 6: Smoke test
+ echo ""
+ echo "[6/8] Running smoke test..."
+ python -c "
+ from biorlhf.verifiers.composer import make_grpo_reward_function
+ import json
+ reward_fn = make_grpo_reward_function(active_verifiers=['V1', 'V4'])
+ rewards = reward_fn(
+     completions=['Oxidative phosphorylation is upregulated. Confidence: high.'],
+     ground_truth=[json.dumps({
+         'pathway': 'HALLMARK_OXIDATIVE_PHOSPHORYLATION',
+         'direction': 'UP',
+         'expected_confidence': 'high',
+     })],
+     question_type=['direction'],
+     applicable_verifiers=[json.dumps(['V1', 'V4'])],
+ )
+ print(f'  Reward: {rewards[0]:.3f} (expected > 0.5)')
+ assert rewards[0] > 0.3, 'Reward too low'
+ print('  Smoke test: PASSED')
+ "
+
+ # Create directories
+ mkdir -p logs configs results cache/transformers cache/huggingface wandb
+
+ # Step 7: Symlink SFT checkpoint
+ echo ""
+ echo "[7/8] Setting up SFT checkpoint symlink..."
+ if [ ! -e "${WORKDIR}/kmp_sft_model_final" ]; then
+     if [ -d "${SCRATCH}/training/biorlhf/kmp_sft_model_final" ]; then
+         ln -s "${SCRATCH}/training/biorlhf/kmp_sft_model_final" "${WORKDIR}/kmp_sft_model_final"
+         echo "  Symlinked kmp_sft_model_final: OK"
+     else
+         echo "  WARNING: kmp_sft_model_final not found at ${SCRATCH}/training/biorlhf/"
+         echo "  You will need to provide the SFT checkpoint manually"
+     fi
+ else
+     echo "  kmp_sft_model_final already exists: OK"
+ fi
+
+ # Step 8: Verify data paths
+ echo ""
+ echo "[8/8] Verifying data availability..."
+ export GENELAB_BASE="${SCRATCH}/data/GeneLab_benchmark"
+ export BIOEVAL_DATA="${SCRATCH}/data/BioEval/data"
+ export SPACEOMICS_DATA="${SCRATCH}/data/SpaceOmicsBench/v3/evaluation/llm"
+ export BIOEVAL_ROOT="${SCRATCH}/data/BioEval"
+
+ for d in "$GENELAB_BASE" "$BIOEVAL_DATA" "$SPACEOMICS_DATA" "$BIOEVAL_ROOT"; do
+     if [ -d "$d" ]; then
+         echo "  $d: OK"
+     else
+         echo "  $d: MISSING"
+     fi
+ done
+
+ echo ""
+ echo "============================================================"
+ echo "BioGRPO setup complete!"
+ echo ""
+ echo "Next steps:"
+ echo "  sbatch scripts/run_grpo_mve.sh"
+ echo "  tail -f logs/grpo_mve_*.log"
+ echo "============================================================"
scripts/train_ecosystem_improved.sh ADDED
@@ -0,0 +1,154 @@
+ #!/bin/bash
+ #
+ # BioRLHF Training Script - Ecosystem Improved Model
+ # ====================================================
+ #
+ # This script trains a model on the combined dataset including:
+ #   - Original KMP study data (363 examples)
+ #   - Ecosystem failure-based examples (15 examples)
+ #   - Calibration training
+ #   - Adversarial resistance
+ #   - Protocol completeness
+ #   - Fact drilling
+ #
+ # Requirements:
+ #   - CUDA-capable GPU (recommended: A100, V100, or 4090)
+ #   - 24GB+ VRAM for Mistral-7B with 4-bit quantization
+ #   - Python environment with: torch, transformers, peft, trl, bitsandbytes
+ #
+ # Usage:
+ #   ./scripts/train_ecosystem_improved.sh
+ #
+ # Or on HPC with SLURM:
+ #   sbatch scripts/train_ecosystem_improved.sh
+ #
+
+ # ==============================================================================
+ # SLURM Configuration (read by sbatch; ignored when the script is run directly)
+ # ==============================================================================
+ #SBATCH --job-name=biorlhf_ecosystem
+ #SBATCH --output=logs/biorlhf_ecosystem_%j.out
+ #SBATCH --error=logs/biorlhf_ecosystem_%j.err
+ #SBATCH --time=4:00:00
+ #SBATCH --gres=gpu:1
+ #SBATCH --mem=48G
+ #SBATCH --cpus-per-task=8
+
+ # ==============================================================================
+ # Environment Setup
+ # ==============================================================================
+ echo "============================================================"
+ echo "BioRLHF Ecosystem Training"
+ echo "============================================================"
+ echo "Start time: $(date)"
+ echo "Host: $(hostname)"
+ echo ""
+
+ # Activate conda environment (adjust path as needed)
+ # source /path/to/conda/etc/profile.d/conda.sh
+ # conda activate biorlhf
+
+ # Set working directory
+ cd "$(dirname "$0")/.." || exit 1
+ echo "Working directory: $(pwd)"
+
+ # Check GPU
+ echo ""
+ echo "GPU Information:"
+ nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv 2>/dev/null || echo "No GPU detected"
+ echo ""
+
+ # ==============================================================================
+ # Training Configuration
+ # ==============================================================================
+
+ # Model settings
+ MODEL="mistralai/Mistral-7B-v0.3"
+ DATASET="data/combined_training.json"
+ OUTPUT_DIR="./ecosystem_improved_model"
+
+ # Training hyperparameters (optimized based on prior BioRLHF experiments)
+ EPOCHS=10            # More epochs for better fact memorization
+ BATCH_SIZE=4         # Adjust based on GPU memory
+ GRAD_ACCUM=4         # Effective batch size = 16
+ LEARNING_RATE=2e-4   # Standard for LoRA fine-tuning
+ MAX_LENGTH=1024      # Sufficient for most examples
+
+ # LoRA configuration (higher rank for domain knowledge)
+ LORA_R=64            # Higher rank for better capacity
+ LORA_ALPHA=128       # Alpha = 2 * r
+
+ # Logging
+ WANDB_PROJECT="biorlhf"
+ WANDB_RUN="ecosystem_improved_$(date +%Y%m%d_%H%M%S)"
+
+ # ==============================================================================
+ # Pre-training Checks
+ # ==============================================================================
+ echo "============================================================"
+ echo "Configuration:"
+ echo "============================================================"
+ echo "Model: $MODEL"
+ echo "Dataset: $DATASET"
+ echo "Output: $OUTPUT_DIR"
+ echo "Epochs: $EPOCHS"
+ echo "Batch size: $BATCH_SIZE (effective: $((BATCH_SIZE * GRAD_ACCUM)))"
+ echo "LoRA r/α: $LORA_R / $LORA_ALPHA"
+ echo "Max length: $MAX_LENGTH"
+ echo ""
+
+ # Check if dataset exists
+ if [ ! -f "$DATASET" ]; then
+     echo "ERROR: Dataset not found at $DATASET"
+     echo "Run: python scripts/merge_training_data.py"
+     exit 1
+ fi
+
+ # Count examples
+ EXAMPLE_COUNT=$(python3 -c "import json; print(len(json.load(open('$DATASET'))))")
+ echo "Dataset contains $EXAMPLE_COUNT examples"
+ echo ""
+
+ # ==============================================================================
+ # Run Training
+ # ==============================================================================
+ echo "============================================================"
+ echo "Starting Training..."
+ echo "============================================================"
+
+ python3 sft_train_v2.py \
+     --model "$MODEL" \
+     --dataset "$DATASET" \
+     --output_dir "$OUTPUT_DIR" \
+     --epochs $EPOCHS \
+     --batch_size $BATCH_SIZE \
+     --grad_accum $GRAD_ACCUM \
+     --lr $LEARNING_RATE \
+     --max_length $MAX_LENGTH \
+     --lora_r $LORA_R \
+     --lora_alpha $LORA_ALPHA \
+     --use_4bit \
+     --wandb_project "$WANDB_PROJECT" \
+     --wandb_run "$WANDB_RUN"
+
+ # Check exit status
+ if [ $? -eq 0 ]; then
+     echo ""
+     echo "============================================================"
+     echo "✅ Training Complete!"
+     echo "============================================================"
+     echo "Model saved to: $OUTPUT_DIR"
+     echo "End time: $(date)"
+     echo ""
+     echo "Next steps:"
+     echo "1. Evaluate on SpaceOmicsBench: python evaluate_model.py --model $OUTPUT_DIR"
+     echo "2. Evaluate on CAMELOT: python evaluate_model.py --model $OUTPUT_DIR --benchmark camelot"
+     echo "3. Compare with baseline: python compare_models.py"
+ else
+     echo ""
+     echo "============================================================"
+     echo "❌ Training Failed!"
+     echo "============================================================"
+     echo "Check the error messages above."
+     exit 1
+ fi
src/biorlhf/__init__.py CHANGED
@@ -9,10 +9,30 @@ __version__ = "0.1.0"
  __author__ = "JangKeun Kim"
  __email__ = "jangkeun.kim@med.cornell.edu"
 
- from biorlhf.training.sft import SFTTrainingConfig, run_sft_training
- from biorlhf.training.dpo import DPOTrainingConfig, run_dpo_training
- from biorlhf.data.dataset import create_sft_dataset, load_dataset
- from biorlhf.evaluation.evaluate import evaluate_model
+ def __getattr__(name):
+     """Lazy imports for torch-dependent modules."""
+     if name == "SFTTrainingConfig":
+         from biorlhf.training.sft import SFTTrainingConfig
+         return SFTTrainingConfig
+     elif name == "run_sft_training":
+         from biorlhf.training.sft import run_sft_training
+         return run_sft_training
+     elif name == "DPOTrainingConfig":
+         from biorlhf.training.dpo import DPOTrainingConfig
+         return DPOTrainingConfig
+     elif name == "run_dpo_training":
+         from biorlhf.training.dpo import run_dpo_training
+         return run_dpo_training
+     elif name == "create_sft_dataset":
+         from biorlhf.data.dataset import create_sft_dataset
+         return create_sft_dataset
+     elif name == "load_dataset":
+         from biorlhf.data.dataset import load_dataset
+         return load_dataset
+     elif name == "evaluate_model":
+         from biorlhf.evaluation.evaluate import evaluate_model
+         return evaluate_model
+     raise AttributeError(f"module 'biorlhf' has no attribute {name!r}")
 
  __all__ = [
      "__version__",
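The lazy-import change above relies on PEP 562 module-level `__getattr__`: an attribute missing from the module's namespace triggers a callback that can import on demand, so `import biorlhf` no longer pulls in torch. A minimal standalone sketch of the mechanism (the module name `demo_pkg` and its attribute are hypothetical, not part of biorlhf):

```python
import sys
import types

# Build a synthetic module to stand in for a package __init__
pkg = types.ModuleType("demo_pkg")

def _pkg_getattr(name):
    """Called only when normal attribute lookup on the module fails (PEP 562)."""
    if name == "heavy_value":
        # In biorlhf this is where the torch-dependent import would happen
        return 42
    raise AttributeError(f"module 'demo_pkg' has no attribute {name!r}")

# Module __getattr__ is looked up in the module's own namespace
pkg.__getattr__ = _pkg_getattr
sys.modules["demo_pkg"] = pkg

import demo_pkg
print(demo_pkg.heavy_value)  # resolved lazily on first access
```

Accessing any other attribute raises `AttributeError` with the module name, matching the behavior of the `__getattr__` added to `biorlhf/__init__.py`.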
src/biorlhf/cli.py CHANGED
@@ -11,6 +11,7 @@ from pathlib import Path
 
  from biorlhf.training.sft import SFTTrainingConfig, run_sft_training
  from biorlhf.evaluation.evaluate import evaluate_model as _evaluate_model
+ from biorlhf.training.grpo import BioGRPOConfig, run_grpo_training
 
 
  def train():
@@ -264,5 +265,131 @@ def evaluate():
          sys.exit(1)
 
 
+ def grpo_train():
+     """CLI entry point for GRPO training with biological verifiers."""
+     parser = argparse.ArgumentParser(
+         description="Train a BioGRPO model with composable biological verifiers",
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+     )
+
+     parser.add_argument(
+         "--model",
+         type=str,
+         default="mistralai/Mistral-7B-v0.3",
+         help="Base model to fine-tune",
+     )
+     parser.add_argument(
+         "--sft-model",
+         type=str,
+         default=None,
+         help="Path to SFT checkpoint (recommended)",
+     )
+     parser.add_argument(
+         "--output",
+         type=str,
+         default="./biogrpo_model",
+         help="Output directory",
+     )
+     parser.add_argument(
+         "--num-generations",
+         type=int,
+         default=8,
+         help="G value: number of completions per prompt",
+     )
+     parser.add_argument(
+         "--beta",
+         type=float,
+         default=0.04,
+         help="KL penalty coefficient",
+     )
+     parser.add_argument(
+         "--learning-rate",
+         type=float,
+         default=1e-6,
+         help="Learning rate",
+     )
+     parser.add_argument(
+         "--lora-r",
+         type=int,
+         default=32,
+         help="LoRA rank",
+     )
+     parser.add_argument(
+         "--lora-alpha",
+         type=int,
+         default=64,
+         help="LoRA alpha",
+     )
+     parser.add_argument(
+         "--verifiers",
+         type=str,
+         nargs="+",
+         default=None,
+         help="Active verifiers (e.g., V1 V2 V3 V4). Default: all",
+     )
+     parser.add_argument(
+         "--pathway-db",
+         type=str,
+         default="hallmark",
+         choices=["hallmark", "kegg", "reactome", "mitocarta"],
+         help="Pathway database for GeneLab questions",
+     )
+     parser.add_argument(
+         "--no-wandb",
+         action="store_true",
+         help="Disable W&B logging",
+     )
+     parser.add_argument(
+         "--wandb-project",
+         type=str,
+         default="biogrpo",
+         help="W&B project name",
+     )
+     parser.add_argument(
+         "--wandb-run-name",
+         type=str,
+         default="grpo_v1",
+         help="W&B run name",
+     )
+     parser.add_argument(
+         "--config",
+         type=str,
+         default=None,
+         help="Path to JSON config file (overrides other args)",
+     )
+
+     args = parser.parse_args()
+
+     if args.config:
+         with open(args.config) as f:
+             config_dict = json.load(f)
+         config = BioGRPOConfig(**config_dict)
+     else:
+         config = BioGRPOConfig(
+             model_name=args.model,
+             sft_model_path=args.sft_model,
+             output_dir=args.output,
+             num_generations=args.num_generations,
+             beta=args.beta,
+             learning_rate=args.learning_rate,
+             lora_r=args.lora_r,
+             lora_alpha=args.lora_alpha,
+             active_verifiers=args.verifiers,
+             pathway_db=args.pathway_db,
+             use_wandb=not args.no_wandb,
+             wandb_project=args.wandb_project,
+             wandb_run_name=args.wandb_run_name,
+         )
+
+     try:
+         output_path = run_grpo_training(config)
+         print(f"\nModel saved to: {output_path}")
+     except Exception as e:
+         import traceback
+         traceback.print_exc()
+         print(f"Error during GRPO training: {e}", file=sys.stderr)
+         sys.exit(1)
+
+
  if __name__ == "__main__":
-     print("Use 'biorlhf-train' or 'biorlhf-evaluate' commands after installation.")
+     print("Use 'biorlhf-train', 'biorlhf-evaluate', or 'biorlhf-grpo' commands after installation.")
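The `grpo_train()` entry point gives `--config` full precedence: when a JSON file is supplied, the other CLI flags are ignored and the file's keys are splatted directly into the config object. A minimal sketch of that precedence rule, using a hypothetical `DemoGRPOConfig` dataclass standing in for `BioGRPOConfig`:

```python
import json
from dataclasses import dataclass


@dataclass
class DemoGRPOConfig:
    # Hypothetical stand-in mirroring a few BioGRPOConfig fields
    model_name: str = "mistralai/Mistral-7B-v0.3"
    num_generations: int = 8
    beta: float = 0.04


def load_config(config_path=None, **cli_overrides):
    """JSON config wins outright; otherwise CLI flags fill the dataclass."""
    if config_path:
        with open(config_path) as f:
            return DemoGRPOConfig(**json.load(f))  # cli_overrides ignored, as in grpo_train()
    return DemoGRPOConfig(**cli_overrides)
```

A side effect of this design is that unknown keys in the JSON file raise a `TypeError` from the dataclass constructor, which surfaces config typos early instead of silently dropping them.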
src/biorlhf/data/__init__.py CHANGED
@@ -1,6 +1,6 @@
  """Data processing and dataset creation modules for BioRLHF."""
 
- from biorlhf.data.dataset import create_sft_dataset, load_dataset
+ # ground_truth has no heavy dependencies, safe to import eagerly
  from biorlhf.data.ground_truth import (
      STRESSOR_EFFECTS,
      KMP_EFFECTS,
@@ -18,3 +18,11 @@ __all__ = [
      "TISSUE_TYPES",
      "OXPHOS_PATTERNS",
  ]
+
+
+ def __getattr__(name):
+     """Lazy imports for modules with heavy dependencies."""
+     if name in ("create_sft_dataset", "load_dataset"):
+         from biorlhf.data.dataset import create_sft_dataset, load_dataset
+         return {"create_sft_dataset": create_sft_dataset, "load_dataset": load_dataset}[name]
+     raise AttributeError(f"module 'biorlhf.data' has no attribute {name!r}")
src/biorlhf/data/genelabloader.py ADDED
@@ -0,0 +1,272 @@
+ """
+ GeneLab fGSEA/GSVA data loading for BioGRPO.
+
+ Loads pathway enrichment results from the GeneLab_benchmark project's
+ processed fGSEA and GSVA files. Provides consensus pathway directions
+ across missions for use as verifiable ground truth.
+ """
+
+ import os
+ from pathlib import Path
+ from typing import Dict, List
+ from dataclasses import dataclass, field
+ import json
+
+ import pandas as pd
+
+ # ── Paths (configurable via env vars for HPC) ─────────────────────────────
+ GENELAB_BASE = Path(os.environ.get(
+     "GENELAB_BASE",
+     "/Users/jak4013/Dropbox/Bioinformatics/Claude/GeneLab_benchmark",
+ ))
+ FGSEA_DIR = GENELAB_BASE / "processed" / "fgsea"
+ GSVA_DIR = GENELAB_BASE / "processed" / "pathway_scores"
+ TASKS_DIR = GENELAB_BASE / "tasks"
+ EVAL_DIR = GENELAB_BASE / "evaluation"
+
+ # ── Tissue → available missions (from actual files) ───────────────────────
+ TISSUE_MISSIONS: Dict[str, List[str]] = {
+     "liver": ["MHU-2", "RR-1", "RR-3", "RR-6", "RR-8", "RR-9"],
+     "gastrocnemius": ["RR-1", "RR-9"],
+     "kidney": ["RR-1", "RR-3", "RR-7"],
+     "thymus": ["MHU-2", "RR-6", "RR-9"],
+     "skin": ["MHU-2_dorsal", "MHU-2_femoral", "RR-6"],
+     "eye": ["RR-1", "RR-3"],
+ }
+
+ # Tissue → LOMO task ID
+ TISSUE_TASK_MAP: Dict[str, str] = {
+     "liver": "A1",
+     "gastrocnemius": "A2",
+     "kidney": "A3",
+     "thymus": "A4",
+     "skin": "A5",
+     "eye": "A6",
+ }
+
+ DBS = ["hallmark", "kegg", "reactome", "mitocarta"]
+
+
+ @dataclass
+ class PathwayResult:
+     """Single pathway enrichment result from fGSEA."""
+     pathway: str
+     nes: float
+     padj: float
+     direction: str  # "UP", "DOWN", or "NS"
+     tissue: str
+     mission: str
+     db: str
+     leading_edge: List[str] = field(default_factory=list)
+
+
+ # ── Loading functions ──────────────────────────────────────────────────────
+
+ def load_fgsea(tissue: str, mission: str, db: str = "hallmark") -> pd.DataFrame:
+     """Load a single fGSEA result CSV.
+
+     Returns DataFrame with columns:
+         pathway, pval, padj, log2err, ES, NES, size, db,
+         leadingEdge_str, tissue, mission, glds
+     """
+     path = FGSEA_DIR / tissue / f"{mission}_fgsea_{db}.csv"
+     if not path.exists():
+         raise FileNotFoundError(f"fGSEA file not found: {path}")
+     return pd.read_csv(path)
+
+
+ def load_all_fgsea(tissue: str, db: str = "hallmark") -> pd.DataFrame:
+     """Load all fGSEA results for a tissue across all available missions."""
+     dfs = []
+     for mission in TISSUE_MISSIONS.get(tissue, []):
+         path = FGSEA_DIR / tissue / f"{mission}_fgsea_{db}.csv"
+         if path.exists():
+             dfs.append(pd.read_csv(path))
+     if not dfs:
+         return pd.DataFrame()
+     return pd.concat(dfs, ignore_index=True)
+
+
+ def get_pathway_directions(
+     tissue: str,
+     db: str = "hallmark",
+     padj_threshold: float = 0.05,
+ ) -> Dict[str, Dict[str, str]]:
+     """Return pathway directions per mission.
+
+     Returns:
+         {mission: {pathway: "UP"/"DOWN"/"NS"}}
+     Only pathways with padj < threshold get UP/DOWN; rest are NS.
+     """
+     df = load_all_fgsea(tissue, db)
+     if df.empty:
+         return {}
+
+     result: Dict[str, Dict[str, str]] = {}
+     for mission, mdf in df.groupby("mission"):
+         directions: Dict[str, str] = {}
+         for _, row in mdf.iterrows():
+             if pd.notna(row["padj"]) and row["padj"] < padj_threshold:
+                 directions[row["pathway"]] = "UP" if row["NES"] > 0 else "DOWN"
+             else:
+                 directions[row["pathway"]] = "NS"
+         result[str(mission)] = directions
+     return result
+
+
+ def get_consensus_directions(
+     tissue: str,
+     db: str = "hallmark",
+     min_missions: int = 2,
+     padj_threshold: float = 0.05,
+ ) -> Dict[str, Dict]:
+     """Return pathways with consensus direction across missions.
+
+     Only includes pathways where >= min_missions agree on direction
+     and the majority direction has more votes than the opposite.
+
+     Returns:
+         {pathway: {
+             direction: "UP"/"DOWN",
+             n_agree: int,
+             n_disagree: int,
+             n_ns: int,
+             missions_agree: List[str],
+             missions_disagree: List[str],
+         }}
+     """
+     all_dirs = get_pathway_directions(tissue, db, padj_threshold)
+     if not all_dirs:
+         return {}
+
+     # Collect per-pathway votes
+     pathway_votes: Dict[str, Dict[str, List[str]]] = {}
+     for mission, pmap in all_dirs.items():
+         for pathway, direction in pmap.items():
+             if pathway not in pathway_votes:
+                 pathway_votes[pathway] = {"UP": [], "DOWN": [], "NS": []}
+             pathway_votes[pathway][direction].append(mission)
+
+     consensus: Dict[str, Dict] = {}
+     for pathway, votes in pathway_votes.items():
+         n_up = len(votes["UP"])
+         n_down = len(votes["DOWN"])
+         n_ns = len(votes["NS"])
+
+         if n_up >= min_missions and n_up > n_down:
+             consensus[pathway] = {
+                 "direction": "UP",
+                 "n_agree": n_up,
+                 "n_disagree": n_down,
+                 "n_ns": n_ns,
+                 "missions_agree": votes["UP"],
+                 "missions_disagree": votes["DOWN"],
+             }
+         elif n_down >= min_missions and n_down > n_up:
+             consensus[pathway] = {
+                 "direction": "DOWN",
+                 "n_agree": n_down,
+                 "n_disagree": n_up,
+                 "n_ns": n_ns,
+                 "missions_agree": votes["DOWN"],
+                 "missions_disagree": votes["UP"],
+             }
+     return consensus
+
+
+ def get_disagreeing_pathways(
+     tissue: str,
+     db: str = "hallmark",
+     padj_threshold: float = 0.05,
+ ) -> Dict[str, Dict]:
+     """Return pathways where missions disagree on direction.
+
+     These are ideal for uncertainty questions — the model should
+     express uncertainty about direction.
+
+     Returns:
+         {pathway: {
+             missions_up: List[str],
+             missions_down: List[str],
+             missions_ns: List[str],
+         }}
+     """
+     all_dirs = get_pathway_directions(tissue, db, padj_threshold)
+     if not all_dirs:
+         return {}
+
+     pathway_votes: Dict[str, Dict[str, List[str]]] = {}
+     for mission, pmap in all_dirs.items():
+         for pathway, direction in pmap.items():
+             if pathway not in pathway_votes:
+                 pathway_votes[pathway] = {"UP": [], "DOWN": [], "NS": []}
+             pathway_votes[pathway][direction].append(mission)
+
+     disagreeing: Dict[str, Dict] = {}
+     for pathway, votes in pathway_votes.items():
+         if votes["UP"] and votes["DOWN"]:
+             disagreeing[pathway] = {
+                 "missions_up": votes["UP"],
+                 "missions_down": votes["DOWN"],
+                 "missions_ns": votes["NS"],
+             }
+     return disagreeing
+
+
+ def load_gsva_scores(
+     tissue: str,
+     mission: str,
+     db: str = "hallmark",
+ ) -> pd.DataFrame:
+     """Load GSVA pathway scores (samples × pathways)."""
+     path = GSVA_DIR / tissue / f"{mission}_gsva_{db}.csv"
+     if not path.exists():
+         raise FileNotFoundError(f"GSVA file not found: {path}")
+     return pd.read_csv(path, index_col=0)
+
+
+ def load_lomo_splits(tissue: str) -> List[Dict]:
+     """Load LOMO fold definitions from task_info.json."""
+     task_id = TISSUE_TASK_MAP.get(tissue)
+     if not task_id:
+         return []
+     task_dir = TASKS_DIR / f"{task_id}_{tissue}_lomo"
+     info_path = task_dir / "task_info.json"
+     if not info_path.exists():
+         return []
+     with open(info_path) as f:
+         info = json.load(f)
+     return info.get("folds", [])
+
+
+ def load_nes_conservation(db: str = "hallmark") -> Dict:
+     """Load NES conservation analysis (cross-mission correlation data)."""
+     path = EVAL_DIR / f"NES_conservation_{db}.json"
+     if not path.exists():
+         return {}
+     with open(path) as f:
+         return json.load(f)
+
+
+ def get_all_pathways(tissue: str, db: str = "hallmark") -> List[str]:
+     """Get sorted list of all pathway names for a tissue/db combo."""
+     df = load_all_fgsea(tissue, db)
+     if df.empty:
+         return []
+     return sorted(df["pathway"].unique().tolist())
+
+
+ def get_pathway_nes_matrix(
+     tissue: str,
+     db: str = "hallmark",
+ ) -> pd.DataFrame:
+     """Return a mission × pathway NES matrix for a tissue.
+
+     Useful for visualizing pathway behavior across missions.
+     """
+     df = load_all_fgsea(tissue, db)
+     if df.empty:
+         return pd.DataFrame()
+     return df.pivot_table(
+         index="mission", columns="pathway", values="NES", aggfunc="first",
+     )
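The heart of `get_consensus_directions` is a majority vote over per-mission UP/DOWN/NS calls: a pathway gets a consensus direction only when at least `min_missions` missions agree and that direction outvotes its opposite. A self-contained sketch of just the voting step, using made-up mission/pathway labels (no pandas or file I/O required):

```python
from collections import defaultdict


def consensus(votes_by_mission, min_missions=2):
    """Majority-vote a direction per pathway, mirroring get_consensus_directions.

    votes_by_mission: {mission: {pathway: "UP"/"DOWN"/"NS"}}
    Returns {pathway: "UP"/"DOWN"} for pathways clearing both thresholds.
    """
    votes = defaultdict(lambda: {"UP": [], "DOWN": [], "NS": []})
    for mission, pmap in votes_by_mission.items():
        for pathway, direction in pmap.items():
            votes[pathway][direction].append(mission)

    out = {}
    for pathway, v in votes.items():
        n_up, n_down = len(v["UP"]), len(v["DOWN"])
        # Require a quorum AND a strict majority over the opposite direction
        if n_up >= min_missions and n_up > n_down:
            out[pathway] = "UP"
        elif n_down >= min_missions and n_down > n_up:
            out[pathway] = "DOWN"
    return out


example = {
    "RR-1": {"OXPHOS": "UP", "MYC_TARGETS": "NS"},
    "RR-3": {"OXPHOS": "UP", "MYC_TARGETS": "DOWN"},
    "RR-6": {"OXPHOS": "DOWN", "MYC_TARGETS": "DOWN"},
}
print(consensus(example))  # OXPHOS: UP (2 vs 1), MYC_TARGETS: DOWN (2 vs 0)
```

Note that NS votes count toward neither side, so a pathway significant in only two of six missions can still reach consensus; `get_disagreeing_pathways` catches the complementary case where UP and DOWN both have votes.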
src/biorlhf/data/grpo_dataset.py ADDED
@@ -0,0 +1,219 @@
1
+ """
2
+ Unified GRPO dataset builder for BioGRPO.
3
+
4
+ Merges pathway questions from GeneLab, calibration tasks from BioEval,
5
+ and domain questions from SpaceOmicsBench into a single TRL-compatible
6
+ dataset with multi-dimensional ground truth.
7
+ """
8
+
9
+ import os
10
+ from pathlib import Path
11
+ from typing import List, Dict, Optional, Tuple
12
+ import json
13
+
14
+ from datasets import Dataset as HFDataset
15
+
16
+ from biorlhf.data.question_generator import generate_all_questions
17
+
18
+ # ── External data paths (configurable via env vars for HPC) ───────────────
19
+ BIOEVAL_DATA = Path(os.environ.get(
20
+ "BIOEVAL_DATA",
21
+ "/Users/jak4013/Dropbox/Bioinformatics/Claude/Evaluation_model/BioEval/data",
22
+ ))
23
+ SPACEOMICS_DATA = Path(os.environ.get(
24
+ "SPACEOMICS_DATA",
25
+ "/Users/jak4013/Dropbox/Bioinformatics/Claude/SpaceOmicsBench/v3/evaluation/llm",
26
+ ))
27
+
28
+
29
+ def load_bioeval_for_grpo() -> List[Dict]:
30
+ """Load BioEval tasks that have verifiable ground truth.
31
+
32
+ Selects:
33
+ - calibration tasks (30) → V4 training
34
+ - bioambiguity tasks (45) → V3 training
35
+ - Other verifiable tasks → V2 training
36
+ """
37
+ samples: List[Dict] = []
38
+ base_path = BIOEVAL_DATA / "bioeval_v060_base.jsonl"
39
+ if not base_path.exists():
40
+ return samples
41
+
42
+ with open(base_path) as f:
43
+ for line in f:
44
+ task = json.loads(line)
45
+ component = task.get("component", "")
46
+ prompt = task.get("prompt", "")
47
+ gt = task.get("ground_truth", "{}")
48
+
49
+ # Ensure ground_truth is a JSON string
50
+ gt_str = json.dumps(gt) if isinstance(gt, dict) else gt
51
+
52
+ if component == "calibration":
53
+ samples.append({
54
+ "prompt": prompt,
55
+ "ground_truth": gt_str,
56
+ "question_type": "calibration",
57
+ "applicable_verifiers": json.dumps(["V4"]),
58
+ "source": "bioeval",
59
+ "tissue": "general",
60
+ "difficulty": "medium",
61
+ })
62
+ elif component == "bioambiguity":
63
+ samples.append({
64
+ "prompt": prompt,
65
+ "ground_truth": gt_str,
66
+ "question_type": "context_dependent",
67
+ "applicable_verifiers": json.dumps(["V3", "V4"]),
68
+ "source": "bioeval",
69
+ "tissue": "general",
70
+ "difficulty": "hard",
71
+ })
72
+ elif component in ("causalbio", "designcheck", "adversarial"):
73
+ samples.append({
74
+ "prompt": prompt,
75
+ "ground_truth": gt_str,
76
+ "question_type": component,
77
+ "applicable_verifiers": json.dumps(["V2"]),
78
+ "source": "bioeval",
79
+ "tissue": "general",
80
+ "difficulty": "hard" if component == "adversarial" else "medium",
81
+ })
82
+
83
+ return samples
84
+
85
+
86
+ def load_spaceomics_for_grpo() -> List[Dict]:
87
+ """Load SpaceOmicsBench v3 questions with ground truth."""
88
+ samples: List[Dict] = []
89
+ qbank_path = SPACEOMICS_DATA / "question_bank_v3.json"
90
+ if not qbank_path.exists():
91
+         return samples
+
+     with open(qbank_path) as f:
+         qbank = json.load(f)
+
+     questions = qbank.get("questions", [])
+     for q in questions:
+         gt = {
+             "key_facts": q.get("ground_truth_key_facts", []),
+             "expected_reasoning": q.get("expected_reasoning", []),
+         }
+
+         verifiers = ["V2"]
+         if q.get("requires_uncertainty_calibration", False):
+             verifiers.append("V4")
+             gt["expected_confidence"] = "medium"
+
+         samples.append({
+             "prompt": q["question"],
+             "ground_truth": json.dumps(gt),
+             "question_type": q.get("category", "factual"),
+             "applicable_verifiers": json.dumps(verifiers),
+             "source": "spaceomics",
+             "tissue": "general",
+             "difficulty": q.get("difficulty", "medium"),
+         })
+
+     return samples
+
+
+ def build_grpo_dataset(
+     db: str = "hallmark",
+     seed: int = 42,
+     hold_out_tissues: Optional[List[str]] = None,
+ ) -> Tuple[HFDataset, HFDataset]:
+     """Build the full GRPO training dataset with train/eval split.
+
+     Args:
+         db: Pathway database to use for GeneLab questions.
+         seed: Random seed for splitting.
+         hold_out_tissues: If set, questions from these tissues go to eval.
+             Otherwise uses random 10% split.
+
+     Returns:
+         (train_dataset, eval_dataset) as HuggingFace Datasets.
+
+     Dataset columns (TRL-compatible):
+         - prompt: str (required by GRPOTrainer)
+         - ground_truth: str (JSON, forwarded to reward function)
+         - question_type: str (forwarded to reward function)
+         - applicable_verifiers: str (JSON list, forwarded to reward function)
+         - source: str ("genelab", "bioeval", "spaceomics")
+         - tissue: str (for LOMO splitting)
+         - difficulty: str ("easy", "medium", "hard")
+     """
+     all_samples: List[Dict] = []
+
+     # 1. GeneLab pathway questions
+     genelab_qs = generate_all_questions(db)
+     for q in genelab_qs:
+         all_samples.append({
+             "prompt": q.prompt,
+             "ground_truth": json.dumps(q.ground_truth),
+             "question_type": q.question_type,
+             "applicable_verifiers": json.dumps(q.applicable_verifiers),
+             "source": "genelab",
+             "tissue": q.tissue,
+             "difficulty": q.difficulty,
+         })
+
+     # 2. BioEval tasks
+     all_samples.extend(load_bioeval_for_grpo())
+
+     # 3. SpaceOmicsBench questions
+     all_samples.extend(load_spaceomics_for_grpo())
+
+     if not all_samples:
+         raise ValueError("No training samples generated. Check data paths.")
+
+     # Convert to HF Dataset
+     full_dataset = HFDataset.from_list(all_samples)
+
+     # Split strategy
+     if hold_out_tissues:
+         train_indices = []
+         eval_indices = []
+         for i, sample in enumerate(all_samples):
+             if sample["tissue"] in hold_out_tissues:
+                 eval_indices.append(i)
+             else:
+                 train_indices.append(i)
+         if not eval_indices:
+             # Fallback: random split if no matching tissues
+             split = full_dataset.train_test_split(test_size=0.1, seed=seed)
+             return split["train"], split["test"]
+         train_dataset = full_dataset.select(train_indices)
+         eval_dataset = full_dataset.select(eval_indices)
+     else:
+         split = full_dataset.train_test_split(test_size=0.1, seed=seed)
+         train_dataset = split["train"]
+         eval_dataset = split["test"]
+
+     return train_dataset, eval_dataset
+
+
+ def get_dataset_stats(dataset: HFDataset) -> Dict:
+     """Return summary statistics for a GRPO dataset."""
+     sources = {}
+     types = {}
+     tissues = {}
+     difficulties = {}
+
+     for sample in dataset:
+         src = sample["source"]
+         sources[src] = sources.get(src, 0) + 1
+         qt = sample["question_type"]
+         types[qt] = types.get(qt, 0) + 1
+         t = sample["tissue"]
+         tissues[t] = tissues.get(t, 0) + 1
+         d = sample["difficulty"]
+         difficulties[d] = difficulties.get(d, 0) + 1
+
+     return {
+         "total": len(dataset),
+         "by_source": sources,
+         "by_question_type": types,
+         "by_tissue": tissues,
+         "by_difficulty": difficulties,
+     }
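To see what `get_dataset_stats` reports, the same per-column tallying can be sketched standalone with `collections.Counter` over hand-built toy rows (the real rows come from `build_grpo_dataset`; the toy values here are illustrative only):

```python
from collections import Counter

# Toy samples mirroring the dataset columns listed in the build_grpo_dataset docstring.
samples = [
    {"source": "genelab", "question_type": "direction", "tissue": "liver", "difficulty": "easy"},
    {"source": "genelab", "question_type": "uncertainty", "tissue": "liver", "difficulty": "hard"},
    {"source": "spaceomics", "question_type": "factual", "tissue": "general", "difficulty": "medium"},
]

# Same aggregation get_dataset_stats performs with plain dicts.
stats = {
    "total": len(samples),
    "by_source": dict(Counter(s["source"] for s in samples)),
    "by_question_type": dict(Counter(s["question_type"] for s in samples)),
    "by_tissue": dict(Counter(s["tissue"] for s in samples)),
    "by_difficulty": dict(Counter(s["difficulty"] for s in samples)),
}
print(stats["by_source"])  # {'genelab': 2, 'spaceomics': 1}
```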
src/biorlhf/data/question_generator.py ADDED
@@ -0,0 +1,264 @@
+ """
+ Pathway reasoning question generator for BioGRPO.
+
+ Generates verifiable QA pairs from GeneLab fGSEA pathway data.
+ Each question has structured ground truth for scoring by the verifier stack.
+ """
+
+ from typing import List, Dict, Set
+ from dataclasses import dataclass, field
+
+ from biorlhf.data.genelabloader import (
+     get_consensus_directions,
+     get_disagreeing_pathways,
+     get_pathway_directions,
+     load_nes_conservation,
+     TISSUE_MISSIONS,
+ )
+
+
+ @dataclass
+ class GRPOQuestion:
+     """A question with verifiable ground truth for GRPO training."""
+     prompt: str
+     ground_truth: Dict
+     tissue: str
+     db: str
+     question_type: str  # "direction", "comparison", "consistency", "uncertainty"
+     applicable_verifiers: List[str]
+     difficulty: str  # "easy", "medium", "hard"
+     metadata: Dict = field(default_factory=dict)
+
+
+ def _clean_pathway_name(pathway: str) -> str:
+     """HALLMARK_OXIDATIVE_PHOSPHORYLATION → Oxidative Phosphorylation"""
+     for prefix in ("HALLMARK_", "KEGG_", "REACTOME_", "MITOCARTA_"):
+         pathway = pathway.replace(prefix, "")
+     return pathway.replace("_", " ").title()
+
+
+ # ── Question generators ────────────────────────────────────────────────────
+
+ def generate_direction_questions(
+     tissue: str,
+     db: str = "hallmark",
+     padj_threshold: float = 0.05,
+ ) -> List[GRPOQuestion]:
+     """Generate V1-targetable questions about pathway direction."""
+     consensus = get_consensus_directions(tissue, db, min_missions=2, padj_threshold=padj_threshold)
+     questions: List[GRPOQuestion] = []
+
+     for pathway, info in consensus.items():
+         pw = _clean_pathway_name(pathway)
+         n_agree = info["n_agree"]
+
+         # Type 1: Direct direction question (easy/medium)
+         questions.append(GRPOQuestion(
+             prompt=(
+                 f"In mouse {tissue} tissue during spaceflight, is the "
+                 f"{pw} pathway upregulated or downregulated based on "
+                 f"gene set enrichment analysis? "
+                 f"Provide your confidence level."
+             ),
+             ground_truth={
+                 "pathway": pathway,
+                 "direction": info["direction"],
+                 "n_supporting_missions": n_agree,
+                 "expected_confidence": "high" if n_agree >= 3 else "medium",
+             },
+             tissue=tissue,
+             db=db,
+             question_type="direction",
+             applicable_verifiers=["V1", "V4"],
+             difficulty="easy" if n_agree >= 3 else "medium",
+         ))
+
+         # Type 2: Mechanistic reasoning question (medium/hard)
+         direction_word = "activation" if info["direction"] == "UP" else "suppression"
+         questions.append(GRPOQuestion(
+             prompt=(
+                 f"Explain the biological significance of {pw} pathway "
+                 f"{direction_word} in mouse {tissue} under spaceflight conditions. "
+                 f"What mechanisms might drive this change? "
+                 f"State your confidence in the direction and magnitude."
+             ),
+             ground_truth={
+                 "pathway": pathway,
+                 "direction": info["direction"],
+                 "n_supporting_missions": n_agree,
+                 "requires_mechanism": True,
+                 "expected_confidence": "medium",
+             },
+             tissue=tissue,
+             db=db,
+             question_type="direction",
+             applicable_verifiers=["V1", "V2", "V4"],
+             difficulty="medium" if n_agree >= 3 else "hard",
+         ))
+
+     return questions
+
+
+ def generate_comparison_questions(
+     db: str = "hallmark",
+     padj_threshold: float = 0.05,
+ ) -> List[GRPOQuestion]:
+     """Generate cross-tissue comparison questions (V1 + V3 targetable)."""
+     questions: List[GRPOQuestion] = []
+
+     # Collect consensus directions across tissues (exclude skin subsites for cleaner Qs)
+     tissue_dirs: Dict[str, Dict[str, Dict]] = {}
+     comparison_tissues = ["liver", "gastrocnemius", "kidney", "thymus", "eye"]
+     for tissue in comparison_tissues:
+         consensus = get_consensus_directions(tissue, db, min_missions=2, padj_threshold=padj_threshold)
+         if consensus:
+             tissue_dirs[tissue] = consensus
+
+     if len(tissue_dirs) < 2:
+         return questions
+
+     # Find pathways in 2+ tissues
+     all_pathways: Set[str] = set()
+     for dirs in tissue_dirs.values():
+         all_pathways.update(dirs.keys())
+
+     for pathway in sorted(all_pathways):
+         tissues_with = {
+             t: d[pathway]
+             for t, d in tissue_dirs.items()
+             if pathway in d
+         }
+         if len(tissues_with) < 2:
+             continue
+
+         pw = _clean_pathway_name(pathway)
+         tissue_list = sorted(tissues_with.keys())
+         directions_set = {info["direction"] for info in tissues_with.values()}
+         is_consistent = len(directions_set) == 1
+
+         questions.append(GRPOQuestion(
+             prompt=(
+                 f"Compare the response of the {pw} pathway to spaceflight "
+                 f"across {', '.join(tissue_list)} tissues in mice. "
+                 f"Is the direction of change consistent or tissue-specific? "
+                 f"Explain the biological basis for any differences."
+             ),
+             ground_truth={
+                 "pathway": pathway,
+                 "tissue_directions": {
+                     t: info["direction"] for t, info in tissues_with.items()
+                 },
+                 "is_consistent": is_consistent,
+                 "n_tissues": len(tissues_with),
+             },
+             tissue="multi",
+             db=db,
+             question_type="comparison",
+             applicable_verifiers=["V1", "V3", "V4"],
+             difficulty="hard",
+         ))
+
+     return questions
+
+
+ def generate_uncertainty_questions(
+     tissue: str,
+     db: str = "hallmark",
+     padj_threshold: float = 0.05,
+ ) -> List[GRPOQuestion]:
+     """Generate questions where missions disagree → model should express uncertainty."""
+     disagreeing = get_disagreeing_pathways(tissue, db, padj_threshold)
+     questions: List[GRPOQuestion] = []
+
+     for pathway, info in disagreeing.items():
+         pw = _clean_pathway_name(pathway)
+         questions.append(GRPOQuestion(
+             prompt=(
+                 f"Is the {pw} pathway consistently activated or suppressed "
+                 f"in mouse {tissue} across different spaceflight missions? "
+                 f"How confident are you in the direction of change?"
+             ),
+             ground_truth={
+                 "pathway": pathway,
+                 "missions_up": info["missions_up"],
+                 "missions_down": info["missions_down"],
+                 "missions_ns": info["missions_ns"],
+                 "correct_behavior": "context_dependent",
+                 "expected_confidence": "low",
+             },
+             tissue=tissue,
+             db=db,
+             question_type="uncertainty",
+             applicable_verifiers=["V1", "V4"],
+             difficulty="hard",
+         ))
+
+     return questions
+
+
+ def generate_conservation_questions(
+     db: str = "hallmark",
+ ) -> List[GRPOQuestion]:
+     """Generate questions about NES conservation across missions."""
+     conservation = load_nes_conservation(db)
+     if not conservation:
+         return []
+
+     questions: List[GRPOQuestion] = []
+     data = conservation.get("data", conservation)
+
+     for tissue, info in data.items():
+         if not isinstance(info, dict):
+             continue
+         mean_r = info.get("nes_mean_r")
+         if mean_r is None:
+             continue
+
+         if mean_r > 0.5:
+             conservation_level = "highly conserved"
+             expected_conf = "high"
+         elif mean_r > 0.2:
+             conservation_level = "moderately conserved"
+             expected_conf = "medium"
+         else:
+             conservation_level = "poorly conserved"
+             expected_conf = "medium"
+
+         questions.append(GRPOQuestion(
+             prompt=(
+                 f"How conserved are pathway-level responses to spaceflight "
+                 f"across different missions in mouse {tissue}? "
+                 f"Are the enrichment patterns reproducible?"
+             ),
+             ground_truth={
+                 "tissue": tissue,
+                 "nes_mean_r": mean_r,
+                 "conservation_level": conservation_level,
+                 "expected_confidence": expected_conf,
+                 "key_facts": [
+                     f"Mean pairwise NES correlation across missions is {mean_r:.3f}",
+                     f"Pathway responses in {tissue} are {conservation_level}",
+                 ],
+             },
+             tissue=tissue,
+             db=db,
+             question_type="direction",
+             applicable_verifiers=["V2", "V4"],
+             difficulty="medium",
+         ))
+
+     return questions
+
+
+ def generate_all_questions(db: str = "hallmark") -> List[GRPOQuestion]:
+     """Generate the full question set from GeneLab data."""
+     all_q: List[GRPOQuestion] = []
+
+     for tissue in TISSUE_MISSIONS:
+         all_q.extend(generate_direction_questions(tissue, db))
+         all_q.extend(generate_uncertainty_questions(tissue, db))
+
+     all_q.extend(generate_comparison_questions(db))
+     all_q.extend(generate_conservation_questions(db))
+
+     return all_q
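The prompt-text normalization in `_clean_pathway_name` can be checked standalone (same logic copied out of the module so it runs without the package); note that `str.title()` lowercases acronyms, which matters for pathways like `REACTOME_DNA_REPAIR`:

```python
def clean_pathway_name(pathway: str) -> str:
    # Same logic as _clean_pathway_name: strip known database prefixes,
    # then replace underscores and title-case the remainder.
    for prefix in ("HALLMARK_", "KEGG_", "REACTOME_", "MITOCARTA_"):
        pathway = pathway.replace(prefix, "")
    return pathway.replace("_", " ").title()

print(clean_pathway_name("HALLMARK_OXIDATIVE_PHOSPHORYLATION"))  # Oxidative Phosphorylation
print(clean_pathway_name("REACTOME_DNA_REPAIR"))  # Dna Repair — .title() lowercases the acronym
```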
src/biorlhf/evaluation/__init__.py CHANGED
@@ -1,8 +1,14 @@
  """Evaluation modules for BioRLHF."""
 
- from biorlhf.evaluation.evaluate import evaluate_model, compute_metrics
-
  __all__ = [
      "evaluate_model",
      "compute_metrics",
  ]
+
+
+ def __getattr__(name):
+     """Lazy imports for torch-dependent modules."""
+     if name in ("evaluate_model", "compute_metrics"):
+         from biorlhf.evaluation.evaluate import evaluate_model, compute_metrics
+         return {"evaluate_model": evaluate_model, "compute_metrics": compute_metrics}[name]
+     raise AttributeError(f"module 'biorlhf.evaluation' has no attribute {name!r}")
src/biorlhf/evaluation/calibration.py ADDED
@@ -0,0 +1,184 @@
+ """
+ Calibration evaluation metrics for BioGRPO.
+
+ Implements Expected Calibration Error (ECE), Brier score, overconfidence
+ rate, and reliability diagram data generation.
+ """
+
+ from typing import Dict, List, Tuple
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class CalibrationMetrics:
+     """Aggregated calibration metrics."""
+     ece: float  # Expected Calibration Error
+     mce: float  # Maximum Calibration Error
+     brier_score: float
+     overconfidence_rate: float  # P(wrong | confidence > threshold)
+     underconfidence_rate: float  # P(correct | confidence < threshold)
+     mean_confidence: float
+     mean_accuracy: float
+     n_samples: int
+     reliability_bins: List[Dict]  # For plotting reliability diagrams
+
+
+ def compute_ece(
+     confidences: List[float],
+     correctnesses: List[bool],
+     n_bins: int = 10,
+ ) -> Tuple[float, float, List[Dict]]:
+     """Compute Expected and Maximum Calibration Error.
+
+     Uses equal-width binning.
+
+     Args:
+         confidences: Model's stated confidence for each prediction (0-1).
+         correctnesses: Whether each prediction was correct.
+         n_bins: Number of calibration bins.
+
+     Returns:
+         (ECE, MCE, bin_data) where bin_data is list of dicts for plotting.
+     """
+     if not confidences:
+         return 0.0, 0.0, []
+
+     bin_width = 1.0 / n_bins
+     bins: List[Dict] = []
+
+     for i in range(n_bins):
+         bin_lower = i * bin_width
+         bin_upper = (i + 1) * bin_width
+
+         # Find samples in this bin
+         indices = [
+             j for j, c in enumerate(confidences)
+             if bin_lower <= c < bin_upper or (i == n_bins - 1 and c == 1.0)
+         ]
+
+         if not indices:
+             bins.append({
+                 "bin_lower": bin_lower,
+                 "bin_upper": bin_upper,
+                 "mean_confidence": (bin_lower + bin_upper) / 2,
+                 "mean_accuracy": 0.0,
+                 "count": 0,
+                 "calibration_error": 0.0,
+             })
+             continue
+
+         bin_confs = [confidences[j] for j in indices]
+         bin_accs = [float(correctnesses[j]) for j in indices]
+         mean_conf = sum(bin_confs) / len(bin_confs)
+         mean_acc = sum(bin_accs) / len(bin_accs)
+
+         bins.append({
+             "bin_lower": bin_lower,
+             "bin_upper": bin_upper,
+             "mean_confidence": mean_conf,
+             "mean_accuracy": mean_acc,
+             "count": len(indices),
+             "calibration_error": abs(mean_acc - mean_conf),
+         })
+
+     # ECE: weighted average of calibration errors
+     total_samples = len(confidences)
+     ece = sum(
+         b["count"] / total_samples * b["calibration_error"]
+         for b in bins if b["count"] > 0
+     )
+
+     # MCE: maximum calibration error across non-empty bins
+     non_empty_errors = [b["calibration_error"] for b in bins if b["count"] > 0]
+     mce = max(non_empty_errors) if non_empty_errors else 0.0
+
+     return ece, mce, bins
+
+
+ def compute_brier_score(
+     confidences: List[float],
+     correctnesses: List[bool],
+ ) -> float:
+     """Compute Brier score: mean squared error between confidence and outcome.
+
+     Lower is better. Range [0, 1].
+     """
+     if not confidences:
+         return 0.0
+     n = len(confidences)
+     return sum(
+         (c - float(o)) ** 2 for c, o in zip(confidences, correctnesses)
+     ) / n
+
+
+ def compute_overconfidence_rate(
+     confidences: List[float],
+     correctnesses: List[bool],
+     threshold: float = 0.8,
+ ) -> float:
+     """P(wrong | confidence > threshold).
+
+     A high overconfidence rate indicates the model is unreliably confident.
+     """
+     high_conf = [
+         (c, o) for c, o in zip(confidences, correctnesses) if c > threshold
+     ]
+     if not high_conf:
+         return 0.0
+     wrong = sum(1 for _, o in high_conf if not o)
+     return wrong / len(high_conf)
+
+
+ def compute_underconfidence_rate(
+     confidences: List[float],
+     correctnesses: List[bool],
+     threshold: float = 0.3,
+ ) -> float:
+     """P(correct | confidence < threshold).
+
+     A high underconfidence rate means the model knows more than it admits.
+     """
+     low_conf = [
+         (c, o) for c, o in zip(confidences, correctnesses) if c < threshold
+     ]
+     if not low_conf:
+         return 0.0
+     correct = sum(1 for _, o in low_conf if o)
+     return correct / len(low_conf)
+
+
+ def compute_calibration_metrics(
+     confidences: List[float],
+     correctnesses: List[bool],
+     n_bins: int = 10,
+     overconf_threshold: float = 0.8,
+     underconf_threshold: float = 0.3,
+ ) -> CalibrationMetrics:
+     """Compute full calibration metrics suite."""
+     if not confidences:
+         return CalibrationMetrics(
+             ece=0.0, mce=0.0, brier_score=0.0,
+             overconfidence_rate=0.0, underconfidence_rate=0.0,
+             mean_confidence=0.0, mean_accuracy=0.0,
+             n_samples=0, reliability_bins=[],
+         )
+
+     ece, mce, bins = compute_ece(confidences, correctnesses, n_bins)
+     brier = compute_brier_score(confidences, correctnesses)
+     overconf = compute_overconfidence_rate(confidences, correctnesses, overconf_threshold)
+     underconf = compute_underconfidence_rate(confidences, correctnesses, underconf_threshold)
+
+     mean_conf = sum(confidences) / len(confidences)
+     mean_acc = sum(float(c) for c in correctnesses) / len(correctnesses)
+
+     return CalibrationMetrics(
+         ece=ece,
+         mce=mce,
+         brier_score=brier,
+         overconfidence_rate=overconf,
+         underconfidence_rate=underconf,
+         mean_confidence=mean_conf,
+         mean_accuracy=mean_acc,
+         n_samples=len(confidences),
+         reliability_bins=bins,
+     )
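A quick worked example of what these metrics measure, with the two core formulas inlined so it runs without the package (the toy confidence values below are made up for illustration):

```python
# Five toy predictions: four stated at 0.9 confidence, one at 0.1.
confidences = [0.9, 0.9, 0.9, 0.9, 0.1]
correctnesses = [True, True, True, False, False]

# Brier score: mean squared gap between stated confidence and the 0/1 outcome,
# same formula as compute_brier_score.
brier = sum((c - float(o)) ** 2 for c, o in zip(confidences, correctnesses)) / len(confidences)

# With a single bin, ECE degenerates to |mean accuracy - mean confidence|,
# the quantity each bin contributes (weighted by count) in compute_ece.
mean_conf = sum(confidences) / len(confidences)
mean_acc = sum(map(float, correctnesses)) / len(correctnesses)
gap = abs(mean_acc - mean_conf)

print(round(brier, 3), round(gap, 3))  # 0.17 0.14 — the model is somewhat overconfident
```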
src/biorlhf/training/__init__.py CHANGED
@@ -1,11 +1,24 @@
  """Training modules for BioRLHF."""
 
- from biorlhf.training.sft import SFTTrainingConfig, run_sft_training
- from biorlhf.training.dpo import DPOTrainingConfig, run_dpo_training
-
  __all__ = [
      "SFTTrainingConfig",
      "run_sft_training",
      "DPOTrainingConfig",
      "run_dpo_training",
+     "BioGRPOConfig",
+     "run_grpo_training",
  ]
+
+
+ def __getattr__(name):
+     """Lazy imports for torch-dependent modules."""
+     if name in ("SFTTrainingConfig", "run_sft_training"):
+         from biorlhf.training.sft import SFTTrainingConfig, run_sft_training
+         return {"SFTTrainingConfig": SFTTrainingConfig, "run_sft_training": run_sft_training}[name]
+     elif name in ("DPOTrainingConfig", "run_dpo_training"):
+         from biorlhf.training.dpo import DPOTrainingConfig, run_dpo_training
+         return {"DPOTrainingConfig": DPOTrainingConfig, "run_dpo_training": run_dpo_training}[name]
+     elif name in ("BioGRPOConfig", "run_grpo_training"):
+         from biorlhf.training.grpo import BioGRPOConfig, run_grpo_training
+         return {"BioGRPOConfig": BioGRPOConfig, "run_grpo_training": run_grpo_training}[name]
+     raise AttributeError(f"module 'biorlhf.training' has no attribute {name!r}")
src/biorlhf/training/grpo.py ADDED
@@ -0,0 +1,284 @@
+ """
+ Group Relative Policy Optimization (GRPO) training for BioGRPO.
+
+ Uses TRL's GRPOTrainer with composable biological verifiers as reward functions.
+ Supports configurable G values, verifier weights, and LoRA parameters.
+ """
+
+ import os
+ from dataclasses import dataclass, field
+ from typing import Optional, List, Dict
+
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+ from peft import LoraConfig, PeftModel
+ from trl import GRPOTrainer, GRPOConfig
+
+ from biorlhf.verifiers.composer import make_grpo_reward_function
+ from biorlhf.data.grpo_dataset import build_grpo_dataset, get_dataset_stats
+
+
+ @dataclass
+ class BioGRPOConfig:
+     """Configuration for BioGRPO training."""
+
+     # Model settings
+     model_name: str = "mistralai/Mistral-7B-v0.3"
+     sft_model_path: Optional[str] = None
+     output_dir: str = "./biogrpo_model"
+
+     # GRPO hyperparameters
+     num_generations: int = 8
+     beta: float = 0.04
+     num_iterations: int = 1
+     scale_rewards: str = "group"
+     loss_type: str = "grpo"
+
+     # Training hyperparameters
+     num_epochs: int = 1
+     batch_size: int = 2
+     gradient_accumulation_steps: int = 8
+     learning_rate: float = 1e-6
+     max_completion_length: int = 1024
+     max_prompt_length: int = 512
+     warmup_ratio: float = 0.1
+
+     # LoRA settings
+     lora_r: int = 32
+     lora_alpha: int = 64
+     lora_dropout: float = 0.05
+
+     # Verifier configuration
+     verifier_weights: Optional[Dict[str, float]] = None
+     active_verifiers: Optional[List[str]] = None
+
+     # Data
+     pathway_db: str = "hallmark"
+     hold_out_tissues: Optional[List[str]] = None
+     seed: int = 42
+
+     # Quantization
+     use_4bit: bool = True
+
+     # Logging
+     wandb_project: str = "biogrpo"
+     wandb_run_name: str = "grpo_v1"
+     use_wandb: bool = True
+     logging_steps: int = 5
+     save_steps: int = 50
+     eval_steps: int = 50
+     save_total_limit: int = 3
+     log_completions: bool = True
+
+     # Memory optimization
+     use_vllm: bool = False
+     gradient_checkpointing: bool = True
+     bf16: bool = True
+
+
+ def run_grpo_training(config: Optional[BioGRPOConfig] = None) -> str:
+     """Run BioGRPO training.
+
+     Pipeline:
+         1. Build dataset from GeneLab + BioEval + SpaceOmicsBench
+         2. Create composed reward function from verifier stack
+         3. Load tokenizer and configure GRPOTrainer with LoRA
+         4. Train and save model
+
+     Args:
+         config: Training configuration. Uses defaults if None.
+
+     Returns:
+         Path to the saved model directory.
+     """
+     if config is None:
+         config = BioGRPOConfig()
+
+     print("=" * 60)
+     print("BioGRPO Training")
+     print("=" * 60)
+     print(f"  Model: {config.model_name}")
+     print(f"  SFT checkpoint: {config.sft_model_path or 'None (from base)'}")
+     print(f"  G (generations): {config.num_generations}")
+     print(f"  Beta (KL): {config.beta}")
+     print(f"  Loss type: {config.loss_type}")
+     print(f"  Active verifiers: {config.active_verifiers or 'all (V1-V4)'}")
+     print(f"  Verifier weights: {config.verifier_weights or 'default'}")
+     print(f"  LoRA r/alpha: {config.lora_r}/{config.lora_alpha}")
+     print(f"  Learning rate: {config.learning_rate}")
+     print(f"  QLoRA 4-bit: {config.use_4bit}")
+     print(f"  Output: {config.output_dir}")
+     print("=" * 60)
+
+     # Initialize wandb
+     if config.use_wandb:
+         try:
+             import wandb
+             wandb.init(
+                 project=config.wandb_project,
+                 name=config.wandb_run_name,
+                 config={k: v for k, v in vars(config).items() if not k.startswith("_")},
+             )
+         except ImportError:
+             print("Warning: wandb not installed, disabling logging")
+             config.use_wandb = False
+
+     # 1. Build dataset
+     print("\n[1/6] Building GRPO dataset...")
+     train_dataset, eval_dataset = build_grpo_dataset(
+         db=config.pathway_db,
+         seed=config.seed,
+         hold_out_tissues=config.hold_out_tissues,
+     )
+     train_stats = get_dataset_stats(train_dataset)
+     eval_stats = get_dataset_stats(eval_dataset)
+     print(f"  Train: {train_stats['total']} samples")
+     print(f"    By source: {train_stats['by_source']}")
+     print(f"    By type: {train_stats['by_question_type']}")
+     print(f"  Eval: {eval_stats['total']} samples")
+
+     # 2. Create reward function
+     print("\n[2/6] Initializing verifier stack...")
+     reward_func = make_grpo_reward_function(
+         weights=config.verifier_weights,
+         active_verifiers=config.active_verifiers,
+     )
+     print(f"  Active: {config.active_verifiers or ['V1', 'V2', 'V3', 'V4']}")
+
+     # 3. Load tokenizer (always from base model; adapter dirs lack config.json)
+     print("\n[3/6] Loading tokenizer...")
+     tokenizer_source = config.model_name
+     tokenizer = AutoTokenizer.from_pretrained(tokenizer_source, trust_remote_code=True)
+     tokenizer.pad_token = tokenizer.eos_token
+     tokenizer.padding_side = "left"
+     print(f"  Tokenizer: {tokenizer.__class__.__name__}, vocab={tokenizer.vocab_size}")
+
+     # 4. Configure LoRA
+     peft_config = LoraConfig(
+         r=config.lora_r,
+         lora_alpha=config.lora_alpha,
+         target_modules=[
+             "q_proj", "k_proj", "v_proj", "o_proj",
+             "gate_proj", "up_proj", "down_proj",
+         ],
+         lora_dropout=config.lora_dropout,
+         bias="none",
+         task_type="CAUSAL_LM",
+     )
+
+     # 5. Load model (merge SFT adapter if present)
+     print("\n[4/6] Loading model...")
+
+     # QLoRA quantization config
+     bnb_config = None
+     if config.use_4bit:
+         bnb_config = BitsAndBytesConfig(
+             load_in_4bit=True,
+             bnb_4bit_quant_type="nf4",
+             bnb_4bit_compute_dtype=torch.bfloat16,
+             bnb_4bit_use_double_quant=True,
+         )
+
+     # Check if sft_model_path is a LoRA adapter or a full model
+     sft_is_adapter = (
+         config.sft_model_path
+         and os.path.isdir(config.sft_model_path)
+         and os.path.exists(os.path.join(config.sft_model_path, "adapter_config.json"))
+     )
+
+     if sft_is_adapter:
+         # Load base model, merge SFT adapter, then apply fresh LoRA for GRPO
+         print(f"  Loading base model: {config.model_name}")
+         base_model = AutoModelForCausalLM.from_pretrained(
+             config.model_name,
+             quantization_config=bnb_config,
+             torch_dtype=torch.bfloat16,
+             trust_remote_code=True,
+         )
+         print(f"  Loading SFT LoRA adapter: {config.sft_model_path}")
+         model = PeftModel.from_pretrained(base_model, config.sft_model_path)
+         print("  Merging SFT adapter into base model...")
+         model = model.merge_and_unload()
+         print("  SFT adapter merged successfully")
+     else:
+         # sft_model_path is a full model; otherwise fall back to the base model
+         model_path = config.sft_model_path or config.model_name
+         print(f"  Loading model: {model_path}")
+         model = AutoModelForCausalLM.from_pretrained(
+             model_path,
+             quantization_config=bnb_config,
+             torch_dtype=torch.bfloat16,
+             trust_remote_code=True,
+         )
+
+     # 6. Configure GRPOTrainer
+     print("\n[5/6] Configuring GRPOTrainer...")
+
+     grpo_config = GRPOConfig(
+         output_dir=config.output_dir,
+         num_train_epochs=config.num_epochs,
+         per_device_train_batch_size=config.batch_size,
+         gradient_accumulation_steps=config.gradient_accumulation_steps,
+         learning_rate=config.learning_rate,
+         warmup_ratio=config.warmup_ratio,
+         lr_scheduler_type="cosine",
+
+         # GRPO-specific
+         num_generations=config.num_generations,
+         beta=config.beta,
+         loss_type=config.loss_type,
+         max_completion_length=config.max_completion_length,
+         max_prompt_length=config.max_prompt_length,
+         num_iterations=config.num_iterations,
+         scale_rewards=config.scale_rewards,
+
+         # Memory/compute
+         gradient_checkpointing=config.gradient_checkpointing,
+         bf16=config.bf16,
+         use_vllm=config.use_vllm,
+
+         # Logging
+         logging_steps=config.logging_steps,
+         save_steps=config.save_steps,
+         save_total_limit=config.save_total_limit,
+         report_to="wandb" if config.use_wandb else "none",
+         run_name=config.wandb_run_name,
+
+         # Evaluation
+         eval_strategy="steps",
+         eval_steps=config.eval_steps,
+         log_completions=config.log_completions,
+     )
+
+     trainer = GRPOTrainer(
+         model=model,
+         args=grpo_config,
+         reward_funcs=reward_func,
+         train_dataset=train_dataset,
+         eval_dataset=eval_dataset,
+         peft_config=peft_config,
+         processing_class=tokenizer,
+     )
+
+     # Train
+     print("\n[6/6] Starting GRPO training...")
+     print("=" * 60)
+
+     trainer.train()
+
+     # Save
+     print(f"\nSaving model to {config.output_dir}")
+     trainer.save_model(config.output_dir)
+
+     if config.use_wandb:
+         try:
+             import wandb
+             wandb.finish()
+         except ImportError:
+             pass
+
+     print("\n" + "=" * 60)
+     print("BioGRPO Training complete!")
+     print("=" * 60)
+
+     return config.output_dir
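The commit message mentions a `biorlhf-grpo` CLI that accepts a JSON config such as `configs/grpo_full.json`. That loader is not shown in this diff, but one plausible way to map a JSON file onto the dataclass is to filter the parsed dict to declared field names before constructing it (sketch only; the reduced `BioGRPOConfig` below stands in for the real class, which has many more fields):

```python
import json
from dataclasses import dataclass, fields

# Reduced stand-in for BioGRPOConfig, for illustration.
@dataclass
class BioGRPOConfig:
    model_name: str = "mistralai/Mistral-7B-v0.3"
    num_generations: int = 8
    beta: float = 0.04
    learning_rate: float = 1e-6

raw = json.loads('{"num_generations": 8, "beta": 0.04, "learning_rate": 5e-7, "_comment": "ignored"}')

# Keep only keys the dataclass declares, so unknown JSON keys don't crash the constructor.
known = {f.name for f in fields(BioGRPOConfig)}
config = BioGRPOConfig(**{k: v for k, v in raw.items() if k in known})
print(config.learning_rate)  # 5e-07
```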
src/biorlhf/verifiers/__init__.py ADDED
@@ -0,0 +1,24 @@
+ """Composable biological verifiers for BioGRPO."""
+
+ from biorlhf.verifiers.base import BaseVerifier, VerifierResult
+ from biorlhf.verifiers.pathway import PathwayDirectionVerifier
+ from biorlhf.verifiers.factual import BiologicalFactVerifier
+ from biorlhf.verifiers.consistency import CrossContextConsistencyVerifier
+ from biorlhf.verifiers.uncertainty import UncertaintyVerifier
+ from biorlhf.verifiers.composer import (
+     VerifierComposer,
+     make_grpo_reward_function,
+     make_single_verifier_reward,
+ )
+
+ __all__ = [
+     "BaseVerifier",
+     "VerifierResult",
+     "PathwayDirectionVerifier",
+     "BiologicalFactVerifier",
+     "CrossContextConsistencyVerifier",
+     "UncertaintyVerifier",
+     "VerifierComposer",
+     "make_grpo_reward_function",
+     "make_single_verifier_reward",
+ ]
src/biorlhf/verifiers/base.py ADDED
@@ -0,0 +1,49 @@
+ """Abstract base class for biological verifiers."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import Dict, List
5
+ from dataclasses import dataclass, field
6
+
7
+
8
+ @dataclass
9
+ class VerifierResult:
10
+ """Result from a single verifier."""
11
+ score: float # 0.0 to 1.0
12
+ verifier_name: str
13
+ details: Dict = field(default_factory=dict)
14
+ applicable: bool = True # False if verifier doesn't apply
15
+
16
+
17
+ class BaseVerifier(ABC):
18
+ """Abstract base class for biological verifiers.
19
+
20
+ Each verifier scores a model completion against structured ground truth
21
+ on a specific dimension (pathway direction, factual accuracy, etc.).
22
+ """
23
+
24
+ name: str = "base"
25
+
26
+ @abstractmethod
27
+ def score(
28
+ self,
29
+ prompt: str,
30
+ completion: str,
31
+ ground_truth: Dict,
32
+ question_type: str,
33
+ ) -> VerifierResult:
34
+ """Score a single completion against ground truth.
35
+
36
+ Args:
37
+ prompt: The original question.
38
+ completion: The model's generated response.
39
+ ground_truth: Parsed ground truth dictionary.
40
+ question_type: Type of question for routing logic.
41
+
42
+ Returns:
43
+ VerifierResult with score in [0, 1].
44
+ """
45
+ raise NotImplementedError
46
+
47
+ def is_applicable(self, applicable_verifiers: List[str]) -> bool:
48
+ """Check if this verifier should score this question."""
49
+ return self.name in applicable_verifiers
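
The contract above is easiest to see with a toy subclass. The sketch below re-declares a minimal `VerifierResult` and `BaseVerifier` so it runs outside the `biorlhf` package; `KeywordVerifier` is a hypothetical example, not part of the repo:

```python
# Minimal standalone sketch of the BaseVerifier contract.
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Dict


@dataclass
class VerifierResult:
    score: float  # 0.0 to 1.0
    verifier_name: str
    details: Dict = field(default_factory=dict)
    applicable: bool = True


class BaseVerifier(ABC):
    name: str = "base"

    @abstractmethod
    def score(self, prompt: str, completion: str,
              ground_truth: Dict, question_type: str) -> VerifierResult:
        raise NotImplementedError


class KeywordVerifier(BaseVerifier):
    """Toy verifier: fraction of ground-truth keywords present in the completion."""
    name = "toy"

    def score(self, prompt, completion, ground_truth, question_type):
        kws = ground_truth.get("keywords", [])
        if not kws:
            # Mirror the repo convention: neutral score, applicable=False
            return VerifierResult(0.5, self.name, {"reason": "no_keywords"}, False)
        hits = sum(1 for k in kws if k.lower() in completion.lower())
        return VerifierResult(hits / len(kws), self.name, {"hits": hits})


r = KeywordVerifier().score(
    "q", "Bone loss and osteoclast activity",
    {"keywords": ["osteoclast", "bone", "RANKL"]}, "recall",
)
print(round(r.score, 2))  # → 0.67
```

The `applicable=False` escape hatch is what lets the composer skip a verifier for questions it cannot judge, rather than injecting a misleading 0.0.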
src/biorlhf/verifiers/composer.py ADDED
@@ -0,0 +1,227 @@
+ """
+ Verifier Composer: Weighted composition of V1-V4 into a TRL-compatible reward function.
+
+ This is THE critical integration point between the verifier stack and
+ TRL's GRPOTrainer. The reward function signature must match TRL's expected
+ interface exactly.
+ """
+
+ import json
+ from typing import Callable, Dict, List, Optional, Union
+ from dataclasses import dataclass
+
+ from biorlhf.verifiers.base import BaseVerifier, VerifierResult
+ from biorlhf.verifiers.pathway import PathwayDirectionVerifier
+ from biorlhf.verifiers.factual import BiologicalFactVerifier
+ from biorlhf.verifiers.consistency import CrossContextConsistencyVerifier
+ from biorlhf.verifiers.uncertainty import UncertaintyVerifier
+
+
+ @dataclass
+ class ComposedReward:
+     """Result of composed reward computation."""
+     total_reward: float
+     verifier_scores: Dict[str, float]
+     verifier_details: Dict[str, Dict]
+     weights_used: Dict[str, float]
+
+
+ # Default weights: factual signals dominate
+ DEFAULT_WEIGHTS = {
+     "V1": 0.35,  # Pathway direction (hard signal)
+     "V2": 0.30,  # Biological facts (soft signal)
+     "V3": 0.15,  # Cross-context consistency
+     "V4": 0.20,  # Uncertainty appropriateness
+ }
+
+
+ class VerifierComposer:
+     """Composes V1-V4 verifiers into a unified reward signal."""
+
+     def __init__(
+         self,
+         weights: Optional[Dict[str, float]] = None,
+         active_verifiers: Optional[List[str]] = None,
+     ):
+         all_verifiers: Dict[str, BaseVerifier] = {
+             "V1": PathwayDirectionVerifier(),
+             "V2": BiologicalFactVerifier(),
+             "V3": CrossContextConsistencyVerifier(),
+             "V4": UncertaintyVerifier(),
+         }
+
+         self.weights = dict(weights or DEFAULT_WEIGHTS)
+
+         # Filter to active verifiers if specified
+         if active_verifiers:
+             self.verifiers = {
+                 k: v for k, v in all_verifiers.items() if k in active_verifiers
+             }
+             # Renormalize weights
+             total_w = sum(self.weights.get(k, 0) for k in self.verifiers)
+             if total_w > 0:
+                 self.weights = {
+                     k: self.weights.get(k, 0) / total_w for k in self.verifiers
+                 }
+         else:
+             self.verifiers = all_verifiers
+
+     def compute_reward(
+         self,
+         prompt: str,
+         completion: str,
+         ground_truth: Union[str, Dict],
+         question_type: str,
+         applicable_verifiers: Union[str, List[str]],
+     ) -> ComposedReward:
+         """Compute the composed reward from all applicable verifiers.
+
+         Args:
+             prompt: The question text.
+             completion: Model's generated response.
+             ground_truth: Ground truth as a JSON string or parsed dict.
+             question_type: Question type for routing.
+             applicable_verifiers: Verifier names as a JSON string or list.
+         """
+         gt = json.loads(ground_truth) if isinstance(ground_truth, str) else ground_truth
+         applicable = (
+             json.loads(applicable_verifiers)
+             if isinstance(applicable_verifiers, str)
+             else applicable_verifiers
+         )
+
+         scores: Dict[str, float] = {}
+         details: Dict[str, Dict] = {}
+         weights_used: Dict[str, float] = {}
+
+         for vname, verifier in self.verifiers.items():
+             if not verifier.is_applicable(applicable):
+                 continue
+
+             result = verifier.score(prompt, completion, gt, question_type)
+
+             if not result.applicable:
+                 continue
+
+             scores[vname] = result.score
+             details[vname] = result.details
+             weights_used[vname] = self.weights.get(vname, 0)
+
+         # Compute weighted sum with renormalization
+         if not weights_used:
+             return ComposedReward(
+                 total_reward=0.0,
+                 verifier_scores=scores,
+                 verifier_details=details,
+                 weights_used=weights_used,
+             )
+
+         w_total = sum(weights_used.values())
+         if w_total > 0:
+             normalized = {k: v / w_total for k, v in weights_used.items()}
+         else:
+             normalized = weights_used
+
+         total = sum(scores[k] * normalized.get(k, 0) for k in scores)
+
+         return ComposedReward(
+             total_reward=total,
+             verifier_scores=scores,
+             verifier_details=details,
+             weights_used=normalized,
+         )
+
+
+ def make_grpo_reward_function(
+     weights: Optional[Dict[str, float]] = None,
+     active_verifiers: Optional[List[str]] = None,
+ ) -> Callable:
+     """Create a TRL-compatible reward function from the verifier composer.
+
+     TRL's GRPOTrainer calls reward functions with the signature:
+         reward_func(completions, **kwargs) -> list[float]
+     where kwargs include all dataset columns except "prompt".
+     Completions arrive either as a list of strings or as a list of
+     chat-message lists.
+
+     Note: TRL passes prompts separately. Dataset columns (ground_truth,
+     question_type, applicable_verifiers, etc.) are forwarded as kwargs.
+     """
+     composer = VerifierComposer(weights=weights, active_verifiers=active_verifiers)
+
+     def reward_func(
+         completions: List,
+         ground_truth: Optional[List[str]] = None,
+         question_type: Optional[List[str]] = None,
+         applicable_verifiers: Optional[List[str]] = None,
+         **kwargs,
+     ) -> List[float]:
+         """TRL-compatible reward function using composed biological verifiers.
+
+         Args:
+             completions: List of model completions (strings or chat messages).
+             ground_truth: List of JSON ground truth strings (from dataset).
+             question_type: List of question type strings (from dataset).
+             applicable_verifiers: List of JSON lists of verifier names.
+
+         Returns:
+             List of float rewards, one per completion.
+         """
+         rewards: List[float] = []
+         n = len(completions)
+
+         # Handle missing kwargs gracefully
+         if ground_truth is None:
+             ground_truth = ["{}"] * n
+         if question_type is None:
+             question_type = ["unknown"] * n
+         if applicable_verifiers is None:
+             applicable_verifiers = [json.dumps(["V1", "V2", "V3", "V4"])] * n
+
+         # Extract prompts if available in kwargs
+         prompts = kwargs.get("prompts", kwargs.get("prompt", [""] * n))
+         if isinstance(prompts, str):
+             prompts = [prompts] * n
+
+         for i in range(n):
+             # Extract completion text
+             completion_text = _extract_text(completions[i])
+             prompt_text = _extract_text(prompts[i]) if i < len(prompts) else ""
+
+             result = composer.compute_reward(
+                 prompt=prompt_text,
+                 completion=completion_text,
+                 ground_truth=ground_truth[i],
+                 question_type=question_type[i],
+                 applicable_verifiers=applicable_verifiers[i],
+             )
+             rewards.append(result.total_reward)
+
+         return rewards
+
+     return reward_func
+
+
+ def make_single_verifier_reward(verifier_name: str) -> Callable:
+     """Create a reward function using only one verifier (for ablation)."""
+     return make_grpo_reward_function(active_verifiers=[verifier_name])
+
+
+ def _extract_text(item) -> str:
+     """Extract plain text from various completion formats.
+
+     TRL may pass completions as:
+     - str: plain text
+     - list[dict]: chat messages [{"role": "assistant", "content": "..."}]
+     """
+     if isinstance(item, str):
+         return item
+     elif isinstance(item, list):
+         # Chat format
+         texts = []
+         for msg in item:
+             if isinstance(msg, dict) and "content" in msg:
+                 texts.append(msg["content"])
+         return " ".join(texts)
+     else:
+         return str(item)
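
The renormalization step in `compute_reward` is the part worth checking by hand: when some verifiers return `applicable=False`, the remaining weights are rescaled to sum to 1 so the reward stays in [0, 1]. A self-contained sketch of just that arithmetic (the function name `compose` is illustrative, not from the repo):

```python
# Standalone sketch of the composer's weighted-sum-with-renormalization step,
# for the case where only V1 and V4 produced applicable scores.
def compose(scores, weights):
    used = {k: weights[k] for k in scores}  # weights of applicable verifiers only
    w_total = sum(used.values())
    norm = {k: v / w_total for k, v in used.items()} if w_total else used
    return sum(scores[k] * norm[k] for k in scores)


DEFAULT_WEIGHTS = {"V1": 0.35, "V2": 0.30, "V3": 0.15, "V4": 0.20}

# V2/V3 inapplicable: V1's 0.35 and V4's 0.20 are rescaled over 0.55.
reward = compose({"V1": 1.0, "V4": 0.5}, DEFAULT_WEIGHTS)
print(round(reward, 4))  # → 0.8182
```

Without this rescaling, a question that only V1 and V4 can judge would be capped at 0.55 reward, which would systematically penalize those question types during GRPO.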
src/biorlhf/verifiers/consistency.py ADDED
@@ -0,0 +1,221 @@
+ """
+ V3: Cross-Context Consistency Verifier.
+
+ Scores whether the model appropriately distinguishes or generalizes
+ across biological contexts (tissues, species, doses, timepoints).
+
+ For comparison questions: checks tissue coverage + consistency assessment.
+ For context-dependent questions: checks nuance and hedging.
+ For BioAmbiguity tasks: checks context awareness using BioEval scoring logic.
+ """
+
+ import json
+ import re
+ from typing import Dict, List
+
+ from biorlhf.verifiers.base import BaseVerifier, VerifierResult
+
+
+ # ── Indicator patterns ─────────────────────────────────────────────────────
+ CONSISTENCY_TERMS = [
+     "consistent", "conserved", "similar across", "same direction",
+     "reproducible", "concordant", "shared", "common response",
+     "universal", "uniform",
+ ]
+
+ SPECIFICITY_TERMS = [
+     "tissue-specific", "differs", "different", "opposite", "varies",
+     "divergent", "heterogeneous", "discordant", "unique to",
+     "distinct", "context-dependent", "tissue-dependent",
+ ]
+
+ NUANCE_INDICATORS = [
+     "depends", "context", "varies", "mission-specific",
+     "not consistent", "differs", "some missions", "mixed",
+     "heterogeneous", "variable", "inconsistent",
+ ]
+
+ HEDGING_INDICATORS = [
+     "uncertain", "unclear", "difficult to generalize",
+     "not enough evidence", "conflicting", "limited data",
+     "preliminary", "tentative", "cannot be determined",
+     "more research", "caution",
+ ]
+
+
+ class CrossContextConsistencyVerifier(BaseVerifier):
+     """V3: Scores context-appropriate reasoning."""
+
+     name = "V3"
+
+     def score(
+         self,
+         prompt: str,
+         completion: str,
+         ground_truth: Dict,
+         question_type: str,
+     ) -> VerifierResult:
+         gt = ground_truth if isinstance(ground_truth, dict) else json.loads(ground_truth)
+
+         if question_type == "comparison":
+             return self._score_comparison(completion, gt)
+         elif question_type in ("context_dependent", "uncertainty"):
+             return self._score_context_dependent(completion, gt)
+         elif "contexts" in gt:
+             # BioEval BioAmbiguity format
+             return self._score_bioambiguity(completion, gt)
+         elif "tissue_directions" in gt:
+             return self._score_comparison(completion, gt)
+         else:
+             return VerifierResult(
+                 score=0.5,
+                 verifier_name=self.name,
+                 details={"reason": "not_applicable"},
+                 applicable=False,
+             )
+
+     def _score_comparison(self, completion: str, gt: Dict) -> VerifierResult:
+         """Score cross-tissue comparison questions."""
+         tissue_directions = gt.get("tissue_directions", {})
+         is_consistent = gt.get("is_consistent", False)
+         comp_lower = completion.lower()
+
+         # Check tissue coverage
+         tissues_mentioned = sum(
+             1 for tissue in tissue_directions if tissue.lower() in comp_lower
+         )
+         n_tissues = len(tissue_directions) if tissue_directions else 1
+         tissue_coverage = tissues_mentioned / n_tissues
+
+         # Check consistency/specificity assessment
+         claims_consistent = any(t in comp_lower for t in CONSISTENCY_TERMS)
+         claims_specific = any(t in comp_lower for t in SPECIFICITY_TERMS)
+
+         if is_consistent:
+             consistency_correct = claims_consistent
+         else:
+             consistency_correct = claims_specific
+
+         score = 0.5 * tissue_coverage + 0.5 * (1.0 if consistency_correct else 0.0)
+
+         return VerifierResult(
+             score=score,
+             verifier_name=self.name,
+             details={
+                 "tissues_mentioned": tissues_mentioned,
+                 "total_tissues": n_tissues,
+                 "is_consistent_gt": is_consistent,
+                 "claims_consistent": claims_consistent,
+                 "claims_specific": claims_specific,
+                 "consistency_correct": consistency_correct,
+             },
+         )
+
+     def _score_context_dependent(self, completion: str, gt: Dict) -> VerifierResult:
+         """Score questions where the answer should acknowledge context-dependence."""
+         comp_lower = completion.lower()
+
+         nuance_hits = sum(1 for t in NUANCE_INDICATORS if t in comp_lower)
+         hedging_hits = sum(1 for t in HEDGING_INDICATORS if t in comp_lower)
+
+         # Scale: having 2-3 indicators is ideal
+         nuance_score = min(nuance_hits / 2.0, 1.0)
+         hedging_score = min(hedging_hits / 2.0, 1.0)
+
+         score = 0.6 * nuance_score + 0.4 * hedging_score
+
+         return VerifierResult(
+             score=min(score, 1.0),
+             verifier_name=self.name,
+             details={
+                 "nuance_hits": nuance_hits,
+                 "hedging_hits": hedging_hits,
+                 "nuance_score": nuance_score,
+                 "hedging_score": hedging_score,
+             },
+         )
+
+     def _score_bioambiguity(self, completion: str, gt: Dict) -> VerifierResult:
+         """Score BioEval BioAmbiguity tasks.
+
+         GT format:
+             {"contexts": {context_name: {"key_terms": [...], "role": "..."}},
+              "distinction_key": "..."}
+         """
+         contexts = gt.get("contexts", {})
+         distinction_key = gt.get("distinction_key", "")
+         comp_lower = completion.lower()
+
+         if not contexts:
+             return VerifierResult(
+                 score=0.5, verifier_name=self.name,
+                 details={"reason": "no_contexts"}, applicable=False,
+             )
+
+         # Context awareness: fraction of key terms found across all contexts
+         total_terms = 0
+         found_terms = 0
+         context_scores = {}
+
+         for ctx_name, ctx_info in contexts.items():
+             key_terms = ctx_info.get("key_terms", [])
+             if not key_terms:
+                 continue
+             hits = sum(1 for t in key_terms if t.lower() in comp_lower)
+             total_terms += len(key_terms)
+             found_terms += hits
+             context_scores[ctx_name] = hits / len(key_terms)
+
+         context_awareness = found_terms / total_terms if total_terms > 0 else 0
+
+         # Distinction quality: does the response contain the distinction key words?
+         if distinction_key:
+             dist_terms = _extract_key_terms(distinction_key)
+             dist_hits = sum(1 for t in dist_terms if t.lower() in comp_lower)
+             distinction_quality = dist_hits / len(dist_terms) if dist_terms else 0
+         else:
+             distinction_quality = 0
+
+         # Evidence support: does the response mention the roles?
+         role_hits = 0
+         role_total = 0
+         for ctx_info in contexts.values():
+             role = ctx_info.get("role", "")
+             if role:
+                 role_total += 1
+                 role_terms = _extract_key_terms(role)
+                 if any(t.lower() in comp_lower for t in role_terms):
+                     role_hits += 1
+         evidence_support = role_hits / role_total if role_total > 0 else 0
+
+         # Composite: 40% context + 35% distinction + 25% evidence
+         score = (
+             0.40 * context_awareness
+             + 0.35 * distinction_quality
+             + 0.25 * evidence_support
+         )
+
+         return VerifierResult(
+             score=score,
+             verifier_name=self.name,
+             details={
+                 "context_awareness": context_awareness,
+                 "distinction_quality": distinction_quality,
+                 "evidence_support": evidence_support,
+                 "context_scores": context_scores,
+                 "terms_found": found_terms,
+                 "terms_total": total_terms,
+             },
+         )
+
+
+ def _extract_key_terms(text: str, min_length: int = 4) -> List[str]:
+     """Extract key terms from text for matching."""
+     stopwords = {
+         "the", "and", "for", "that", "this", "with", "from", "are",
+         "was", "were", "been", "have", "has", "had", "will", "would",
+         "could", "should", "may", "might", "can", "does", "between",
+     }
+     words = re.findall(r"\b[a-zA-Z0-9-]+\b", text)
+     return [w for w in words if len(w) >= min_length and w.lower() not in stopwords]
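
The context-dependent branch of V3 reduces to a small formula: indicator hit counts capped at 2, combined 60/40. A standalone sketch with shortened indicator lists (the full lists live in the module above; `context_score` is an illustrative name):

```python
# Standalone sketch of V3's context-dependent scoring rule:
# counts of nuance and hedging indicators, each capped at 2 hits,
# combined as 0.6 * nuance + 0.4 * hedging.
NUANCE = ["depends", "context", "varies", "mixed"]
HEDGING = ["uncertain", "unclear", "limited data", "conflicting"]


def context_score(completion: str) -> float:
    c = completion.lower()
    nuance = min(sum(t in c for t in NUANCE) / 2.0, 1.0)
    hedging = min(sum(t in c for t in HEDGING) / 2.0, 1.0)
    return min(0.6 * nuance + 0.4 * hedging, 1.0)


text = "The response varies by tissue and depends on context; evidence is unclear."
print(round(context_score(text), 2))  # → 0.8
```

Note this is purely lexical: an answer that hedges without naming the right contexts still scores, which is why V3 carries only 15% of the default composite weight.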
src/biorlhf/verifiers/factual.py ADDED
@@ -0,0 +1,143 @@
+ """
+ V2: Biological Fact Verifier.
+
+ Scores model responses based on overlap with known correct facts
+ from curated knowledge bases (SpaceOmicsBench, BioEval, GeneTuring).
+
+ Scoring: proportion of ground-truth key facts found in the response.
+ """
+
+ import re
+ import json
+ from typing import Dict, List
+
+ from biorlhf.verifiers.base import BaseVerifier, VerifierResult
+
+
+ def _extract_key_terms(text: str, min_length: int = 4, max_terms: int = 10) -> List[str]:
+     """Extract important terms from a text string."""
+     # Remove common stopwords and short words
+     stopwords = {
+         "the", "and", "for", "that", "this", "with", "from", "are", "was",
+         "were", "been", "have", "has", "had", "will", "would", "could",
+         "should", "may", "might", "can", "does", "did", "but", "not",
+         "its", "also", "into", "than", "then", "when", "which", "what",
+         "where", "who", "how", "all", "each", "every", "both", "more",
+         "most", "other", "some", "such", "only", "same", "very", "just",
+     }
+     words = re.findall(r"\b[a-zA-Z0-9-]+\b", text)
+     terms = [
+         w for w in words
+         if len(w) >= min_length and w.lower() not in stopwords
+     ]
+     return terms[:max_terms]
+
+
+ def _phrase_match(phrase: str, text: str) -> bool:
+     """Check if a phrase (or its key terms) appears in text."""
+     text_lower = text.lower()
+     phrase_lower = phrase.lower()
+
+     # Direct substring match
+     if phrase_lower in text_lower:
+         return True
+
+     # For multi-word phrases, check if key terms co-occur
+     terms = _extract_key_terms(phrase, min_length=4, max_terms=5)
+     if not terms:
+         return phrase_lower in text_lower
+
+     matches = sum(1 for t in terms if t.lower() in text_lower)
+     # Require at least half of the key terms to match
+     return matches >= max(1, len(terms) // 2)
+
+
+ class BiologicalFactVerifier(BaseVerifier):
+     """V2: Verifies biological factual claims against curated knowledge."""
+
+     name = "V2"
+
+     def score(
+         self,
+         prompt: str,
+         completion: str,
+         ground_truth: Dict,
+         question_type: str,
+     ) -> VerifierResult:
+         """Score based on overlap with ground-truth key facts.
+
+         Handles multiple GT formats:
+         - {"key_facts": ["fact1", "fact2", ...]}
+         - {"ground_truth_key_facts": [...]}
+         - {"expected_answer": "text"}
+         - {"expected_reasoning": [...]}
+         - {"correct_steps": [...]}
+         """
+         gt = ground_truth if isinstance(ground_truth, dict) else json.loads(ground_truth)
+
+         # Extract key facts from various GT formats
+         key_facts = self._extract_facts(gt)
+
+         if not key_facts:
+             return VerifierResult(
+                 score=0.5,
+                 verifier_name=self.name,
+                 details={"reason": "no_key_facts_in_gt"},
+                 applicable=False,
+             )
+
+         # Score: proportion of key facts found in completion
+         matched_facts: List[str] = []
+         for fact in key_facts:
+             if isinstance(fact, str) and _phrase_match(fact, completion):
+                 matched_facts.append(fact)
+
+         total = len(key_facts)
+         matched = len(matched_facts)
+         score = matched / total if total > 0 else 0.0
+
+         return VerifierResult(
+             score=score,
+             verifier_name=self.name,
+             details={
+                 "matched_facts": matched_facts,
+                 "total_facts": total,
+                 "matched_count": matched,
+                 "unmatched": [f for f in key_facts if f not in matched_facts],
+             },
+         )
+
+     def _extract_facts(self, gt: Dict) -> List[str]:
+         """Extract verifiable facts from the ground truth dictionary."""
+         facts: List[str] = []
+
+         # Direct key facts lists
+         for key in ("key_facts", "ground_truth_key_facts"):
+             if key in gt and isinstance(gt[key], list):
+                 facts.extend(str(f) for f in gt[key] if f)
+
+         # Expected reasoning points
+         if "expected_reasoning" in gt and isinstance(gt["expected_reasoning"], list):
+             facts.extend(str(f) for f in gt["expected_reasoning"] if f)
+
+         # Single expected answer
+         if "expected_answer" in gt and isinstance(gt["expected_answer"], str):
+             facts.append(gt["expected_answer"])
+
+         # Protocol steps (BioEval protoreason)
+         if "correct_steps" in gt and isinstance(gt["correct_steps"], list):
+             facts.extend(str(s) for s in gt["correct_steps"] if s)
+
+         # NES conservation facts
+         if "conservation_level" in gt:
+             facts.append(gt["conservation_level"])
+
+         # Deduplicate while preserving order
+         seen = set()
+         unique_facts = []
+         for f in facts:
+             if f not in seen:
+                 seen.add(f)
+                 unique_facts.append(f)
+
+         return unique_facts
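
The fuzzy matching rule in `_phrase_match` is what makes V2 tolerant of paraphrase: a fact counts if it appears verbatim, or if at least half of its key terms occur anywhere in the response. A self-contained sketch with a trimmed stopword set (the module's set is larger):

```python
# Standalone sketch of V2's fuzzy fact-matching rule: verbatim substring,
# or at least half of the fact's key terms (length >= 4, non-stopword,
# first 5 terms) present in the response.
import re

STOP = {"the", "and", "that", "with", "from"}


def phrase_match(phrase: str, text: str) -> bool:
    t, p = text.lower(), phrase.lower()
    if p in t:
        return True
    terms = [w for w in re.findall(r"\b[a-zA-Z0-9-]+\b", p)
             if len(w) >= 4 and w not in STOP][:5]
    if not terms:
        return False
    return sum(w in t for w in terms) >= max(1, len(terms) // 2)


resp = "Microgravity suppresses oxidative phosphorylation in muscle."
print(phrase_match("oxidative phosphorylation downregulated", resp))  # → True
```

Here 2 of the 3 key terms ("oxidative", "phosphorylation") occur, which clears the half-of-terms threshold even though "downregulated" is paraphrased as "suppresses".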
src/biorlhf/verifiers/pathway.py ADDED
@@ -0,0 +1,300 @@
+ """
+ V1: Pathway Direction Verifier.
+
+ Extracts directional claims about biological pathways from model responses
+ and compares them against fGSEA NES direction ground truth.
+
+ Scoring:
+     1.0 - correct direction claimed
+     0.5 - mixed/contradictory claims
+     0.3 - no directional claim extracted
+     0.0 - wrong direction claimed
+ """
+
+ import re
+ from typing import Dict, List, Tuple
+
+ from biorlhf.verifiers.base import BaseVerifier, VerifierResult
+
+ # ── Direction indicator patterns ──────────────────────────────────────────
+ # Note: all patterns are matched against lowercased sentences, so the
+ # NES patterns below are written in lowercase.
+ UP_INDICATORS = [
+     r"\bupregulat\w*\b",
+     r"\bactivat\w*\b",
+     r"\bincreas\w*\b",
+     r"\belevat\w*\b",
+     r"\benhanc\w*\b",
+     r"\binduced?\b",
+     r"\bhigher\b",
+     r"\boverexpress\w*\b",
+     r"\benrich\w*\b",
+     r"\bpositive\s+nes\b",
+     r"\bnes\s*[>=]\s*0\b",
+     r"\bupstream\s+activat\w*\b",
+     r"\bstimulat\w*\b",
+     r"\bpromot\w*\b",
+ ]
+
+ DOWN_INDICATORS = [
+     r"\bdownregulat\w*\b",
+     r"\bsuppress\w*\b",
+     r"\bdecreas\w*\b",
+     r"\breduced?\b",
+     r"\binhibit\w*\b",
+     r"\brepress\w*\b",
+     r"\blower\w*\b",
+     r"\bunderexpress\w*\b",
+     r"\bdepress\w*\b",
+     r"\bnegative\s+nes\b",
+     r"\bnes\s*<\s*0\b",
+     r"\bdiminish\w*\b",
+     r"\battenuat\w*\b",
+     r"\bimpair\w*\b",
+ ]
+
+ # Negation patterns that flip direction
+ NEGATION_PATTERNS = [
+     r"\bnot\s+",
+     r"\bno\s+",
+     r"\bneither\b",
+     r"\bwithout\s+",
+     r"\bfail\w*\s+to\b",
+     r"\bdoes\s+not\b",
+     r"\bdid\s+not\b",
+     r"\bisn'?t\b",
+     r"\bwasn'?t\b",
+     r"\baren'?t\b",
+ ]
+
+ # ── Pathway name abbreviations ────────────────────────────────────────────
+ PATHWAY_ABBREVIATIONS: Dict[str, List[str]] = {
+     "oxidative phosphorylation": ["oxphos", "oxidative phosphorylation", "ox phos"],
+     "tnfa signaling via nfkb": ["tnf-alpha", "nfkb", "nf-kb", "nf-κb", "tnfα"],
+     "mtorc1 signaling": ["mtor", "mtorc1"],
+     "pi3k akt mtor signaling": ["pi3k", "akt", "mtor", "pi3k/akt"],
+     "interferon gamma response": ["ifn-gamma", "ifn-γ", "interferon gamma", "ifnγ"],
+     "interferon alpha response": ["ifn-alpha", "ifn-α", "interferon alpha", "ifnα"],
+     "adipogenesis": ["adipogenesis", "adipogenic"],
+     "myogenesis": ["myogenesis", "myogenic"],
+     "epithelial mesenchymal transition": ["emt", "epithelial-mesenchymal"],
+     "unfolded protein response": ["upr", "unfolded protein"],
+     "reactive oxygen species pathway": ["ros", "reactive oxygen"],
+     "fatty acid metabolism": ["fatty acid", "fat metabolism", "lipid metabolism"],
+     "glycolysis": ["glycolysis", "glycolytic"],
+     "dna repair": ["dna repair", "dna damage response"],
+     "apoptosis": ["apoptosis", "apoptotic", "programmed cell death"],
+     "inflammatory response": ["inflammatory", "inflammation"],
+     "hypoxia": ["hypoxia", "hypoxic"],
+     "angiogenesis": ["angiogenesis", "angiogenic"],
+     "p53 pathway": ["p53", "tp53"],
+     "wnt beta catenin signaling": ["wnt", "beta-catenin", "β-catenin"],
+ }
+
+
+ def _generate_pathway_variants(pathway_name: str) -> List[str]:
+     """Generate matching variants for a pathway name.
+
+     E.g. HALLMARK_OXIDATIVE_PHOSPHORYLATION →
+         ["HALLMARK_OXIDATIVE_PHOSPHORYLATION",
+          "oxidative phosphorylation",
+          "oxidative phosphorylation pathway",
+          "oxphos"]
+     """
+     variants = [pathway_name]
+
+     clean = pathway_name
+     for prefix in ("HALLMARK_", "KEGG_", "REACTOME_", "MITOCARTA_"):
+         clean = clean.replace(prefix, "")
+     human = clean.replace("_", " ").lower()
+
+     variants.append(human)
+     variants.append(human + " pathway")
+
+     # Add known abbreviations
+     for key, abbrevs in PATHWAY_ABBREVIATIONS.items():
+         if key in human:
+             variants.extend(abbrevs)
+
+     return variants
+
+
+ def _extract_sentences_with_term(text: str, term: str) -> List[str]:
+     """Extract sentences containing a term."""
+     sentences = re.split(r"[.!?\n]+", text)
+     return [
+         s.strip()
+         for s in sentences
+         if term.lower() in s.lower() and len(s.strip()) > 10
+     ]
+
+
+ def _has_negation_before(text: str, match_start: int, window: int = 12) -> bool:
+     """Check if a negation word appears shortly before a match position.
+
+     A window of ~12 chars catches "not " plus a short adverb, without
+     reaching across clause boundaries like "not X but rather Y".
+     """
+     start = max(0, match_start - window)
+     preceding = text[start:match_start].lower()
+     return any(re.search(p, preceding) for p in NEGATION_PATTERNS)
+
+
+ def extract_direction_claims(
+     text: str,
+     pathway_name: str,
+ ) -> List[Tuple[str, str]]:
+     """Extract directional claims about a specific pathway from text.
+
+     Returns a list of (pathway_variant, direction) tuples, where
+     direction is "UP", "DOWN", or "AMBIGUOUS".
+     """
+     text_lower = text.lower()
+     pathway_variants = _generate_pathway_variants(pathway_name)
+
+     claims: List[Tuple[str, str]] = []
+     for variant in pathway_variants:
+         if variant.lower() not in text_lower:
+             continue
+
+         sentences = _extract_sentences_with_term(text, variant)
+         for sentence in sentences:
+             sent_lower = sentence.lower()
+             up_count = 0
+             down_count = 0
+
+             for pattern in UP_INDICATORS:
+                 for match in re.finditer(pattern, sent_lower):
+                     if _has_negation_before(sent_lower, match.start()):
+                         down_count += 1  # Negated up = down
+                     else:
+                         up_count += 1
+
+             for pattern in DOWN_INDICATORS:
+                 for match in re.finditer(pattern, sent_lower):
+                     if _has_negation_before(sent_lower, match.start()):
+                         up_count += 1  # Negated down = up
+                     else:
+                         down_count += 1
+
+             if up_count > down_count:
+                 claims.append((variant, "UP"))
+             elif down_count > up_count:
+                 claims.append((variant, "DOWN"))
+             elif up_count > 0:
+                 claims.append((variant, "AMBIGUOUS"))
+
+     return claims
+
+
+ class PathwayDirectionVerifier(BaseVerifier):
+     """V1: Verifies pathway direction claims against fGSEA NES data."""
+
+     name = "V1"
+
+     def score(
+         self,
+         prompt: str,
+         completion: str,
+         ground_truth: Dict,
+         question_type: str,
+     ) -> VerifierResult:
+         if "pathway" not in ground_truth or "direction" not in ground_truth:
+             return VerifierResult(
+                 score=0.5,
+                 verifier_name=self.name,
+                 details={"reason": "no_pathway_in_gt"},
+                 applicable=False,
+             )
+
+         expected_dir = ground_truth["direction"]
+         pathway = ground_truth["pathway"]
+
+         # For comparison questions, check all tissue directions
+         if "tissue_directions" in ground_truth and question_type == "comparison":
+             return self._score_comparison(completion, ground_truth)
+
+         claims = extract_direction_claims(completion, pathway)
+
+         if not claims:
+             return VerifierResult(
+                 score=0.3,
+                 verifier_name=self.name,
+                 details={
+                     "reason": "no_claim_extracted",
+                     "pathway": pathway,
+                     "expected": expected_dir,
+                 },
+             )
+
+         matching = [c for c in claims if c[1] == expected_dir]
+         contradicting = [
+             c for c in claims if c[1] != expected_dir and c[1] != "AMBIGUOUS"
+         ]
+
+         if matching and not contradicting:
+             score = 1.0
+         elif matching and contradicting:
+             score = 0.5
+         elif contradicting:
+             score = 0.0
+         else:
+             score = 0.3  # Only ambiguous claims
+
+         return VerifierResult(
+             score=score,
+             verifier_name=self.name,
+             details={
+                 "pathway": pathway,
+                 "expected": expected_dir,
+                 "claims": [(v, d) for v, d in claims],
+                 "n_matching": len(matching),
+                 "n_contradicting": len(contradicting),
+             },
+         )
+
+     def _score_comparison(
+         self, completion: str, ground_truth: Dict
+     ) -> VerifierResult:
+         """Score cross-tissue comparison: check the claimed direction per tissue."""
+         tissue_dirs = ground_truth.get("tissue_directions", {})
+         pathway = ground_truth.get("pathway", "")
+
+         if not tissue_dirs:
+             return VerifierResult(
+                 score=0.5, verifier_name=self.name,
+                 details={"reason": "no_tissue_directions"}, applicable=False,
+             )
+
+         correct = 0
+         checked = 0
+         details_per_tissue = {}
+
+         for tissue, expected_dir in tissue_dirs.items():
+             # Look for tissue-specific claims in the response
+             tissue_sentences = _extract_sentences_with_term(completion, tissue)
+             if not tissue_sentences:
+                 continue
+
+             tissue_text = " ".join(tissue_sentences)
+             claims = extract_direction_claims(tissue_text, pathway)
+             checked += 1
+
+             if any(c[1] == expected_dir for c in claims):
+                 correct += 1
+                 details_per_tissue[tissue] = "correct"
+             elif claims:
+                 details_per_tissue[tissue] = "wrong"
+             else:
+                 details_per_tissue[tissue] = "no_claim"
+
+         score = correct / checked if checked > 0 else 0.3
+
+         return VerifierResult(
+             score=score,
+             verifier_name=self.name,
+             details={
+                 "pathway": pathway,
+                 "tissues_checked": checked,
+                 "tissues_correct": correct,
+                 "per_tissue": details_per_tissue,
+             },
+         )
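
The negation-window logic in `extract_direction_claims` is the subtlest part of V1, so here is a self-contained sketch of just that mechanism, with shortened indicator lists (the module's lists are longer; `direction` is an illustrative helper name):

```python
# Standalone sketch of V1's negation-aware direction counting: each
# up/down indicator match in a lowercased sentence is flipped if a
# negation pattern occurs within ~12 characters before it.
import re

UP = [r"\bincreas\w*\b", r"\bupregulat\w*\b", r"\bactivat\w*\b"]
DOWN = [r"\bdecreas\w*\b", r"\bdownregulat\w*\b", r"\bsuppress\w*\b"]
NEG = [r"\bnot\s+", r"\bno\s+", r"\bdoes\s+not\b"]


def negated(text: str, start: int, window: int = 12) -> bool:
    before = text[max(0, start - window):start]
    return any(re.search(p, before) for p in NEG)


def direction(sentence: str) -> str:
    s = sentence.lower()
    up = down = 0
    for pat in UP:
        for m in re.finditer(pat, s):
            if negated(s, m.start()):
                down += 1  # negated up reads as down
            else:
                up += 1
    for pat in DOWN:
        for m in re.finditer(pat, s):
            if negated(s, m.start()):
                up += 1  # negated down reads as up
            else:
                down += 1
    if up > down:
        return "UP"
    if down > up:
        return "DOWN"
    return "AMBIGUOUS"


print(direction("OXPHOS was not increased; it was suppressed."))  # → DOWN
```

The "not increased" clause is counted as a DOWN vote rather than an UP vote, which is exactly the case a plain keyword counter would get wrong.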
src/biorlhf/verifiers/uncertainty.py ADDED
@@ -0,0 +1,270 @@
1
+ """
+ V4: Uncertainty Appropriateness Verifier.
+
+ Scores whether a model's stated confidence aligns with the ground-truth
+ expected confidence level. Integrates with BioEval's calibration scoring
+ when available, with a built-in fallback.
+
+ Scoring dimensions:
+ - Confidence level alignment (stated vs. expected)
+ - Calibration task behavior (acknowledge_unknown, overconfidence_trap, etc.)
+ - Default: penalizes extreme overconfidence
+ """
+
+ import json
+ import re
+ from dataclasses import dataclass
+ from typing import Dict
+
+ from biorlhf.verifiers.base import BaseVerifier, VerifierResult
+
+ # ── Try importing BioEval calibration infrastructure ──────────────────────
+ try:
+     import os
+     import sys
+
+     _bioeval_root = os.environ.get(
+         "BIOEVAL_ROOT",
+         "/Users/jak4013/Dropbox/Bioinformatics/Claude/Evaluation_model/BioEval",
+     )
+     sys.path.insert(0, _bioeval_root)
+     from bioeval.scoring.calibration import extract_confidence
+
+     HAS_BIOEVAL = True
+ except ImportError:
+     HAS_BIOEVAL = False
+
+ # ── Built-in confidence extraction (fallback) ─────────────────────────────
+ HIGH_CONFIDENCE_PATTERNS = [
+     r"\bhigh\s*confidence\b", r"\bvery\s+confident\b", r"\bconfident\s+that\b",
+     r"\bcertainly\b", r"\bclearly\b", r"\bdefinitely\b", r"\bwithout\s+doubt\b",
+     r"\bconfidence:\s*high\b", r"\bstrongly\s+suggest\b",
+ ]
+
+ MEDIUM_CONFIDENCE_PATTERNS = [
+     r"\bmoderate\s*confidence\b", r"\breasonably\s+confident\b",
+     r"\blikely\b", r"\bprobably\b", r"\bsuggest\w*\b",
+     r"\bconfidence:\s*medium\b", r"\bconfidence:\s*moderate\b",
+ ]
+
+ LOW_CONFIDENCE_PATTERNS = [
+     r"\blow\s*confidence\b", r"\bnot\s+confident\b", r"\buncertain\b",
+     r"\bunclear\b", r"\bnot\s+sure\b", r"\bdon'?t\s+know\b",
+     r"\bcannot\s+determine\b", r"\binsufficient\s+\w*\s*(?:data|evidence)\b",
+     r"\blimited\s+evidence\b", r"\bspeculat\w*\b",
+     r"\bconfidence:\s*low\b",
+ ]
+
+ # Explicit numeric confidence, e.g. "confidence: 85%"
+ NUMERIC_CONFIDENCE_RE = re.compile(
+     r"(?:confidence|certainty|probability)[:\s]*(\d{1,3})%",
+     re.IGNORECASE,
+ )
+
+ # Expected numeric ranges per stated confidence level
+ CONFIDENCE_RANGES = {
+     "high": (0.70, 1.00),
+     "medium": (0.35, 0.75),
+     "low": (0.00, 0.40),
+ }
+
+ # Expected confidence for calibration task behaviors
+ BEHAVIOR_EXPECTED_CONFIDENCE = {
+     "acknowledge_unknown": 0.15,
+     "high_confidence_correct": 0.90,
+     "partial_knowledge": 0.50,
+     "context_dependent": 0.50,
+     "moderate_confidence": 0.50,
+     "overconfidence_trap": 0.30,
+ }
+
+
+ @dataclass
+ class SimpleConfidence:
+     """Fallback confidence extraction result."""
+
+     stated: str     # "high", "medium", "low"
+     numeric: float  # 0.0 to 1.0
+     source: str     # "explicit", "pattern", "language"
+
+
+ def _extract_confidence_simple(text: str) -> SimpleConfidence:
+     """Simple confidence extraction without BioEval."""
+     text_lower = text.lower()
+
+     # Check for explicit numeric confidence first
+     num_match = NUMERIC_CONFIDENCE_RE.search(text)
+     if num_match:
+         pct = int(num_match.group(1))
+         numeric = pct / 100.0
+         if numeric >= 0.70:
+             stated = "high"
+         elif numeric >= 0.40:
+             stated = "medium"
+         else:
+             stated = "low"
+         return SimpleConfidence(stated=stated, numeric=numeric, source="explicit")
+
+     # Otherwise count pattern matches per level
+     high_count = sum(1 for p in HIGH_CONFIDENCE_PATTERNS if re.search(p, text_lower))
+     med_count = sum(1 for p in MEDIUM_CONFIDENCE_PATTERNS if re.search(p, text_lower))
+     low_count = sum(1 for p in LOW_CONFIDENCE_PATTERNS if re.search(p, text_lower))
+
+     if low_count > high_count and low_count > med_count:
+         return SimpleConfidence(stated="low", numeric=0.25, source="pattern")
+     elif high_count > low_count and high_count > med_count:
+         return SimpleConfidence(stated="high", numeric=0.85, source="pattern")
+     elif med_count > 0:
+         return SimpleConfidence(stated="medium", numeric=0.55, source="pattern")
+     else:
+         # Default: assume moderate confidence
+         return SimpleConfidence(stated="medium", numeric=0.50, source="language")
+
+
+ class UncertaintyVerifier(BaseVerifier):
+     """V4: Verifies that the model's confidence is appropriate for the question."""
+
+     name = "V4"
+
+     def score(
+         self,
+         prompt: str,
+         completion: str,
+         ground_truth: Dict,
+         question_type: str,
+     ) -> VerifierResult:
+         gt = ground_truth if isinstance(ground_truth, dict) else json.loads(ground_truth)
+
+         expected_confidence = gt.get("expected_confidence")
+         correct_behavior = gt.get("correct_behavior")
+
+         # Extract confidence from the completion
+         if HAS_BIOEVAL:
+             conf_extraction = extract_confidence(completion)
+             conf_score = conf_extraction.confidence_score
+             stated = conf_extraction.stated_confidence or "medium"
+         else:
+             simple = _extract_confidence_simple(completion)
+             conf_score = simple.numeric
+             stated = simple.stated
+
+         # Route to the appropriate scoring path
+         if correct_behavior:
+             return self._score_calibration_behavior(
+                 completion, gt, conf_score, stated, correct_behavior,
+             )
+         elif expected_confidence:
+             return self._score_confidence_alignment(
+                 conf_score, stated, expected_confidence,
+             )
+         else:
+             return self._score_default(conf_score, stated)
+
+     def _score_calibration_behavior(
+         self,
+         completion: str,
+         gt: Dict,
+         conf_score: float,
+         stated: str,
+         correct_behavior: str,
+     ) -> VerifierResult:
+         """Score calibration tasks with specific expected behaviors."""
+         expected_conf = BEHAVIOR_EXPECTED_CONFIDENCE.get(correct_behavior, 0.5)
+         conf_error = abs(conf_score - expected_conf)
+
+         # Behavior-specific checks
+         behavior_correct = False
+
+         if correct_behavior == "acknowledge_unknown":
+             # Model should say it doesn't know
+             unknown_markers = [
+                 "don't know", "cannot determine", "not enough",
+                 "unknown", "hypothetical", "no information",
+                 "unable to", "cannot assess", "fictional",
+             ]
+             behavior_correct = any(m in completion.lower() for m in unknown_markers)
+
+         elif correct_behavior == "high_confidence_correct":
+             behavior_correct = conf_score >= 0.65
+
+         elif correct_behavior == "overconfidence_trap":
+             # Should NOT be overconfident
+             behavior_correct = conf_score < 0.70
+
+         elif correct_behavior in ("partial_knowledge", "context_dependent", "moderate_confidence"):
+             behavior_correct = 0.30 <= conf_score <= 0.75
+
+         # Combined score: behavior compliance + confidence alignment
+         behavior_score = 1.0 if behavior_correct else 0.0
+         alignment_score = max(0.0, 1.0 - 2.0 * conf_error)
+         score = 0.6 * behavior_score + 0.4 * alignment_score
+
+         return VerifierResult(
+             score=score,
+             verifier_name=self.name,
+             details={
+                 "correct_behavior": correct_behavior,
+                 "expected_confidence": expected_conf,
+                 "actual_confidence": conf_score,
+                 "stated_confidence": stated,
+                 "confidence_error": conf_error,
+                 "behavior_correct": behavior_correct,
+                 "using_bioeval": HAS_BIOEVAL,
+             },
+         )
+
+     def _score_confidence_alignment(
+         self,
+         conf_score: float,
+         stated: str,
+         expected: str,
+     ) -> VerifierResult:
+         """Score how well stated confidence aligns with the expected level."""
+         if expected not in CONFIDENCE_RANGES:
+             return VerifierResult(
+                 score=0.5, verifier_name=self.name,
+                 details={"reason": "unknown_expected_level"},
+             )
+
+         low, high = CONFIDENCE_RANGES[expected]
+         in_range = low <= conf_score <= high
+
+         if in_range:
+             score = 1.0
+         else:
+             distance = min(abs(conf_score - low), abs(conf_score - high))
+             score = max(0.0, 1.0 - 2.5 * distance)
+
+         return VerifierResult(
+             score=score,
+             verifier_name=self.name,
+             details={
+                 "expected_level": expected,
+                 "expected_range": (low, high),
+                 "actual_confidence": conf_score,
+                 "stated_confidence": stated,
+                 "in_range": in_range,
+                 "using_bioeval": HAS_BIOEVAL,
+             },
+         )
+
+     def _score_default(
+         self,
+         conf_score: float,
+         stated: str,
+     ) -> VerifierResult:
+         """Default scoring: penalize extreme over- and underconfidence."""
+         if conf_score > 0.90:
+             score = 0.4  # Overconfidence penalty
+         elif conf_score < 0.10:
+             score = 0.3  # Extreme underconfidence penalty
+         else:
+             score = 0.7  # Moderate confidence is a good default
+
+         return VerifierResult(
+             score=score,
+             verifier_name=self.name,
+             details={
+                 "actual_confidence": conf_score,
+                 "stated_confidence": stated,
+                 "mode": "default",
+                 "using_bioeval": HAS_BIOEVAL,
+             },
+         )
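The fallback extraction path in `_extract_confidence_simple` can be exercised in isolation. A self-contained sketch with condensed keyword lists (the module uses the fuller regex pattern sets; the function name and texts here are illustrative only):

```python
import re

# Same shape as NUMERIC_CONFIDENCE_RE: explicit "confidence: NN%" statements
NUMERIC_RE = re.compile(
    r"(?:confidence|certainty|probability)[:\s]*(\d{1,3})%", re.IGNORECASE
)

def estimate_confidence(text):
    """Return (stated_level, numeric): explicit percentage first, then keyword cues."""
    m = NUMERIC_RE.search(text)
    if m:
        numeric = min(int(m.group(1)), 100) / 100.0
        level = "high" if numeric >= 0.70 else "medium" if numeric >= 0.40 else "low"
        return level, numeric
    t = text.lower()
    low_hits = sum(k in t for k in ("uncertain", "not sure", "unclear", "speculat"))
    high_hits = sum(k in t for k in ("definitely", "certainly", "high confidence"))
    if low_hits > high_hits:
        return "low", 0.25
    if high_hits > low_hits:
        return "high", 0.85
    return "medium", 0.50  # default: assume moderate confidence

print(estimate_confidence("Confidence: 85%. CDKN1A induction is well replicated."))
# → ('high', 0.85)
print(estimate_confidence("The tissue-level effect remains uncertain and unclear."))
# → ('low', 0.25)
```

The explicit-percentage branch taking priority over keyword counting mirrors the verifier's design: a stated number is a stronger calibration signal than hedging language, so it short-circuits the pattern tally.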