BioRLHF / scripts /deploy_to_cayuga.sh
jang1563's picture
Add BioGRPO training pipeline with composable biological verifiers
bff2f94
#!/bin/bash
# ============================================================
# Deploy BioRLHF code + data to Cayuga HPC
# Run from local Mac
#
# Optional environment overrides (each default is the value that
# was previously hard-coded):
#   BIORLHF_REMOTE      ssh destination used for every ssh/rsync call
#   BIORLHF_SCRATCH     remote root directory receiving code + data
#   BIORLHF_LOCAL_BASE  local directory containing the source projects
# ============================================================
# Strict mode: stop on a failing command (-e), on any use of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

REMOTE="${BIORLHF_REMOTE:-cayuga-login1}"
SCRATCH="${BIORLHF_SCRATCH:-/athena/cayuga_0003/scratch/users/jak4013/otsuka}"
LOCAL_BASE="${BIORLHF_LOCAL_BASE:-$HOME/Dropbox/Bioinformatics/Claude}"
# These three are configuration constants; guard against accidental reassignment.
readonly REMOTE SCRATCH LOCAL_BASE

echo "============================================================"
echo "BioRLHF Cayuga Deployment"
echo "============================================================"
# Step 1: Create directories on Cayuga
# rsync creates only the final component of a destination path, so every
# parent directory that the later rsync steps write into has to exist first.
# This includes GeneLab_benchmark/processed, the parent of the fgsea target;
# without it the first-ever deployment aborts in the data-transfer step.
echo ""
echo "[1/4] Creating directories on Cayuga..."
# Backslash-newline inside the double-quoted string is removed locally, so the
# remote shell receives one mkdir command line. Paths are single-quoted for
# the remote shell.
ssh "${REMOTE}" "mkdir -p \
  '${SCRATCH}/training/BioRLHF' \
  '${SCRATCH}/data/GeneLab_benchmark' \
  '${SCRATCH}/data/GeneLab_benchmark/processed' \
  '${SCRATCH}/data/BioEval' \
  '${SCRATCH}/data/SpaceOmicsBench/v3/evaluation'"
# Step 2: Transfer BioRLHF code (only essential files)
echo ""
echo "[2/4] Transferring BioRLHF code..."
LOCAL_BIORLHF="${LOCAL_BASE}/BioRLHF/biorlhf"
DEST="${REMOTE}:${SCRATCH}/training/BioRLHF"

# Transfer only the package structure needed for GRPO.
# The trailing slash on the source makes rsync copy the directory's contents
# into the same-named directory on the remote side. All expansions are quoted
# so a path containing whitespace or glob characters stays a single argument.
for subdir in src configs scripts tests; do
  rsync -avz --progress \
    "${LOCAL_BIORLHF}/${subdir}/" \
    "${DEST}/${subdir}/"
done

# Top-level project files go straight into the remote project root.
rsync -avz --progress \
  "${LOCAL_BIORLHF}/pyproject.toml" \
  "${LOCAL_BIORLHF}/README.md" \
  "${DEST}/"
# Step 3: Transfer data (only what GRPO training needs)
echo ""
echo "[3/4] Transferring data..."

#######################################
# Announce and rsync one data directory to the remote data area.
# Globals:   LOCAL_BASE, REMOTE, SCRATCH (read)
# Arguments: $1 - label printed before the transfer
#            $2 - source directory, relative to LOCAL_BASE
#            $3 - destination directory, relative to ${SCRATCH}/data
# Outputs:   progress label plus rsync output on stdout
# Returns:   rsync's exit status
#######################################
sync_data_dir() {
  local label=$1 src_rel=$2 dest_rel=$3
  echo " ${label}..."
  # Quoted so each path stays a single argument even with unusual characters.
  rsync -avz --progress \
    "${LOCAL_BASE}/${src_rel}/" \
    "${REMOTE}:${SCRATCH}/data/${dest_rel}/"
}

sync_data_dir "GeneLab fgsea (pathway enrichment scores - required)" \
  "GeneLab_benchmark/processed/fgsea" "GeneLab_benchmark/processed/fgsea"
sync_data_dir "GeneLab evaluation (NES conservation - for conservation questions)" \
  "GeneLab_benchmark/evaluation" "GeneLab_benchmark/evaluation"
sync_data_dir "BioEval data" \
  "Evaluation_model/BioEval/data" "BioEval/data"
sync_data_dir "BioEval scoring (for calibration imports)" \
  "Evaluation_model/BioEval/bioeval" "BioEval/bioeval"
sync_data_dir "SpaceOmicsBench" \
  "SpaceOmicsBench/v3/evaluation/llm" "SpaceOmicsBench/v3/evaluation/llm"
# Step 4: Verify
# All checks run on Cayuga in a single ssh session. The quoted here-document
# is sent verbatim as the remote script (nothing in it is expanded locally);
# the scratch root reaches the remote side through the SCRATCH variable set
# on the remote command line.
# The report is informational: a MISSING entry does not fail the deployment.
echo ""
echo "[4/4] Verifying deployment..."
ssh "${REMOTE}" "SCRATCH='${SCRATCH}' bash -s" <<'REMOTE_CHECKS'
# report_path LABEL PATH
#   Print PATH as `ls -d` shows it and report OK, or MISSING if it is absent.
report_path() {
  if ls -d "$2" 2>/dev/null; then
    echo " $1: OK"
  else
    echo " $1: MISSING"
  fi
}

# report_data LABEL DIR
#   Show up to three entries of DIR and report OK only when DIR exists and is
#   non-empty. The listing is captured first: the exit status of `ls | head`
#   is head's, which succeeds even when ls fails, so testing the pipeline
#   directly would report OK for a directory that does not exist.
report_data() {
  preview=$(ls "$2" 2>/dev/null | head -3)
  if [ -n "$preview" ]; then
    printf '%s\n' "$preview"
    echo " $1: OK"
  else
    echo " $1: MISSING"
  fi
}

echo 'Directory structure:'
echo ' BioRLHF code:'
report_path 'pyproject.toml' "${SCRATCH}/training/BioRLHF/pyproject.toml"
report_path 'configs/grpo_mve.json' "${SCRATCH}/training/BioRLHF/configs/grpo_mve.json"
report_path 'src/biorlhf/' "${SCRATCH}/training/BioRLHF/src/biorlhf/"
echo ' SFT checkpoint:'
report_path 'kmp_sft_model_final' "${SCRATCH}/training/biorlhf/kmp_sft_model_final/"
echo ' Data:'
report_data 'GeneLab fgsea' "${SCRATCH}/data/GeneLab_benchmark/processed/fgsea/"
report_data 'GeneLab evaluation' "${SCRATCH}/data/GeneLab_benchmark/evaluation/"
report_data 'BioEval' "${SCRATCH}/data/BioEval/data/"
report_data 'SpaceOmicsBench' "${SCRATCH}/data/SpaceOmicsBench/v3/evaluation/llm/"
REMOTE_CHECKS
# Closing summary: confirm completion and list the follow-up commands to run
# on the remote host. One printf call emits every line, one argument per line.
printf '%s\n' \
  "" \
  "============================================================" \
  "Deployment complete!" \
  "" \
  "Next steps on Cayuga:" \
  " ssh ${REMOTE}" \
  " cd ${SCRATCH}/training/BioRLHF" \
  " bash scripts/setup_cayuga_grpo.sh" \
  " sbatch scripts/run_grpo_mve.sh" \
  "============================================================"