#!/bin/bash
#
# BioGRPO Phase 4 training job.
#
# Steps, in order:
#   1. cd into the BioRLHF working tree under SCRATCH and create ./logs
#   2. load the cuda/12.1 module and activate the `biorlhf` conda environment
#   3. export cache, W&B and dataset locations for the training process
#   4. symlink kmp_sft_model_final into the working tree if it is absent
#   5. run `biorlhf-grpo --config configs/grpo_phase4.json`
#
# Exit status: 0 when training succeeds; the exit status of `biorlhf-grpo`
# when it fails; 1 when the working directory or conda environment cannot
# be entered.
#
# NOTE(review): no #SBATCH directives are present in this file. The failure
# banner points at logs/grpo_phase4_<jobid>.err, so the job presumably needs
# matching --output/--error options at submission time -- confirm.

SCRATCH="/athena/cayuga_0003/scratch/users/jak4013/otsuka"
WORKDIR="${SCRATCH}/training/BioRLHF"

echo "============================================================"
echo "BioGRPO Phase 4 Training"
echo "Job ID: $SLURM_JOB_ID"
echo "Node: $SLURMD_NODENAME"
echo "Working dir: $WORKDIR"
echo "Start time: $(date)"
echo "============================================================"

# Everything below uses paths relative to WORKDIR, so bail out early if it
# is missing.
cd "$WORKDIR" || { echo "WORKDIR not found: $WORKDIR" >&2; exit 1; }
mkdir -p logs

# Start from a clean module environment.
module purge
module load cuda/12.1

# conda.sh defines the `conda` shell function needed by `conda activate`
# in a non-interactive shell.
# shellcheck source=/dev/null
. /home/fs01/jak4013/miniconda3/miniconda3/etc/profile.d/conda.sh
if ! conda activate biorlhf; then
  # Without the environment the training entry point would be missing or
  # the wrong one, so stop here instead of failing later.
  echo "Failed to activate conda environment: biorlhf" >&2
  exit 1
fi

echo ""
echo "GPU Information:"
nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv
echo ""

# Runtime configuration: a single visible GPU, caches and W&B output kept
# under the working tree.
export CUDA_VISIBLE_DEVICES=0
# NOTE(review): recent transformers releases deprecate TRANSFORMERS_CACHE in
# favour of HF_HOME; it is kept for older versions -- confirm which applies.
export TRANSFORMERS_CACHE="${WORKDIR}/cache/transformers"
export HF_HOME="${WORKDIR}/cache/huggingface"
export WANDB_DIR="${WORKDIR}/wandb"
export TOKENIZERS_PARALLELISM=false

# Dataset locations exported for the training process.
export GENELAB_BASE="${SCRATCH}/data/GeneLab_benchmark"
export BIOEVAL_DATA="${SCRATCH}/data/BioEval/data"
export SPACEOMICS_DATA="${SCRATCH}/data/SpaceOmicsBench/v3/evaluation/llm"
export BIOEVAL_ROOT="${SCRATCH}/data/BioEval"

# Quoted so each path stays a single argument.
mkdir -p -- "$TRANSFORMERS_CACHE" "$HF_HOME" "$WANDB_DIR"

# Make kmp_sft_model_final available inside WORKDIR. `-e` is false both for
# a missing path and for a dangling symlink; `ln -sfn` covers both cases by
# replacing a stale link in place.
sft_source="${SCRATCH}/training/biorlhf/kmp_sft_model_final"
sft_link="${WORKDIR}/kmp_sft_model_final"
if [[ ! -e "$sft_link" ]]; then
  if ln -sfn -- "$sft_source" "$sft_link"; then
    echo "Symlinked kmp_sft_model_final"
  else
    # Not fatal here; the script continues as the original did.
    echo "WARNING: could not symlink kmp_sft_model_final" >&2
  fi
fi

echo "Starting BioGRPO Phase 4 training..."
biorlhf-grpo --config configs/grpo_phase4.json
# Capture the status immediately: any later command (including the test
# below and every echo) overwrites $?.
train_status=$?

if (( train_status == 0 )); then
  echo ""
  echo "============================================================"
  echo "BioGRPO Phase 4 training completed!"
  echo "Model saved to: ./biogrpo_phase4_model"
  echo "End time: $(date)"
  echo "============================================================"
else
  # The banner stays on stdout so the job's .out log points to the .err log.
  echo ""
  echo "============================================================"
  echo "BioGRPO Phase 4 training failed with exit code ${train_status}"
  echo "Check logs/grpo_phase4_${SLURM_JOB_ID}.err for details"
  echo "============================================================"
  # Propagate the real status so the scheduler records it.
  exit "$train_status"
fi