#!/bin/bash
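# Slurm resource requests. The values below are illustrative placeholders; adjust
# the job name, CPU count, memory, and walltime for the target cluster. The error
# path matches the logs/sft_${SLURM_JOB_ID}.err file checked at the end of this
# script, and logs/ must already exist when the job starts for Slurm to open these
# files (the mkdir further down only runs after the job has launched).
#SBATCH --job-name=biorlhf_sft
#SBATCH --output=logs/sft_%j.out
#SBATCH --error=logs/sft_%j.err
#SBATCH --gres=gpu:1
#SBATCH --cpus-per-task=8
#SBATCH --mem=64G
#SBATCH --time=24:00:00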

echo "============================================================"
echo "BioRLHF SFT Training"
echo "Job ID: $SLURM_JOB_ID"
echo "Node: $SLURMD_NODENAME"
echo "Start time: $(date)"
echo "============================================================"

# Log directory referenced by the #SBATCH --output/--error paths
mkdir -p logs

# Load the cluster CUDA toolchain
module purge
module load cuda/12.1

# Activate the project conda environment
source ~/.bashrc
conda activate biorlhf
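
# Quick environment check (assumption: the biorlhf env provides the python and
# torch that sft_train.py needs). Logs which interpreter is active and whether
# CUDA is visible before the long-running job starts.
echo "Python: $(command -v python)"
python -c "import torch; print('torch', torch.__version__, '| cuda available:', torch.cuda.is_available())"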

# Report the GPU allocated to this job
echo ""
echo "GPU Information:"
nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv
echo ""

# Use the first visible GPU and keep Hugging Face / W&B caches in the working directory
export CUDA_VISIBLE_DEVICES=0
export TRANSFORMERS_CACHE="./cache/transformers"
export HF_HOME="./cache/huggingface"
export WANDB_DIR="./wandb"

mkdir -p "$TRANSFORMERS_CACHE" "$HF_HOME" "$WANDB_DIR"

# LoRA fine-tuning of Mistral-7B-v0.3 on the KMP SFT dataset.
# Effective batch size = batch_size * grad_accum = 4 * 4 = 16 sequences per optimizer step.
echo "Starting SFT training..."
python sft_train.py \
    --model "mistralai/Mistral-7B-v0.3" \
    --dataset "kmp_sft_dataset.json" \
    --output_dir "./kmp_sft_model" \
    --epochs 3 \
    --batch_size 4 \
    --grad_accum 4 \
    --lr 2e-4 \
    --max_seq_length 2048 \
    --lora_r 32 \
    --lora_alpha 64 \
    --wandb_project "biorlhf" \
    --wandb_run "kmp_sft_$(date +%Y%m%d_%H%M%S)"
# Capture the training exit status immediately; any later command would overwrite $?
TRAIN_EXIT=$?

if [ "$TRAIN_EXIT" -eq 0 ]; then
    echo ""
    echo "============================================================"
    echo "Training completed successfully!"
    echo "Model saved to: ./kmp_sft_model"
    echo "End time: $(date)"
    echo "============================================================"
else
    echo ""
    echo "============================================================"
    echo "Training failed with exit code $TRAIN_EXIT"
    echo "Check logs/sft_${SLURM_JOB_ID}.err for details"
    echo "============================================================"
    exit 1
fi
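
# On success, record what was written to the output directory in the job log
# (assumption: sft_train.py saves adapter/checkpoint files under --output_dir).
echo "Output directory contents:"
ls -lh ./kmp_sft_model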