#!/bin/bash
# Launch LoRA SFT training of the BioRLHF ecosystem model on the combined
# dataset, then print next-step evaluation commands.
#
# Strict-ish mode: -u fails on unset variables, pipefail propagates pipeline
# failures. -e is intentionally omitted because the script inspects the
# trainer's exit status to print a success/failure banner.
set -uo pipefail
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
# Startup banner: what is running, when, and where.
printf '%s\n' \
  "============================================================" \
  "BioRLHF Ecosystem Training" \
  "============================================================"
printf 'Start time: %s\n' "$(date)"
printf 'Host: %s\n' "$(hostname)"
echo ""
|
|
| |
| |
| |
|
|
| |
# Run from the repository root (the parent of this script's directory) so
# relative paths such as data/combined_training.json resolve no matter
# where the script was invoked from. Fail with a diagnostic, not silently.
cd "$(dirname "$0")/.." || { echo "ERROR: cannot cd to repository root" >&2; exit 1; }
echo "Working directory: $(pwd)"
|
|
| |
# Report GPU name/memory if nvidia-smi is available; the stderr suppression
# is intentional so a missing driver degrades to the fallback message.
echo ""
echo "GPU Information:"
if ! nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv 2>/dev/null; then
  echo "No GPU detected"
fi
echo ""
|
|
| |
| |
| |
|
|
| |
# --- Training configuration (constants; readonly guards against accidental
# --- reassignment further down the script) ---

# Base model, training data, and output location.
readonly MODEL="mistralai/Mistral-7B-v0.3"
readonly DATASET="data/combined_training.json"
readonly OUTPUT_DIR="./ecosystem_improved_model"

# Optimization hyperparameters. Effective batch = BATCH_SIZE * GRAD_ACCUM.
readonly EPOCHS=10
readonly BATCH_SIZE=4
readonly GRAD_ACCUM=4
readonly LEARNING_RATE=2e-4
readonly MAX_LENGTH=1024

# LoRA adapter rank and scaling factor.
readonly LORA_R=64
readonly LORA_ALPHA=128

# Weights & Biases tracking; run name is timestamped to stay unique.
readonly WANDB_PROJECT="biorlhf"
readonly WANDB_RUN="ecosystem_improved_$(date +%Y%m%d_%H%M%S)"
|
|
| |
| |
| |
# Echo the effective configuration so training runs are self-documenting
# in the log output.
printf '%s\n' "============================================================"
printf 'Configuration:\n'
printf '%s\n' "============================================================"
printf 'Model: %s\n' "$MODEL"
printf 'Dataset: %s\n' "$DATASET"
printf 'Output: %s\n' "$OUTPUT_DIR"
printf 'Epochs: %s\n' "$EPOCHS"
printf 'Batch size: %s (effective: %s)\n' "$BATCH_SIZE" "$((BATCH_SIZE * GRAD_ACCUM))"
printf 'LoRA r/α: %s / %s\n' "$LORA_R" "$LORA_ALPHA"
printf 'Max length: %s\n' "$MAX_LENGTH"
printf '\n'
|
|
| |
# Abort early with guidance if the merged training set has not been built.
# Diagnostics go to stderr so stdout logs stay clean.
if [ ! -f "$DATASET" ]; then
    echo "ERROR: Dataset not found at $DATASET" >&2
    echo "Run: python scripts/merge_training_data.py" >&2
    exit 1
fi
|
|
| |
# Count training examples. The path is passed as argv rather than being
# interpolated into the Python source, so quotes/spaces in the path are safe.
# A parse failure aborts instead of continuing with an empty count.
EXAMPLE_COUNT=$(python3 -c 'import json, sys; print(len(json.load(open(sys.argv[1]))))' "$DATASET") || {
    echo "ERROR: could not read $DATASET as JSON" >&2
    exit 1
}
echo "Dataset contains $EXAMPLE_COUNT examples"
echo ""
|
|
| |
| |
| |
# Visual separator before the (long) trainer output begins.
printf '%s\n' \
  "============================================================" \
  "Starting Training..." \
  "============================================================"
|
|
# Launch fine-tuning. Testing the command directly in the if-condition is
# more robust than checking $? on a later line (and keeps working under
# set -e). All variable arguments are quoted.
if python3 sft_train_v2.py \
    --model "$MODEL" \
    --dataset "$DATASET" \
    --output_dir "$OUTPUT_DIR" \
    --epochs "$EPOCHS" \
    --batch_size "$BATCH_SIZE" \
    --grad_accum "$GRAD_ACCUM" \
    --lr "$LEARNING_RATE" \
    --max_length "$MAX_LENGTH" \
    --lora_r "$LORA_R" \
    --lora_alpha "$LORA_ALPHA" \
    --use_4bit \
    --wandb_project "$WANDB_PROJECT" \
    --wandb_run "$WANDB_RUN"; then
    echo ""
    echo "============================================================"
    echo "✅ Training Complete!"
    echo "============================================================"
    echo "Model saved to: $OUTPUT_DIR"
    echo "End time: $(date)"
    echo ""
    echo "Next steps:"
    echo "1. Evaluate on SpaceOmicsBench: python evaluate_model.py --model $OUTPUT_DIR"
    echo "2. Evaluate on CAMELOT: python evaluate_model.py --model $OUTPUT_DIR --benchmark camelot"
    echo "3. Compare with baseline: python compare_models.py"
else
    # Failure diagnostics go to stderr.
    echo "" >&2
    echo "============================================================" >&2
    echo "❌ Training Failed!" >&2
    echo "============================================================" >&2
    echo "Check the error messages above." >&2
    exit 1
fi
|
|