#!/bin/bash
#
# Launch single-GPU AMP flow-matching training on the full dataset.
#
# Steps:
#   1. Verify that the model checkpoints and datasets the run needs exist.
#   2. Pin the job to one GPU and export the cuDNN environment flags.
#   3. Print the run configuration and the memory state of the target GPU.
#   4. Run the training program and report success or failure.
#
# The checkpoint files and the training script are referenced by relative
# path, so run this script from the directory that contains them.
#
# Exit status:
#   0      training finished successfully
#   1      a required file or directory is missing
#   other  exit status of the training program when it fails
#
# NOTE(review): the hyper-parameters echoed below are informational only;
# this script passes none of them to the training program. Several of the
# messages disagree with each other (banner says 15000 iterations, the
# configuration block says 6,000; batch size 96 vs. 128; checkpoints "every
# 1,000 epochs" vs. step-numbered checkpoint files). Confirm the real values
# against amp_flow_training_single_gpu_full_data.py and align the text.
#
# NOTE(review): the status glyphs in the messages were restored from
# mis-decoded bytes; the success glyph is a best guess.

# Fail fast on typos in variable names.
set -u

# GPU index used both for CUDA_VISIBLE_DEVICES and for the nvidia-smi report.
readonly GPU_ID=3

readonly DATA_ROOT="/data2/edwardsun/flow_project"
readonly EMBEDDINGS_DIR="${DATA_ROOT}/peptide_embeddings"
readonly ALL_EMBEDDINGS_FILE="${EMBEDDINGS_DIR}/all_peptide_embeddings.pt"
readonly UNIPROT_FILE="${DATA_ROOT}/test_uniprot_processed/uniprot_processed_data.json"
readonly COMPRESSOR_MODEL="final_compressor_model.pth"
readonly DECOMPRESSOR_MODEL="final_decompressor_model.pth"
readonly TRAINING_SCRIPT="amp_flow_training_single_gpu_full_data.py"

#######################################
# Print each argument on its own line to stderr and abort with status 1.
# Arguments: $@ - message lines
#######################################
die() {
  printf '%s\n' "$@" >&2
  exit 1
}

#######################################
# Print the start-up banner.
# Globals: GPU_ID (read)
#######################################
print_banner() {
  printf '%s\n' \
    "=== Launching Optimized Single GPU AMP Flow Matching Training with FULL DATA ===" \
    "Using GPU ${GPU_ID} for training (other GPUs are busy)" \
    "Using ALL available peptide embeddings and UniProt data" \
    "OVERNIGHT TRAINING: 15000 iterations with CFG support and H100 optimizations" \
    ""
}

#######################################
# Abort unless every input the training run depends on is present.
# The combined embeddings file is optional: its absence only produces a
# warning.
# Globals: COMPRESSOR_MODEL, DECOMPRESSOR_MODEL, EMBEDDINGS_DIR,
#          ALL_EMBEDDINGS_FILE, UNIPROT_FILE (all read)
#######################################
check_required_files() {
  echo "Checking required files..."

  [[ -f "${COMPRESSOR_MODEL}" ]] \
    || die "❌ Missing ${COMPRESSOR_MODEL}" \
           "Please run compressor_with_embeddings.py first"

  [[ -f "${DECOMPRESSOR_MODEL}" ]] \
    || die "❌ Missing ${DECOMPRESSOR_MODEL}" \
           "Please run compressor_with_embeddings.py first"

  [[ -d "${EMBEDDINGS_DIR}/" ]] \
    || die "❌ Missing ${EMBEDDINGS_DIR}/ directory" \
           "Please run final_sequence_encoder.py first"

  # Optional input: warn and carry on when it is absent.
  if [[ -f "${ALL_EMBEDDINGS_FILE}" ]]; then
    echo "✅ Found all_peptide_embeddings.pt (4.3GB - ALL peptide data)"
  else
    echo "⚠️ Warning: all_peptide_embeddings.pt not found"
    echo "Will use individual embedding files instead"
  fi

  [[ -f "${UNIPROT_FILE}" ]] \
    || die "❌ Missing ${UNIPROT_FILE}" \
           "This contains ALL UniProt data for CFG training"
  echo "✅ Found uniprot_processed_data.json (3.4GB - ALL UniProt data)"

  echo "✅ All required files found!"
  echo ""
}

#######################################
# Export the environment the training program runs under.
# Globals: GPU_ID (read); CUDA_VISIBLE_DEVICES, TORCH_CUDNN_V8_API_ENABLED,
#          TORCH_CUDNN_V8_API_DISABLED (exported)
#######################################
configure_environment() {
  # Restrict CUDA to the single chosen device.
  export CUDA_VISIBLE_DEVICES="${GPU_ID}"

  # NOTE(review): presumably these select PyTorch's cuDNN v8 API path;
  # confirm both are honoured by the installed PyTorch version.
  export TORCH_CUDNN_V8_API_ENABLED=1
  export TORCH_CUDNN_V8_API_DISABLED=0
}

#######################################
# Print the (informational) training configuration.
# Globals: GPU_ID (read)
#######################################
print_configuration() {
  printf '%s\n' \
    "=== Optimized Training Configuration ===" \
    " - GPU: ${GPU_ID} (CUDA_VISIBLE_DEVICES=${GPU_ID})" \
    " - Batch size: 96 (optimized based on profiling)" \
    " - Total iterations: 6,000" \
    " - Mixed precision: BF16 (H100 optimized)" \
    " - Learning rate: 4e-4 -> 2e-4 (cosine annealing)" \
    " - Warmup steps: 5,000" \
    " - Gradient clipping: 1.0" \
    " - Weight decay: 0.01" \
    " - Data workers: 16" \
    " - CFG dropout: 15%" \
    " - Validation: Every 10,000 steps" \
    " - Checkpoints: Every 1,000 epochs" \
    " - Estimated time: ~8-10 hours (overnight training)" \
    ""
}

#######################################
# Report name and memory usage of the GPU the job is pinned to.
# Purely informational: any problem querying the GPU is reported on stderr
# and does not stop the launch.
# Globals: GPU_ID (read)
#######################################
report_gpu_memory() {
  local gpu_info name total free

  if ! command -v nvidia-smi >/dev/null 2>&1; then
    echo " nvidia-smi not found; skipping GPU report" >&2
    return 0
  fi

  # nvidia-smi does not honour CUDA_VISIBLE_DEVICES, so the device has to be
  # selected explicitly; otherwise every GPU on the host is listed.
  # NOTE(review): assumes nvidia-smi and CUDA enumerate devices in the same
  # order on this machine - confirm, or set CUDA_DEVICE_ORDER=PCI_BUS_ID.
  if ! gpu_info=$(nvidia-smi --id="${GPU_ID}" \
      --query-gpu=name,memory.total,memory.free \
      --format=csv,noheader,nounits); then
    echo " nvidia-smi query for GPU ${GPU_ID} failed; skipping GPU report" >&2
    return 0
  fi

  while IFS=, read -r name total free; do
    # CSV fields are separated by ", "; drop the padding before doing math.
    total=${total//[[:space:]]/}
    free=${free//[[:space:]]/}

    echo " GPU: $name"
    echo " Total memory: ${total}MB"
    echo " Free memory: ${free}MB"
    # Guard the arithmetic: the fields can be non-numeric (e.g. "[N/A]").
    if [[ "$total" =~ ^[0-9]+$ && "$free" =~ ^[0-9]+$ ]] && (( total > 0 )); then
      echo " Available: $((free * 100 / total))%"
    else
      echo " Available: unknown"
    fi
  done <<<"$gpu_info"
}

#######################################
# Print the post-training summary and suggested next steps.
#######################################
print_summary() {
  printf '%s\n' \
    "" \
    "=== Optimized Overnight Training Complete with FULL DATA ===" \
    "Check for output files:" \
    " - amp_flow_model_best_optimized.pth (best validation model)" \
    " - amp_flow_model_final_optimized.pth (final model)" \
    " - amp_flow_checkpoint_optimized_step_*.pth (checkpoints every 1000 epochs)" \
    "" \
    "Training optimizations applied:" \
    " ✅ Mixed precision (BF16) for ~30-50% speedup" \
    " ✅ Increased batch size (128) for better H100 utilization" \
    " ✅ Optimized learning rate schedule with proper warmup" \
    " ✅ Gradient clipping for training stability" \
    " ✅ CFG dropout for better guidance" \
    " ✅ Validation monitoring and early stopping" \
    " ✅ PyTorch 2.x compilation for speedup" \
    "" \
    "Next steps:" \
    "1. Test the optimized model: python generate_amps.py" \
    "2. Compare performance with previous model" \
    "3. Implement reflow for 1-step generation" \
    "4. Add conditioning for toxicity" \
    "5. Fine-tune on specific AMP properties"
}

#######################################
# Entry point: validate inputs, configure the environment, run training.
#######################################
main() {
  local train_status=0

  print_banner
  check_required_files
  configure_environment
  print_configuration

  echo "Checking GPU capabilities..."
  report_gpu_memory
  echo ""

  echo "Starting optimized single GPU training on GPU ${GPU_ID} with FULL DATA..."
  echo ""

  python "${TRAINING_SCRIPT}" || train_status=$?

  # Do not claim success (or exit 0) when the training program failed.
  if (( train_status != 0 )); then
    printf '%s\n' \
      "" \
      "=== Training FAILED (exit code ${train_status}) ===" >&2
    exit "${train_status}"
  fi

  print_summary
}

main