#!/bin/bash
#
# AMP Flow training monitor.
#
# Prints a point-in-time report with four sections: training process status,
# GPU usage, recent log output, and checkpoint files. The script exits with
# status 1 right after the first section when the training process is not
# running.

echo "=== AMP Flow Training Monitor ==="
echo "Timestamp: $(date)"
echo ""

# --- 1. Process status -------------------------------------------------------
# The training job is located by matching its script name against full command
# lines (pgrep -f). The lookup is done once so the "is it running" check and
# the reported PID cannot disagree. -o picks the oldest match, so a single PID
# is reported even if several processes share that command line.
echo "1. Process Status:"
PID=$(pgrep -o -f "amp_flow_training_single_gpu_full_data.py")
if [[ -n "$PID" ]]; then
  echo "✓ Training process is running"
  echo " PID: $PID"
  # ps pads the etime column with blanks; strip them for a tidy line.
  echo " Runtime: $(ps -o etime= -p "$PID" | tr -d '[:space:]')"
else
  echo "❌ Training process not found"
  # Nothing else is reported when the job is not running.
  exit 1
fi

echo ""

# --- 2. GPU usage ------------------------------------------------------------

#######################################
# Formats GPU rows for the report.
# Inputs:  stdin - CSV rows "index, name, utilization, mem_used, mem_total"
#          (no header, no units), one GPU per line.
# Outputs: one line per GPU on stdout.
#######################################
format_gpu_rows() {
  local idx name util mem_used mem_total
  # Fields are separated by ", "; drop the blanks after each comma so they do
  # not end up inside the printed values.
  sed 's/,[[:space:]]*/,/g' \
    | while IFS=, read -r idx name util mem_used mem_total; do
        echo " GPU $idx ($name): $util% | ${mem_used}MB/${mem_total}MB"
      done
}

echo "2. GPU Usage:"
if command -v nvidia-smi > /dev/null 2>&1; then
  nvidia-smi \
    --query-gpu=index,name,utilization.gpu,memory.used,memory.total \
    --format=csv,noheader,nounits \
    | format_gpu_rows
else
  echo " ❌ nvidia-smi not found"
fi

echo ""

# --- 3. Recent log output ----------------------------------------------------

#######################################
# Reports the size and the last five lines of a log file.
# Arguments: $1 - path to the log file
# Outputs:   size line, "Last 5 lines:" heading and the indented tail on
#            stdout, or a "not found" line when $1 is not a regular file.
#######################################
report_log() {
  local log_file=$1
  if [[ -f "$log_file" ]]; then
    echo " Log file size: $(du -h -- "$log_file" | cut -f1)"
    echo " Last 5 lines:"
    tail -n 5 -- "$log_file" | sed 's/^/ /'
  else
    echo " ❌ Log file not found"
  fi
}

echo "3. Recent Log Output:"
# The log is looked up relative to the current directory unless AMP_LOG_FILE
# points somewhere else.
report_log "${AMP_LOG_FILE:-overnight_training.log}"

echo ""

# --- 4. Checkpoint files -----------------------------------------------------

#######################################
# Reports how many *.pth files a directory holds and which one is newest.
# Arguments: $1 - checkpoint directory
# Outputs:   directory, count and newest file name (by modification time) on
#            stdout; "None yet" when there are no *.pth files; a "not found"
#            line when $1 is not a directory.
#######################################
report_checkpoints() {
  local dir=$1
  if [[ ! -d "$dir" ]]; then
    echo " ❌ Checkpoint directory not found"
    return 0
  fi

  echo " Checkpoint directory: $dir"

  local path latest="" count=0
  for path in "$dir"/*.pth; do
    # An unmatched glob is left as the literal pattern; skip it.
    [[ -e "$path" ]] || continue
    count=$((count + 1))
    if [[ -z "$latest" || "$path" -nt "$latest" ]]; then
      latest=$path
    fi
  done

  echo " Number of checkpoints: $count"
  echo " Latest checkpoint:"
  # Decide explicitly instead of relying on a pipeline's exit status, which
  # stays 0 when there is nothing to list.
  if [[ -n "$latest" ]]; then
    echo " ${latest##*/}"
  else
    echo " None yet"
  fi
}

echo "4. Checkpoint Files:"
# AMP_CHECKPOINT_DIR overrides the default checkpoint location.
report_checkpoints "${AMP_CHECKPOINT_DIR:-/data2/edwardsun/flow_checkpoints}"

echo ""
echo "=== End Monitor ==="