bot committed on
Commit ·
684902a
1
Parent(s): 9092725
Retry final upload 5x, skip shutdown on failure
Browse files
- lerobot/src/lerobot/scripts/lerobot_train.py +21 -10
- train_cloud.sh +5 -2
lerobot/src/lerobot/scripts/lerobot_train.py
CHANGED
|
@@ -571,16 +571,27 @@ def train(cfg: TrainPipelineConfig, accelerator: Accelerator | None = None):
|
|
| 571 |
logging.info("End of training")
|
| 572 |
|
| 573 |
if cfg.policy.push_to_hub:
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
|
| 577 |
-
unwrapped_policy
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 584 |
|
| 585 |
# Properly clean up the distributed process group
|
| 586 |
accelerator.wait_for_everyone()
|
|
|
|
| 571 |
logging.info("End of training")
|
| 572 |
|
| 573 |
if cfg.policy.push_to_hub:
|
| 574 |
+
upload_success = False
|
| 575 |
+
for attempt in range(5):
|
| 576 |
+
try:
|
| 577 |
+
unwrapped_policy = accelerator.unwrap_model(policy)
|
| 578 |
+
if cfg.policy.use_peft:
|
| 579 |
+
unwrapped_policy.push_model_to_hub(cfg, peft_model=unwrapped_policy)
|
| 580 |
+
else:
|
| 581 |
+
unwrapped_policy.push_model_to_hub(cfg)
|
| 582 |
+
preprocessor.push_to_hub(cfg.policy.repo_id)
|
| 583 |
+
postprocessor.push_to_hub(cfg.policy.repo_id)
|
| 584 |
+
logging.info("Final model uploaded to HF successfully")
|
| 585 |
+
upload_success = True
|
| 586 |
+
break
|
| 587 |
+
except Exception as e:
|
| 588 |
+
logging.error(f"Final model upload attempt {attempt+1}/5 failed: {e}")
|
| 589 |
+
import time
|
| 590 |
+
time.sleep(30)
|
| 591 |
+
if not upload_success:
|
| 592 |
+
logging.error("CRITICAL: Final model upload failed after 5 attempts. DO NOT shut down - weights are still on disk.")
|
| 593 |
+
import sys
|
| 594 |
+
sys.exit(1) # Non-zero exit prevents shutdown in train_cloud.sh
|
| 595 |
|
| 596 |
# Properly clean up the distributed process group
|
| 597 |
accelerator.wait_for_everyone()
|
train_cloud.sh
CHANGED
|
@@ -74,7 +74,10 @@ repo_id='StrongRoboticsLab/pi05-so100-diverse', repo_type='model')
|
|
| 74 |
print('Log uploaded')
|
| 75 |
" 2>&1 | tee -a "$LOG_FILE"
|
| 76 |
|
| 77 |
-
#
|
| 78 |
-
if command -v sudo &> /dev/null; then
|
| 79 |
sudo shutdown -h now
|
|
|
|
|
|
|
|
|
|
| 80 |
fi
|
|
|
|
| 74 |
print('Log uploaded')
|
| 75 |
" 2>&1 | tee -a "$LOG_FILE"
|
| 76 |
|
| 77 |
+
# Only auto-shutdown if training succeeded (exit 0 = weights uploaded)
|
| 78 |
+
if [ "$TRAIN_EXIT" -eq 0 ] && command -v sudo &> /dev/null; then
|
| 79 |
sudo shutdown -h now
|
| 80 |
+
else
|
| 81 |
+
echo "=== NOT shutting down: training exited with code $TRAIN_EXIT ===" | tee -a "$LOG_FILE"
|
| 82 |
+
echo "=== Weights may still be on disk at /ephemeral/production_run ===" | tee -a "$LOG_FILE"
|
| 83 |
fi
|