Retry final upload 5x, skip shutdown on failure

Files changed (2) hide show

lerobot/src/lerobot/scripts/lerobot_train.py CHANGED Viewed

@@ -571,16 +571,27 @@ def train(cfg: TrainPipelineConfig, accelerator: Accelerator | None = None):
         logging.info("End of training")
         if cfg.policy.push_to_hub:
-            try:
-                unwrapped_policy = accelerator.unwrap_model(policy)
-                if cfg.policy.use_peft:
-                    unwrapped_policy.push_model_to_hub(cfg, peft_model=unwrapped_policy)
-                else:
-                    unwrapped_policy.push_model_to_hub(cfg)
-                preprocessor.push_to_hub(cfg.policy.repo_id)
-                postprocessor.push_to_hub(cfg.policy.repo_id)
-            except Exception as e:
-                logging.warning(f"End-of-training push to hub failed (non-fatal): {e}")
     # Properly clean up the distributed process group
     accelerator.wait_for_everyone()

         logging.info("End of training")
         if cfg.policy.push_to_hub:
+            # Retry the end-of-training hub upload so a transient failure does
+            # not lose the final weights; exit non-zero if every attempt fails.
+            #
+            # The import is aliased on purpose: a plain function-scope
+            # `import time` would make `time` a local name for the WHOLE
+            # enclosing function, so any earlier `time.*` call in it would
+            # raise UnboundLocalError. NOTE(review): confirm whether this
+            # function uses `time`/`sys` earlier; the alias is safe either way.
+            from time import sleep as _upload_retry_sleep
+
+            max_upload_attempts = 5
+            upload_retry_delay_s = 30
+            upload_success = False
+            for attempt in range(1, max_upload_attempts + 1):
+                try:
+                    unwrapped_policy = accelerator.unwrap_model(policy)
+                    if cfg.policy.use_peft:
+                        unwrapped_policy.push_model_to_hub(cfg, peft_model=unwrapped_policy)
+                    else:
+                        unwrapped_policy.push_model_to_hub(cfg)
+                    preprocessor.push_to_hub(cfg.policy.repo_id)
+                    postprocessor.push_to_hub(cfg.policy.repo_id)
+                except Exception as e:
+                    logging.error(f"Final model upload attempt {attempt}/{max_upload_attempts} failed: {e}")
+                    # Back off only when another attempt follows; sleeping
+                    # after the last failure would just delay the exit.
+                    if attempt < max_upload_attempts:
+                        _upload_retry_sleep(upload_retry_delay_s)
+                else:
+                    # Success path kept out of the `try` so only the pushes
+                    # themselves can trigger a retry.
+                    logging.info("Final model uploaded to HF successfully")
+                    upload_success = True
+                    break
+            if not upload_success:
+                logging.error(
+                    f"CRITICAL: Final model upload failed after {max_upload_attempts} attempts. "
+                    "DO NOT shut down - weights are still on disk."
+                )
+                # NOTE(review): this exits before accelerator.wait_for_everyone()
+                # below; in a multi-process run other ranks may be left waiting
+                # at that barrier - confirm this is acceptable.
+                # `raise SystemExit(1)` is what sys.exit(1) does, without
+                # needing a function-scope `import sys`.
+                raise SystemExit(1)  # Non-zero exit prevents shutdown in train_cloud.sh
     # Properly clean up the distributed process group
     accelerator.wait_for_everyone()

train_cloud.sh CHANGED Viewed

@@ -74,7 +74,10 @@ repo_id='StrongRoboticsLab/pi05-so100-diverse', repo_type='model')
 print('Log uploaded')
 " 2>&1 | tee -a "$LOG_FILE"
-# Auto-shutdown if running on cloud (sudo available)
-if command -v sudo &> /dev/null; then
     sudo shutdown -h now
 fi

 print('Log uploaded')
 " 2>&1 | tee -a "$LOG_FILE"
+# Only auto-shutdown if training succeeded (exit 0 = weights uploaded).
+# An unset/empty TRAIN_EXIT is treated as a failure so the machine (and the
+# weights on its disk) is never shut down on an unknown training result.
+# NOTE(review): TRAIN_EXIT must be captured from the training command earlier
+# in this script - that assignment is not visible in this hunk.
+if [ "${TRAIN_EXIT:-1}" -ne 0 ]; then
+    echo "=== NOT shutting down: training exited with code ${TRAIN_EXIT:-unknown} ===" | tee -a "$LOG_FILE"
+    echo "=== Weights may still be on disk at /ephemeral/production_run ===" | tee -a "$LOG_FILE"
+elif command -v sudo &> /dev/null; then
     sudo shutdown -h now
+else
+    # Training succeeded but this is not a cloud box (no sudo): nothing to do.
+    echo "=== NOT shutting down: sudo is not available ===" | tee -a "$LOG_FILE"
 fi