bot committed on
Commit
684902a
·
1 Parent(s): 9092725

Retry final upload 5x, skip shutdown on failure

Browse files
lerobot/src/lerobot/scripts/lerobot_train.py CHANGED
@@ -571,16 +571,27 @@ def train(cfg: TrainPipelineConfig, accelerator: Accelerator | None = None):
571
  logging.info("End of training")
572
 
573
  if cfg.policy.push_to_hub:
574
- try:
575
- unwrapped_policy = accelerator.unwrap_model(policy)
576
- if cfg.policy.use_peft:
577
- unwrapped_policy.push_model_to_hub(cfg, peft_model=unwrapped_policy)
578
- else:
579
- unwrapped_policy.push_model_to_hub(cfg)
580
- preprocessor.push_to_hub(cfg.policy.repo_id)
581
- postprocessor.push_to_hub(cfg.policy.repo_id)
582
- except Exception as e:
583
- logging.warning(f"End-of-training push to hub failed (non-fatal): {e}")
 
 
 
 
 
 
 
 
 
 
 
584
 
585
  # Properly clean up the distributed process group
586
  accelerator.wait_for_everyone()
 
571
  logging.info("End of training")
572
 
573
  if cfg.policy.push_to_hub:
574
+ upload_success = False
575
+ for attempt in range(5):
576
+ try:
577
+ unwrapped_policy = accelerator.unwrap_model(policy)
578
+ if cfg.policy.use_peft:
579
+ unwrapped_policy.push_model_to_hub(cfg, peft_model=unwrapped_policy)
580
+ else:
581
+ unwrapped_policy.push_model_to_hub(cfg)
582
+ preprocessor.push_to_hub(cfg.policy.repo_id)
583
+ postprocessor.push_to_hub(cfg.policy.repo_id)
584
+ logging.info("Final model uploaded to HF successfully")
585
+ upload_success = True
586
+ break
587
+ except Exception as e:
588
+ logging.error(f"Final model upload attempt {attempt+1}/5 failed: {e}")
589
+ import time
590
+ time.sleep(30)
591
+ if not upload_success:
592
+ logging.error("CRITICAL: Final model upload failed after 5 attempts. DO NOT shut down - weights are still on disk.")
593
+ import sys
594
+ sys.exit(1) # Non-zero exit prevents shutdown in train_cloud.sh
595
 
596
  # Properly clean up the distributed process group
597
  accelerator.wait_for_everyone()
train_cloud.sh CHANGED
@@ -74,7 +74,10 @@ repo_id='StrongRoboticsLab/pi05-so100-diverse', repo_type='model')
74
  print('Log uploaded')
75
  " 2>&1 | tee -a "$LOG_FILE"
76
 
77
- # Auto-shutdown if running on cloud (sudo available)
78
- if command -v sudo &> /dev/null; then
79
  sudo shutdown -h now
 
 
 
80
  fi
 
74
  print('Log uploaded')
75
  " 2>&1 | tee -a "$LOG_FILE"
76
 
77
+ # Only auto-shutdown if training succeeded (exit 0 = weights uploaded)
78
+ if [ "$TRAIN_EXIT" -eq 0 ] && command -v sudo &> /dev/null; then
79
  sudo shutdown -h now
80
+ else
81
+ echo "=== NOT shutting down: training exited with code $TRAIN_EXIT ===" | tee -a "$LOG_FILE"
82
+ echo "=== Weights may still be on disk at /ephemeral/production_run ===" | tee -a "$LOG_FILE"
83
  fi