#!/bin/bash
#
# Launch (or resume) a pi05 fine-tuning run on the SO-100 community
# dataset, mirror all output to a timestamped log file, and leave the
# node in a recoverable state afterwards.
#
# Required env: WANDB_API_KEY, HF_TOKEN
# Optional env: NUM_GPUS (default 1), DATASET_DIR (default below)

# Deliberately do NOT exit on error: even a failed training run must
# still reach the log-upload and shutdown-guard steps at the bottom.
set +e

# One log file per invocation, e.g. training_20240101_120000.log
readonly LOG_FILE="training_$(date +%Y%m%d_%H%M%S).log"

# Tunables, overridable from the environment.
: "${NUM_GPUS:=1}"
: "${DATASET_DIR:=/ephemeral/community_dataset_v3}"
|
|
# Fail fast when required credentials or the dataset are missing.
for required_var in WANDB_API_KEY HF_TOKEN; do
  if [[ -z "${!required_var}" ]]; then
    echo "ERROR: $required_var not set"
    exit 1
  fi
done

if [[ ! -d "$DATASET_DIR" ]]; then
  echo "ERROR: Dataset not at $DATASET_DIR"
  exit 1
fi
|
|
# Announce the run; `tee` (no -a) truncates/creates this session's log.
printf '%s\n' "=== Starting Training ===" | tee "$LOG_FILE"

# Activate the training environment when conda is present; a no-op on
# images where the dependencies are installed system-wide.
if command -v conda > /dev/null 2>&1; then
  eval "$(conda shell.bash hook)"
  conda activate lerobot
fi
|
|
# Build the accelerate launcher flags as an ARRAY instead of a
# whitespace-split string (the old unquoted $ACCEL_FLAGS expansion was
# SC2086-fragile and would break on any value containing spaces).
launch_args=()
if [ "$NUM_GPUS" -gt 1 ]; then
  launch_args+=(--multi_gpu --num_processes "$NUM_GPUS")
fi

# Resume from the last checkpoint when one exists; otherwise start fresh
# from the pretrained pi05 base policy. Also an array for safe quoting.
resume_args=()
LAST_CKPT="/ephemeral/production_run/checkpoints/last/pretrained_model/train_config.json"
if [ -f "$LAST_CKPT" ]; then
  echo "Resuming from checkpoint: $LAST_CKPT" | tee -a "$LOG_FILE"
  resume_args+=(--resume=true "--config_path=$LAST_CKPT")
else
  echo "Starting fresh training" | tee -a "$LOG_FILE"
  resume_args+=(--policy.path=lerobot/pi05_base)
fi

# Launch training, streaming output to both the console and the log.
# NOTE: this pipeline must remain the LAST command in this section so
# that the code below it can read the trainer's exit status from
# PIPESTATUS[0] (tee's status would otherwise mask it).
python3.12 -m accelerate.commands.launch "${launch_args[@]}" \
  -m lerobot.scripts.lerobot_train \
  "${resume_args[@]}" \
  --dataset.repo_id="so100:$DATASET_DIR:/workspace/pi05-so100-diverse/filtered_index.json:/workspace/pi05-so100-diverse/norm_stats.json" \
  --policy.train_expert_only=true \
  --policy.dtype=bfloat16 \
  --policy.gradient_checkpointing=false \
  --policy.push_to_hub=true \
  --policy.repo_id=StrongRoboticsLab/pi05-so100-diverse \
  --policy.normalization_mapping='{"VISUAL": "IDENTITY", "STATE": "MEAN_STD", "ACTION": "MEAN_STD"}' \
  --policy.scheduler_warmup_steps=1000 \
  --policy.scheduler_decay_steps=340000 \
  --rename_map='{"observation.images.image": "observation.images.base_0_rgb", "observation.images.image2": "observation.images.left_wrist_0_rgb"}' \
  --batch_size=16 \
  --steps=340000 \
  --save_freq=5000 \
  --log_freq=50 \
  --num_workers=4 \
  --wandb.enable=true \
  --wandb.project=pi05-so100-diverse \
  --output_dir=/ephemeral/production_run \
  2>&1 | tee -a "$LOG_FILE"
|
|
# Exit status of the trainer itself (pipeline stage 0), not of `tee`.
# Must stay the first command after the training pipeline.
TRAIN_EXIT=${PIPESTATUS[0]}

echo "=== Training Complete (exit: $TRAIN_EXIT) ===" | tee -a "$LOG_FILE"

# Best-effort: archive this run's log next to the model on the Hub.
# Uses python3.12 for consistency with the trainer invocation above, and
# passes the filename through the environment instead of interpolating
# shell data into Python source (robust to quotes/odd characters).
LOG_FILE="$LOG_FILE" python3.12 - <<'PY' 2>&1 | tee -a "$LOG_FILE"
import os
from huggingface_hub import HfApi

log = os.environ["LOG_FILE"]
HfApi().upload_file(
    path_or_fileobj=log,
    path_in_repo=f"logs/{log}",
    repo_id="StrongRoboticsLab/pi05-so100-diverse",
    repo_type="model",
)
print('Log uploaded')
PY
|
|
| |
# Power the node off only after a clean run; otherwise keep it alive so
# the operator can inspect logs and recover weights from disk.
#
# Fix: the old `[ exit -eq 0 ] && command -v sudo` conflated "training
# failed" with "sudo missing", so a SUCCESSFUL run on a sudo-less host
# printed the misleading "NOT shutting down: training exited with code 0".
maybe_shutdown() {
  local exit_code=$1
  if [ "$exit_code" -eq 0 ]; then
    if command -v sudo &> /dev/null; then
      sudo shutdown -h now
    else
      echo "=== Training succeeded; sudo unavailable, skipping shutdown ===" | tee -a "$LOG_FILE"
    fi
  else
    echo "=== NOT shutting down: training exited with code $exit_code ===" | tee -a "$LOG_FILE"
    echo "=== Weights may still be on disk at /ephemeral/production_run ===" | tee -a "$LOG_FILE"
  fi
}
maybe_shutdown "$TRAIN_EXIT"
|
|