# pi05-so100-diverse / train_cloud.sh
# Last change (commit 684902a, bot): "Retry final upload 5x, skip shutdown on failure"
# NOTE: because these header lines precede the shebang, run this script as
# `bash train_cloud.sh` (direct ./ execution requires the shebang on line 1).
#!/bin/bash
# Launch a training run on a cloud instance and power it off when done.
# Required env: WANDB_API_KEY, HF_TOKEN. Optional: NUM_GPUS, DATASET_DIR.
set +e  # errors are handled explicitly below; a failing step must not abort the script

NUM_GPUS="${NUM_GPUS:-1}"
DATASET_DIR="${DATASET_DIR:-/ephemeral/community_dataset_v3}"
LOG_FILE="training_$(date +%Y%m%d_%H%M%S).log"

# Fail fast when credentials or the dataset are missing.
if [[ -z "$WANDB_API_KEY" ]]; then
  echo "ERROR: WANDB_API_KEY not set"
  exit 1
fi
if [[ -z "$HF_TOKEN" ]]; then
  echo "ERROR: HF_TOKEN not set"
  exit 1
fi
if [[ ! -d "$DATASET_DIR" ]]; then
  echo "ERROR: Dataset not at $DATASET_DIR"
  exit 1
fi

echo "=== Starting Training ===" | tee "$LOG_FILE"

# Bare metal ships deps in a conda env named "lerobot"; Docker images install
# everything globally and have no conda on PATH, so activation is conditional.
if command -v conda > /dev/null 2>&1; then
  eval "$(conda shell.bash hook)"
  conda activate lerobot
fi
# Accelerate needs multi-process flags only when more than one GPU is requested.
ACCEL_FLAGS=""
if [[ "$NUM_GPUS" -gt 1 ]]; then
  ACCEL_FLAGS="--multi_gpu --num_processes $NUM_GPUS"
fi

# Resume from the latest checkpoint when its train_config.json is present;
# otherwise start fresh from the pretrained base policy.
RESUME_ARGS=""
LAST_CKPT="/ephemeral/production_run/checkpoints/last/pretrained_model/train_config.json"
if [[ -f "$LAST_CKPT" ]]; then
  RESUME_ARGS="--resume=true --config_path=$LAST_CKPT"
  echo "Resuming from checkpoint: $LAST_CKPT" | tee -a "$LOG_FILE"
else
  RESUME_ARGS="--policy.path=lerobot/pi05_base"
  echo "Starting fresh training" | tee -a "$LOG_FILE"
fi
# Launch training under accelerate. $ACCEL_FLAGS and $RESUME_ARGS are left
# unquoted ON PURPOSE so they word-split into separate CLI arguments
# (they contain no spaces-in-values, only space-separated flags).
# NOTE(review): dataset.repo_id packs the dataset dir, a filtered episode
# index, and norm stats into one colon-separated spec — presumably parsed by
# the "so100" dataset loader; confirm against the loader's format.
python3.12 -m accelerate.commands.launch $ACCEL_FLAGS \
-m lerobot.scripts.lerobot_train \
$RESUME_ARGS \
--dataset.repo_id="so100:$DATASET_DIR:/workspace/pi05-so100-diverse/filtered_index.json:/workspace/pi05-so100-diverse/norm_stats.json" \
--policy.train_expert_only=true \
--policy.dtype=bfloat16 \
--policy.gradient_checkpointing=false \
--policy.push_to_hub=true \
--policy.repo_id=StrongRoboticsLab/pi05-so100-diverse \
--policy.normalization_mapping='{"VISUAL": "IDENTITY", "STATE": "MEAN_STD", "ACTION": "MEAN_STD"}' \
--policy.scheduler_warmup_steps=1000 \
--policy.scheduler_decay_steps=340000 \
--rename_map='{"observation.images.image": "observation.images.base_0_rgb", "observation.images.image2": "observation.images.left_wrist_0_rgb"}' \
--batch_size=16 \
--steps=340000 \
--save_freq=5000 \
--log_freq=50 \
--num_workers=4 \
--wandb.enable=true \
--wandb.project=pi05-so100-diverse \
--output_dir=/ephemeral/production_run \
2>&1 | tee -a "$LOG_FILE"
# tee is the last stage of the pipeline, so plain $? would be tee's status;
# PIPESTATUS[0] recovers the real exit code of the training process.
TRAIN_EXIT=${PIPESTATUS[0]}
echo "=== Training Complete (exit: $TRAIN_EXIT) ===" | tee -a "$LOG_FILE"
# Upload the training log to the Hub model repo so it survives instance
# shutdown. End-of-run network blips are common on ephemeral instances, so
# retry up to 5 times with a 30s backoff instead of attempting once and
# silently ignoring failure.
UPLOAD_OK=0
for ATTEMPT in 1 2 3 4 5; do
  python -c "
from huggingface_hub import HfApi
HfApi().upload_file(path_or_fileobj='$LOG_FILE', path_in_repo='logs/$LOG_FILE',
repo_id='StrongRoboticsLab/pi05-so100-diverse', repo_type='model')
print('Log uploaded')
" 2>&1 | tee -a "$LOG_FILE"
  # tee is the last pipeline stage; PIPESTATUS[0] is the python exit code.
  if [ "${PIPESTATUS[0]}" -eq 0 ]; then
    UPLOAD_OK=1
    break
  fi
  echo "Log upload attempt $ATTEMPT/5 failed" | tee -a "$LOG_FILE"
  # Back off only between attempts, not after the final failure.
  [ "$ATTEMPT" -lt 5 ] && sleep 30
done
if [ "$UPLOAD_OK" -eq 0 ]; then
  echo "WARNING: log upload failed after 5 attempts" | tee -a "$LOG_FILE"
fi
# Only auto-shutdown if training succeeded (exit 0 = weights uploaded).
# The original collapsed "training failed" and "sudo unavailable" into one
# branch, which printed a misleading "NOT shutting down ... code 0" message
# after a successful run on hosts without sudo; the cases are now distinct.
if [ "$TRAIN_EXIT" -ne 0 ]; then
  echo "=== NOT shutting down: training exited with code $TRAIN_EXIT ===" | tee -a "$LOG_FILE"
  echo "=== Weights may still be on disk at /ephemeral/production_run ===" | tee -a "$LOG_FILE"
elif command -v sudo &> /dev/null; then
  sudo shutdown -h now
else
  echo "=== Training succeeded but sudo is unavailable; shut the instance down manually ===" | tee -a "$LOG_FILE"
fi