chomera / launch_turbo.sh

fix: tcmalloc debug .so crash, add error trapping, chmod note

e80380b verified 10 days ago

3.06 kB

	#!/bin/bash
	# launch_turbo.sh — start ch1mera training with the CPU-side tuning below applied.
	#
	# One-time setup: chmod +x launch_turbo.sh
	# Invocation:     ./launch_turbo.sh [train_hyper.py args...]
	#   short form:   ./launch_turbo.sh --growlength --reservoir
	#   long form:    ./launch_turbo.sh --scale tiny --seq_len 128 --max_steps 300 --batch_size 32 --growlength --reservoir
	#
	# Environment:
	#   CHIMERA_PCORE_THREADS  thread count used for OpenMP/MKL (defaults to 8)

	# Strict mode: abort on any unhandled failure, on use of an unset variable,
	# and when any stage of a pipeline fails.
	set -o errexit -o nounset -o pipefail
	# Before the shell exits on an error, say on stderr where it happened and
	# with which status, so a failed launch is never silent.
	trap 'echo "[TURBO] ✗ Failed at line $LINENO (exit $?)" >&2' ERR

	# ── Detect core topology ──
	# Target machine: Raptor Lake i7-14700T, a hybrid part with 8 P-cores and 12 E-cores.
	# Mixing P+E cores in OpenMP parallel regions causes barrier stalls:
	# P-cores finish tiles in X cycles, E-cores in ~2.5X → P-cores idle.
	# Solution: run on P-cores only.
	# NOTE(review): the earlier comment stated that P-cores expose AVX-512 BF16 and
	# E-cores do not — TODO confirm against the flags reported by `lscpu` on the target.

	# Count distinct physical cores in `lscpu -p`-style text read from stdin.
	# Lines starting with '#' are headers; column 2 of each data row is the core id.
	# Prints 0 when the input contains no data rows.
	count_physical_cores() {
	  awk -F, '!/^#/ && NF >= 2 && !seen[$2]++ { n++ } END { print n + 0 }'
	}

	PHYS_CORES=""
	if command -v lscpu >/dev/null 2>&1; then
	  PHYS_CORES=$(lscpu -p 2>/dev/null | count_physical_cores) || PHYS_CORES=""
	fi
	# PHYS_CORES is only echoed in this section, so a missing or failing lscpu
	# must not abort the launch.
	if [ -z "$PHYS_CORES" ] || [ "$PHYS_CORES" = "0" ]; then
	  PHYS_CORES="unknown"
	fi

	PCORE_THREADS=${CHIMERA_PCORE_THREADS:-8} # Override for non-hybrid CPUs
	# Reject empty, non-numeric, zero and leading-zero values up front instead of
	# handing a malformed thread count to the OpenMP/MKL runtimes.
	case "$PCORE_THREADS" in
	  '' | *[!0-9]* | 0*)
	    echo "[TURBO] ✗ CHIMERA_PCORE_THREADS must be a positive integer (got '$PCORE_THREADS')" >&2
	    exit 1
	    ;;
	esac
	echo "[TURBO] Physical cores: $PHYS_CORES → P-core threads: $PCORE_THREADS"

	# ── Threading: P-cores only ──
	export OMP_NUM_THREADS=$PCORE_THREADS
	export MKL_NUM_THREADS=$PCORE_THREADS

	# Print the comma-separated CPU list "0,1,...,N-1" for a thread count N ($1).
	# Prints an empty line when N is 0 or empty.
	build_proclist() {
	  local count=$1 list="" cpu
	  for ((cpu = 0; cpu < count; cpu++)); do
	    list+="${list:+,}${cpu}"
	  done
	  printf '%s\n' "$list"
	}

	# The explicit proclist pins one OpenMP thread per listed logical CPU. It is
	# derived from PCORE_THREADS so the affinity list always has as many entries
	# as OMP_NUM_THREADS; for the default of 8 this yields 0,1,2,3,4,5,6,7.
	# Set CHIMERA_PCORE_PROCLIST (e.g. "0,2,4,6") to supply the CPU ids yourself.
	# NOTE(review): this assumes the P-cores are the lowest-numbered logical CPUs;
	# with hyper-threading enabled, adjacent ids may be siblings of one core.
	# Verify with: lscpu -p | head -20
	PCORE_PROCLIST=${CHIMERA_PCORE_PROCLIST:-$(build_proclist "$PCORE_THREADS")}
	export KMP_AFFINITY="granularity=fine,proclist=[${PCORE_PROCLIST}],explicit"
	export KMP_BLOCKTIME=0 # Zero blocktime: yield immediately after parallel region
	# Eliminates busy-wait power waste on 35W TDP chip

	# ── tcmalloc (if available — non-debug only) ──
	# IMPORTANT: tcmalloc_debug / tcmalloc_minimal_debug cause silent aborts.
	# Only load the release version.

	# Read `ldconfig -p`-style lines ("name (arch) => /path/to/lib") from stdin and
	# print the first path that
	#   - has the file name libtcmalloc.so[.N...] or libtcmalloc_minimal.so[.N...],
	#   - contains no "debug" anywhere in the path, and
	#   - exists as a regular file.
	# Prints nothing when no line qualifies. Always returns 0.
	find_release_tcmalloc() {
	  local line path name
	  local name_re='^libtcmalloc(_minimal)?\.so(\.[0-9]+)*$'
	  while IFS= read -r line; do
	    case "$line" in
	      *" => "*) ;;
	      *) continue ;;
	    esac
	    path=${line##* => }
	    name=${path##*/}
	    if [[ "$path" == *debug* ]]; then
	      continue
	    fi
	    if [[ "$name" =~ $name_re ]] && [[ -f "$path" ]]; then
	      printf '%s\n' "$path"
	      return 0
	    fi
	  done
	  return 0
	}

	# ldconfig may be missing, or be cut off once the first match is printed;
	# neither case is an error, so the pipeline status is deliberately ignored.
	TCMALLOC_LIB=$(ldconfig -p 2>/dev/null | find_release_tcmalloc) || true

	if [ -n "$TCMALLOC_LIB" ]; then
	  echo "[TURBO] tcmalloc: $TCMALLOC_LIB"
	  # Prepend, keeping whatever the caller already had in LD_PRELOAD.
	  export LD_PRELOAD="$TCMALLOC_LIB${LD_PRELOAD:+:$LD_PRELOAD}"
	else
	  echo "[TURBO] ⚠ tcmalloc (non-debug) not found — expect 5-8% throughput loss."
	  echo "[TURBO] Install: sudo apt install libgoogle-perftools4"
	fi

	# ── IOMP (Intel OpenMP, if available) ──
	# Look for libiomp5.so one directory above the intel_extension_for_pytorch
	# package. The probe is a single-line Python program on purpose: it cannot be
	# broken by the indentation of this script. It uses importlib.util.find_spec so
	# the package is located without being imported. It prints an empty line when
	# the package is not installed.
	iomp_probe='import importlib.util as u, os; '
	iomp_probe+='s = u.find_spec("intel_extension_for_pytorch"); '
	iomp_probe+='print(os.path.join(os.path.dirname(s.origin), "..", "libiomp5.so") if s and s.origin else "")'
	# A missing python3 or a failing probe just means "no IOMP": best-effort step.
	IOMP_LIB=$(python3 -c "$iomp_probe" 2>/dev/null) || IOMP_LIB=""

	if [ -n "$IOMP_LIB" ] && [ -f "$IOMP_LIB" ]; then
	  echo "[TURBO] libiomp5: $IOMP_LIB"
	  # Prepend, keeping whatever is already in LD_PRELOAD.
	  export LD_PRELOAD="$IOMP_LIB${LD_PRELOAD:+:$LD_PRELOAD}"
	else
	  # Do not leave a path to a non-existent library behind.
	  IOMP_LIB=""
	fi

	# ── NUMA pinning (if numactl available) ──
	# The command line is built as an array so every argument, including the
	# user's own "$@", reaches python3 exactly as given (no re-splitting/globbing).
	NUMA_PREFIX=""
	launch_cmd=(python3 train_hyper.py "$@")
	if command -v numactl >/dev/null 2>&1; then
	  # Probe the binding with a no-op first: numactl can be installed yet unable
	  # to bind, and that must degrade to an unpinned run rather than kill the launch.
	  if numactl --cpunodebind=0 --membind=0 true >/dev/null 2>&1; then
	    echo "[TURBO] NUMA: pinning to node 0"
	    NUMA_PREFIX="numactl --cpunodebind=0 --membind=0"
	    launch_cmd=(numactl --cpunodebind=0 --membind=0 "${launch_cmd[@]}")
	  else
	    echo "[TURBO] ⚠ numactl found but binding to node 0 failed — launching without NUMA pinning."
	  fi
	fi

	# ── Launch ──
	echo "[TURBO] Launching: python3 train_hyper.py $*"
	echo "═══════════════════════════════════════════════════"

	# train_hyper.py is resolved relative to the current working directory.
	"${launch_cmd[@]}"