#!/bin/bash
# launch_turbo.sh — Launch ch1mera with all CPU optimizations
#
# First run: chmod +x launch_turbo.sh
# Usage:     ./launch_turbo.sh [train_hyper.py args...]
# Example:   ./launch_turbo.sh --growlength --reservoir
# Full:      ./launch_turbo.sh --scale tiny --seq_len 128 --max_steps 300 --batch_size 32 --growlength --reservoir

set -euo pipefail
trap 'echo "[TURBO] ✗ Failed at line $LINENO (exit $?)" >&2' ERR

# ── Detect core topology ──
# On Raptor Lake i7-14700T: 8 P-cores (0-7) + 12 E-cores (8-19)
# P-cores have AVX-512 BF16; E-cores do NOT.
# Mixing P+E in OpenMP parallel regions causes barrier stalls:
# P-cores finish tiles in X cycles, E-cores in ~2.5X → P-cores idle.
# Solution: pin to P-cores only.
#
# The count is informational only; guard against a missing lscpu (minimal
# containers) so set -e / pipefail doesn't abort the whole launch over it.
if command -v lscpu >/dev/null 2>&1; then
  PHYS_CORES=$(lscpu -p | grep -v '^#' | sort -t, -k 2 -un | wc -l)
else
  PHYS_CORES=0
fi
PCORE_THREADS=${CHIMERA_PCORE_THREADS:-8}   # Override for non-hybrid CPUs
echo "[TURBO] Physical cores: $PHYS_CORES → P-core threads: $PCORE_THREADS"

# ── Threading: P-cores only ──
export OMP_NUM_THREADS=$PCORE_THREADS
export MKL_NUM_THREADS=$PCORE_THREADS
# Explicit proclist pins threads 0-7 to physical P-cores.
# On Raptor Lake, P-cores are typically CPU 0-7 in lscpu topology.
# Verify with: lscpu -p | head -20
export KMP_AFFINITY="granularity=fine,proclist=[0,1,2,3,4,5,6,7],explicit"
export KMP_BLOCKTIME=0  # Zero blocktime: yield immediately after parallel region
                        # Eliminates busy-wait power waste on 35W TDP chip

# ── tcmalloc (if available — non-debug only) ──
# IMPORTANT: tcmalloc_debug / tcmalloc_minimal_debug cause silent aborts.
# Only load the release version.
# Process substitution (not a pipe) keeps the TCMALLOC_LIB assignment in the
# current shell; read -r avoids word-splitting on candidate paths.
TCMALLOC_LIB=""
while IFS= read -r candidate; do
  if [ -f "$candidate" ]; then
    TCMALLOC_LIB="$candidate"
    break
  fi
done < <(ldconfig -p 2>/dev/null | grep -oP '/\S*libtcmalloc(|_minimal)\.so\S*' | grep -v debug || true)
if [ -n "$TCMALLOC_LIB" ]; then
  echo "[TURBO] tcmalloc: $TCMALLOC_LIB"
  export LD_PRELOAD="$TCMALLOC_LIB${LD_PRELOAD:+:$LD_PRELOAD}"
else
  echo "[TURBO] ⚠ tcmalloc (non-debug) not found — expect 5-8% throughput loss."
  echo "[TURBO] Install: sudo apt install libgoogle-perftools4"
fi

# ── IOMP (Intel OpenMP, if available) ──
# Locate libiomp5.so next to the IPEX package; failure is non-fatal.
IOMP_LIB=""
IOMP_LIB=$(python3 -c "
import intel_extension_for_pytorch, os
print(os.path.join(os.path.dirname(intel_extension_for_pytorch.__file__), '..', 'libiomp5.so'))
" 2>/dev/null) || true
if [ -n "$IOMP_LIB" ] && [ -f "$IOMP_LIB" ]; then
  echo "[TURBO] libiomp5: $IOMP_LIB"
  export LD_PRELOAD="$IOMP_LIB${LD_PRELOAD:+:$LD_PRELOAD}"
fi

# ── NUMA pinning (if numactl available) ──
# Built as an array (not a word-split string) so the launch line stays
# correctly quoted; the ${arr[@]+...} guard is empty-array-safe under set -u
# on bash < 4.4.
NUMA_PREFIX=()
if command -v numactl &>/dev/null; then
  echo "[TURBO] NUMA: pinning to node 0"
  NUMA_PREFIX=(numactl --cpunodebind=0 --membind=0)
fi

# ── Launch ──
echo "[TURBO] Launching: python3 train_hyper.py $*"
echo "═══════════════════════════════════════════════════"
# exec replaces this wrapper shell so signals (SIGINT/SIGTERM) reach the
# trainer directly and no stale bash process lingers.
exec ${NUMA_PREFIX[@]+"${NUMA_PREFIX[@]}"} python3 train_hyper.py "$@"