#!/bin/bash
# ============================================================
# QR-SPPS NB-5 Part A: MPI State-Vector Benchmark (29q + 30q)
# ============================================================
# Run from: ~/QARPdemo
#   cd ~/QARPdemo && sbatch run_nb5_30q.sh
#   tail -f nb5_30q_output.log
#
# Measures actual 29q and 30q state-vector evaluation times on
# Fujitsu A64FX using MPI-distributed state-vector simulation.
# These are the REAL hardware measurements that ground the
# exponential scaling law (R^2 = 0.9948, doubling rate = 1.1993/qubit).
#
# MPI layout (2 active ranks):
#   rank 0 -> 29q (SV =  8,590 MB, measured  ~595s)
#   rank 1 -> 30q (SV = 17,180 MB, measured ~1192s)  <- PHYSICAL CEILING
#
# Memory note (why 2 nodes suffice):
#   30q state-vector = 2^30 amplitudes x 16 B (complex128)
#                    = 17,179,869,184 B = 17.2 GB raw.
#   + MPI overhead + 40-node observable = ~20-24 GB total per rank.
#   Each A64FX node has 28.9 GB free RAM, so one heavy rank per node
#   fits comfortably and 2 nodes are sufficient.
#   (The 4-node allocation below adds topology stability.)
#
# IMPORTANT: mpi4py IS imported in QRSPPS_NB5_measure30q.py intentionally.
# This is a standalone sbatch script - NOT run from inside Jupyter.
# (Importing mpi4py in Jupyter on a compute node crashes the kernel.)
#
# Depends on: nothing (standalone benchmark - does not need prior pkls)
# Produces:   QRSPPS_mpi_scaling.pkl (saved to ~/QARPdemo/)
#
# After this completes, run:
#   cd ~/QARPdemo && sbatch run_nb5_final.sh
#
# Runtime: ~35 min (29q ~595s + 30q ~1192s + overhead)
# Wall time set to 12:00:00 for safety (interactive partition limit)
# ============================================================
#SBATCH --job-name=qrspps_nb5_30q
#SBATCH --nodes=4
#SBATCH --ntasks-per-node=12
#SBATCH --cpus-per-task=4
#SBATCH --partition=Interactive
#SBATCH --time=12:00:00
#SBATCH --output=nb5_30q_output.log

source ~/QARPdemo/setup_env.sh

# NOTE: do NOT set QARP_DISABLE_MPI here.
# QRSPPS_NB5_measure30q.py uses mpi4py intentionally.
export OMP_NUM_THREADS=4

echo "================================================================"
echo " QR-SPPS NB-5A: MPI 29q + 30q State-Vector Benchmark"
echo "================================================================"
echo " Start : $(date)"
echo " Node  : $(hostname)"
echo " Job   : $SLURM_JOB_ID"
echo " Dir   : $(pwd)"
echo " Nodes : $SLURM_JOB_NUM_NODES"
echo " Tasks : $SLURM_NTASKS"
echo ""
echo " rank 0 -> 29q (SV =  8,590 MB,  ~595s)"
echo " rank 1 -> 30q (SV = 17,180 MB, ~1192s) <- physical memory ceiling"
echo ""
echo " 30q state-vector = 17.2 GB + MPI overhead = ~20-24 GB"
echo " A64FX free RAM per node = 28.9 GB: fits comfortably"
echo "================================================================"
echo ""

echo "=== Starting QRSPPS_NB5_measure30q.py via srun ==="
# Launch exactly the 2 active ranks, one per node, so the 8.6 GB (29q)
# and ~20-24 GB (30q) state-vectors never share one 28.9 GB node.
srun --ntasks=2 --ntasks-per-node=1 python3 QRSPPS_NB5_measure30q.py
EXIT=$?
echo ""
echo "=== srun exit: $EXIT ($(date)) ==="

# Verify output (saved to ~/QARPdemo/QRSPPS_mpi_scaling.pkl by the script)
MPI_PKL="$HOME/QARPdemo/QRSPPS_mpi_scaling.pkl"
if [ -f "$MPI_PKL" ]; then
    echo "Output: QRSPPS_mpi_scaling.pkl ($(du -h "$MPI_PKL" | cut -f1)) OK"
else
    echo "WARNING: QRSPPS_mpi_scaling.pkl not found at $MPI_PKL"
    echo "Check nb5_30q_output.log for errors."
    exit 1
fi

echo ""
echo "================================================================"
echo " NB-5A DONE | End: $(date)"
echo "================================================================"
echo ""
echo "Next step:"
echo "  cd ~/QARPdemo && sbatch run_nb5_final.sh"

exit $EXIT
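
# ------------------------------------------------------------
# Optional manual sanity check for the output pickle (never executed
# here; it sits after the final exit). A minimal sketch: it assumes
# only that QRSPPS_mpi_scaling.pkl is a standard pickle. The key
# layout is NOT confirmed by QRSPPS_NB5_measure30q.py, so it just
# prints whatever the file contains.
#
#   python3 - "$HOME/QARPdemo/QRSPPS_mpi_scaling.pkl" <<'EOF'
#   import pickle, sys
#   with open(sys.argv[1], "rb") as f:   # path arrives as argv[1]
#       data = pickle.load(f)
#   print(type(data))                    # container type
#   if isinstance(data, dict):
#       for k, v in data.items():        # e.g. per-qubit timings
#           print(k, "->", v)
#   EOF
# ------------------------------------------------------------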
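
# ------------------------------------------------------------
# For reference: the rank-dispatch pattern the header describes
# (rank 0 -> 29q, rank 1 -> 30q, other ranks idle). This is NOT the
# actual contents of QRSPPS_NB5_measure30q.py -- only a hedged sketch.
# The qubit assignments come from the layout above; everything else
# (names, the closing barrier) is illustrative.
#
#   from mpi4py import MPI          # the import that makes this sbatch-only
#   comm = MPI.COMM_WORLD
#   rank = comm.Get_rank()
#   jobs = {0: 29, 1: 30}           # rank -> qubit count
#   if rank in jobs:
#       n_qubits = jobs[rank]
#       # ... build and time the n_qubits state-vector evaluation ...
#   comm.Barrier()                  # idle ranks wait for ranks 0 and 1
# ------------------------------------------------------------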