PEFT
qlora
sft
trl
qwen3
tmf921
intent-based-networking
network-slicing
rtx-6000-ada
ml-intern
nraptisss's picture
Add nohup run management and resumable checkpoint support
a896ecd verified
#!/usr/bin/env bash
# Show a status summary for a run directory.
#
# Usage: bash scripts/status_run.sh runs/qwen3-8b-qlora-YYYYMMDD-HHMMSS
#
# Prints, in order:
#   - RUN_DIR and, when RUN_ID.txt exists, its contents as RUN_ID;
#   - for each PID file present (TRAIN_PID.txt, TRAIN_RESUME_PID.txt,
#     EVAL_PID.txt), whether the recorded PID answers to signal 0;
#   - the last five checkpoint-* directories (version-sort order) found
#     directly under outputs/adapter;
#   - an `ls -lh` listing and the last 20 lines of every logs/*.log file;
#   - the path of eval/all_metrics.json when that file exists.
#
# Exit status: 0 on success; 1 when the argument is missing or RUN_DIR is
# not a directory.
set -euo pipefail

#######################################
# Report whether the process recorded in a PID file is alive.
# Arguments: $1 - run directory
#            $2 - PID file name relative to the run directory
# Outputs:   one status line on stdout; nothing if the file is absent
# Returns:   0 always (reporting is best-effort)
#######################################
report_pid_file() {
  local run_dir=$1 name=$2
  local pid_file="$run_dir/$name"
  local pid=""

  [[ -f "$pid_file" ]] || return 0

  # Read only the first word of the first line. `|| true` keeps `set -e`
  # from aborting on an empty, unterminated, or unreadable file.
  read -r pid _ <"$pid_file" || true
  pid=${pid%$'\r'} # tolerate a PID file written with CRLF line endings

  # Only a positive integer may reach `kill`: 0 and negative values address
  # process groups, so `kill -0` could succeed and falsely report "running".
  if [[ ! "$pid" =~ ^[1-9][0-9]*$ ]]; then
    echo "$name PID=$pid not running (invalid PID file)"
  elif kill -0 "$pid" 2>/dev/null; then
    echo "$name PID=$pid running"
  else
    echo "$name PID=$pid not running"
  fi
}

#######################################
# Entry point: validate arguments and print every status section.
# Arguments: $1 - run directory
# Outputs:   status report on stdout; usage/errors on stderr
#######################################
main() {
  if (($# < 1)); then
    echo "Usage: $0 <RUN_DIR>" >&2
    exit 1
  fi

  local run_dir=$1
  local pid_name
  local -a log_files=()

  # Fail loudly on a mistyped path instead of printing an empty report that
  # is indistinguishable from an idle run.
  if [[ ! -d "$run_dir" ]]; then
    echo "Error: run directory not found: $run_dir" >&2
    exit 1
  fi

  echo "RUN_DIR=$run_dir"
  if [[ -f "$run_dir/RUN_ID.txt" ]]; then
    echo "RUN_ID=$(cat -- "$run_dir/RUN_ID.txt")"
  fi

  for pid_name in TRAIN_PID.txt TRAIN_RESUME_PID.txt EVAL_PID.txt; do
    report_pid_file "$run_dir" "$pid_name"
  done

  echo
  if [[ -d "$run_dir/outputs/adapter" ]]; then
    echo "Latest checkpoints:"
    # Version sort orders checkpoint-1000 after checkpoint-900. The listing
    # is best-effort, so a pipeline failure must not abort under pipefail.
    find "$run_dir/outputs/adapter" -maxdepth 1 -type d -name "checkpoint-*" \
      | sort -V | tail -n 5 || true
  fi

  echo
  # Expand the glob into an array; nullglob makes "no logs" an empty array
  # rather than the literal pattern.
  shopt -s nullglob
  log_files=("$run_dir"/logs/*.log)
  shopt -u nullglob

  if ((${#log_files[@]} > 0)); then
    echo "Logs:"
    # Best-effort: one unreadable log must not hide the remaining sections.
    ls -lh -- "${log_files[@]}" || true
    echo
    echo "Last train/eval lines:"
    tail -n 20 -- "${log_files[@]}" || true
  fi

  echo
  if [[ -f "$run_dir/eval/all_metrics.json" ]]; then
    echo "Eval metrics: $run_dir/eval/all_metrics.json"
  fi
}

main "$@"