File size: 1,064 Bytes
a896ecd | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 | #!/usr/bin/env bash
set -euo pipefail

# Print a status summary for a single run directory.
# Usage: bash scripts/status_run.sh runs/qwen3-8b-qlora-YYYYMMDD-HHMMSS

# The run directory is the one required positional argument.
if (( $# < 1 )); then
  printf 'Usage: %s <RUN_DIR>\n' "$0" >&2
  exit 1
fi

RUN_DIR="$1"
printf 'RUN_DIR=%s\n' "$RUN_DIR"

# Show the recorded run identifier when RUN_ID.txt is present.
if [[ -f "$RUN_DIR/RUN_ID.txt" ]]; then
  printf 'RUN_ID=%s\n' "$(cat "$RUN_DIR/RUN_ID.txt")"
fi
#######################################
# Succeed only when the argument is a positive integer PID of a live process.
# Arguments: $1 - candidate PID string (may be empty or malformed)
# Returns:   0 if the process exists, non-zero otherwise
#######################################
pid_is_alive() {
  local pid=$1
  # Reject empty, non-numeric, and zero values up front: `kill -0 0` and
  # negative arguments address process groups and would succeed regardless
  # of whether the recorded job is alive.
  [[ "$pid" =~ ^0*[1-9][0-9]*$ ]] || return 1
  # `kill -0` also fails (EPERM) for a live process owned by another user,
  # so fall back to `ps -p`, which only checks for existence. If ps is not
  # installed the fallback simply fails and the PID is reported as dead.
  kill -0 "$pid" 2>/dev/null || ps -p "$pid" >/dev/null 2>&1
}

#######################################
# Print one "<file> PID=<pid> running|not running" line for each PID file
# present in a run directory. Missing PID files are skipped silently.
# Arguments: $1 - run directory
# Outputs:   status lines on stdout
#######################################
report_pid_files() {
  local run_dir=$1
  local name path pid
  for name in TRAIN_PID.txt TRAIN_RESUME_PID.txt EVAL_PID.txt; do
    path="$run_dir/$name"
    [[ -f "$path" ]] || continue
    pid=""
    # Take the first whitespace-delimited token. An empty or unreadable file
    # leaves pid empty instead of aborting the whole report under `set -e`.
    { read -r pid _ < "$path"; } 2>/dev/null || true
    # Tolerate PID files written with CRLF line endings.
    pid=${pid%$'\r'}
    if pid_is_alive "$pid"; then
      echo "$name PID=$pid running"
    else
      echo "$name PID=$pid not running"
    fi
  done
}

report_pid_files "$RUN_DIR"
printf '\n'

# List the last five checkpoint-* directories (version-sort order) that sit
# directly under the adapter output directory.
adapter_dir="$RUN_DIR/outputs/adapter"
if [[ -d "$adapter_dir" ]]; then
  printf '%s\n' "Latest checkpoints:"
  # `|| true`: a failing stage must not abort the script, which runs with
  # `set -e` and `pipefail`.
  find "$adapter_dir" -maxdepth 1 -type d -name 'checkpoint-*' \
    | sort -V \
    | tail -n 5 \
    || true
fi
printf '\n'

# When at least one *.log file can be listed, show the files and then the
# last 20 lines of each.
log_dir="$RUN_DIR/logs"
if ls "$log_dir"/*.log &>/dev/null; then
  printf '%s\n' "Logs:"
  ls -lh "$log_dir"/*.log
  printf '\n%s\n' "Last train/eval lines:"
  tail -n 20 "$log_dir"/*.log
fi
printf '\n'

# Point at the aggregated eval metrics file when it exists.
metrics_file="$RUN_DIR/eval/all_metrics.json"
if [[ -f "$metrics_file" ]]; then
  printf 'Eval metrics: %s\n' "$metrics_file"
fi
|