Spaces:
Runtime error
feat(burst-but-dont-die): adaptive auto-scaler + tighter discoverer cycle
Browse filesUser: 'burst ΰΉΰΈ₯ΰΈ’ ΰΉΰΈΰΉΰΈ«ΰΉΰΈ²ΰΈ‘ΰΈΰΈ²ΰΈ’'
bin/v2/auto-scaler.sh β adaptive worker scaling by memory tier:
β₯10GB free β BURST: 4 streaming + 1 bulk (max throughput)
β₯6GB free β MID: 3 streaming
β₯4GB free β SAFE: 2 streaming
β₯3GB free β MIN: 1 streaming
<3GB free β CRISIS: kill all workers (let cron finish)
Cron M%5==4 every 5 min: read /proc/meminfo MemAvailable, count active
workers via pgrep, compute target, spawn or kill the diff.
Discord notify on tier transition (BURST/MID/SAFE/MIN/CRISIS).
continuous-discoverer:
per-step sleep: 8s β 4s
cycle gap: 30s β 10s
β ~2x faster discovery cadence (was ~5min cycle, now ~2.5min)
Net effect on cpu-basic 16GB:
β’ Memory guard (d8d7a71) prevents heavy cron tasks from OOM
β’ Auto-scaler (this commit) maxes worker throughput when safe
β’ Stagger fix (3234167) prevents minute=0 collision
β’ 3 layers of protection + adaptive scaling = burst-but-dont-die
Expected: in BURST tier (10GB free, no cron tasks running) we run
4 streamers + 1 bulk = ~5x current 1-streamer baseline. When cron
tasks fire and consume memory, scaler drops to MIN/CRISIS and pauses
workers; tasks complete; scaler ramps back up. No more OOM crashes.
- bin/v2/auto-scaler.sh +107 -0
- bin/v2/continuous-discoverer.sh +2 -2
- start.sh +2 -0
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
# Surrogate-1 v2 β adaptive worker auto-scaler.
|
| 3 |
+
#
|
| 4 |
+
# User: 'burst ΰΉΰΈ₯ΰΈ’ ΰΉΰΈΰΉΰΈ«ΰΉΰΈ²ΰΈ‘ΰΈΰΈ²ΰΈ’'
|
| 5 |
+
#
|
| 6 |
+
# Strategy: monitor MemAvailable every minute. Spawn streaming workers UP
|
| 7 |
+
# when memory is plentiful (burst mode); kill workers when memory tight.
|
| 8 |
+
# No fixed BULK/STREAM_WORKERS β fully dynamic.
|
| 9 |
+
#
|
| 10 |
+
# Memory tiers (cpu-basic 16 GB Space):
|
| 11 |
+
# β₯10 GB free β BURST: target 4 streaming + 1 bulk worker (max)
|
| 12 |
+
# β₯6 GB free β MID: target 3 streaming
|
| 13 |
+
# β₯4 GB free β SAFE: target 2 streaming
|
| 14 |
+
# β₯3 GB free β MIN: target 1 streaming (current default)
|
| 15 |
+
# <3 GB free β CRISIS: kill all workers, let cron tasks finish
|
| 16 |
+
#
|
| 17 |
+
# Each tick: read avail mem, count current workers, spawn/kill the diff.
|
| 18 |
+
# Kill order: bulk first (heaviest), then youngest streaming.
|
| 19 |
+
# Spawn always streaming first (lightest).
|
| 20 |
+
#
|
| 21 |
+
# Cron: M%5 (every 5 min, no minute=0 collide).
|
| 22 |
+
set -uo pipefail
|
| 23 |
+
[[ -f "$HOME/.hermes/.env" ]] && { set -a; source "$HOME/.hermes/.env" 2>/dev/null; set +a; }
|
| 24 |
+
|
| 25 |
+
LOG="$HOME/.surrogate/logs/auto-scaler.log"
|
| 26 |
+
mkdir -p "$(dirname "$LOG")"
|
| 27 |
+
|
| 28 |
+
# Read MemAvailable
|
| 29 |
+
if [[ -r /proc/meminfo ]]; then
|
| 30 |
+
AVAIL_MB=$(awk '/^MemAvailable:/{print int($2/1024)}' /proc/meminfo)
|
| 31 |
+
else
|
| 32 |
+
AVAIL_MB=99999
|
| 33 |
+
fi
|
| 34 |
+
|
| 35 |
+
# Count current workers
|
| 36 |
+
N_BULK=$(pgrep -cf "bulk-mirror-worker.sh" 2>/dev/null || echo 0)
|
| 37 |
+
N_STREAM=$(pgrep -cf "streaming-mirror-worker.sh" 2>/dev/null || echo 0)
|
| 38 |
+
|
| 39 |
+
# Determine target by memory tier
|
| 40 |
+
if (( AVAIL_MB >= 10000 )); then T_STREAM=4; T_BULK=1; TIER="BURST"
|
| 41 |
+
elif (( AVAIL_MB >= 6000 )); then T_STREAM=3; T_BULK=0; TIER="MID"
|
| 42 |
+
elif (( AVAIL_MB >= 4000 )); then T_STREAM=2; T_BULK=0; TIER="SAFE"
|
| 43 |
+
elif (( AVAIL_MB >= 3000 )); then T_STREAM=1; T_BULK=0; TIER="MIN"
|
| 44 |
+
else T_STREAM=0; T_BULK=0; TIER="CRISIS"
|
| 45 |
+
fi
|
| 46 |
+
|
| 47 |
+
ACTION=""
|
| 48 |
+
|
| 49 |
+
# CRISIS: kill everything
|
| 50 |
+
if [[ "$TIER" == "CRISIS" ]]; then
|
| 51 |
+
pkill -f "bulk-mirror-worker.sh" 2>/dev/null && ACTION="${ACTION}killed-bulk "
|
| 52 |
+
pkill -f "streaming-mirror-worker.sh" 2>/dev/null && ACTION="${ACTION}killed-stream "
|
| 53 |
+
fi
|
| 54 |
+
|
| 55 |
+
# Spawn streaming up to target
|
| 56 |
+
DIFF=$((T_STREAM - N_STREAM))
|
| 57 |
+
if (( DIFF > 0 )); then
|
| 58 |
+
for _ in $(seq 1 "$DIFF"); do
|
| 59 |
+
wid="autoscale-stream-$(date +%s)-$$-${RANDOM}"
|
| 60 |
+
nohup bash "$HOME/.surrogate/bin/v2/streaming-mirror-worker.sh" "$wid" \
|
| 61 |
+
> "$HOME/.surrogate/logs/stream-$wid.log" 2>&1 &
|
| 62 |
+
ACTION="${ACTION}+stream "
|
| 63 |
+
done
|
| 64 |
+
elif (( DIFF < 0 )); then
|
| 65 |
+
# Kill youngest streaming first
|
| 66 |
+
KILLN=$(( -DIFF ))
|
| 67 |
+
pgrep -f "streaming-mirror-worker.sh" | tail -"$KILLN" | xargs -r kill 2>/dev/null
|
| 68 |
+
ACTION="${ACTION}-${KILLN}stream "
|
| 69 |
+
fi
|
| 70 |
+
|
| 71 |
+
# Spawn bulk up to target
|
| 72 |
+
DIFF_B=$((T_BULK - N_BULK))
|
| 73 |
+
if (( DIFF_B > 0 )); then
|
| 74 |
+
for _ in $(seq 1 "$DIFF_B"); do
|
| 75 |
+
wid="autoscale-bulk-$(date +%s)-$$-${RANDOM}"
|
| 76 |
+
nohup bash "$HOME/.surrogate/bin/v2/bulk-mirror-worker.sh" "$wid" \
|
| 77 |
+
> "$HOME/.surrogate/logs/bulk-$wid.log" 2>&1 &
|
| 78 |
+
ACTION="${ACTION}+bulk "
|
| 79 |
+
done
|
| 80 |
+
elif (( DIFF_B < 0 )); then
|
| 81 |
+
KILLN=$(( -DIFF_B ))
|
| 82 |
+
pgrep -f "bulk-mirror-worker.sh" | tail -"$KILLN" | xargs -r kill 2>/dev/null
|
| 83 |
+
ACTION="${ACTION}-${KILLN}bulk "
|
| 84 |
+
fi
|
| 85 |
+
|
| 86 |
+
[[ -z "$ACTION" ]] && ACTION="steady"
|
| 87 |
+
|
| 88 |
+
echo "[$(date '+%H:%M:%S')] tier=$TIER avail=${AVAIL_MB}MB stream=$N_STREAM/$T_STREAM bulk=$N_BULK/$T_BULK action=$ACTION" >> "$LOG"
|
| 89 |
+
|
| 90 |
+
# Discord notify on tier change (compare to last)
|
| 91 |
+
LAST_TIER_FILE="$HOME/.surrogate/logs/.auto-scaler-last-tier"
|
| 92 |
+
LAST_TIER=$(cat "$LAST_TIER_FILE" 2>/dev/null || echo "")
|
| 93 |
+
if [[ "$TIER" != "$LAST_TIER" ]]; then
|
| 94 |
+
echo "$TIER" > "$LAST_TIER_FILE"
|
| 95 |
+
if [[ -n "${DISCORD_WEBHOOK:-}" ]]; then
|
| 96 |
+
case "$TIER" in
|
| 97 |
+
BURST) emoji="π" ;;
|
| 98 |
+
MID) emoji="β‘" ;;
|
| 99 |
+
SAFE) emoji="β
" ;;
|
| 100 |
+
MIN) emoji="β οΈ" ;;
|
| 101 |
+
CRISIS) emoji="π΄" ;;
|
| 102 |
+
esac
|
| 103 |
+
curl -s -X POST -H "Content-Type: application/json" \
|
| 104 |
+
-d "{\"content\":\"$emoji auto-scaler: tier $LAST_TIER β **$TIER** (avail ${AVAIL_MB}MB) β stream=$T_STREAM bulk=$T_BULK\"}" \
|
| 105 |
+
"$DISCORD_WEBHOOK" >/dev/null 2>&1 || true
|
| 106 |
+
fi
|
| 107 |
+
fi
|
|
@@ -198,7 +198,7 @@ while true; do
|
|
| 198 |
CYCLE=$((CYCLE+1))
|
| 199 |
for fn in "${SOURCE_FNS[@]}"; do
|
| 200 |
$fn 2>>"$LOG" || true
|
| 201 |
-
sleep
|
| 202 |
done
|
| 203 |
|
| 204 |
# Re-seed coordinator every 10 cycles (~50 min) so newly-added rows enter queue
|
|
@@ -209,5 +209,5 @@ while true; do
|
|
| 209 |
notify "cycle $CYCLE done β queue total $TOTAL sources"
|
| 210 |
fi
|
| 211 |
|
| 212 |
-
sleep
|
| 213 |
done
|
|
|
|
| 198 |
CYCLE=$((CYCLE+1))
|
| 199 |
for fn in "${SOURCE_FNS[@]}"; do
|
| 200 |
$fn 2>>"$LOG" || true
|
| 201 |
+
sleep 4 # tighter β burst mode (was 8s)
|
| 202 |
done
|
| 203 |
|
| 204 |
# Re-seed coordinator every 10 cycles (~50 min) so newly-added rows enter queue
|
|
|
|
| 209 |
notify "cycle $CYCLE done β queue total $TOTAL sources"
|
| 210 |
fi
|
| 211 |
|
| 212 |
+
sleep 10 # tighter cycle gap β burst mode (was 30s)
|
| 213 |
done
|
|
@@ -397,6 +397,8 @@ while true; do
|
|
| 397 |
# Each major task picks a unique M%X==N offset so no two fire together.
|
| 398 |
[[ $((M % 2)) -eq 1 ]] && bash ~/.surrogate/bin/surrogate-dev-loop.sh 1 >> "$LOG" 2>&1 &
|
| 399 |
[[ $((M % 5)) -eq 2 ]] && bash ~/.surrogate/bin/work-queue-producer.sh >> "$LOG" 2>&1 &
|
|
|
|
|
|
|
| 400 |
# push-training-to-hf gated by memory (loads big shard into RAM).
|
| 401 |
# Anchor (24GB) takes over when capacity arrives β see anchor cron-loop.
|
| 402 |
[[ $((M % 3)) -eq 1 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
|
|
|
|
| 397 |
# Each major task picks a unique M%X==N offset so no two fire together.
|
| 398 |
[[ $((M % 2)) -eq 1 ]] && bash ~/.surrogate/bin/surrogate-dev-loop.sh 1 >> "$LOG" 2>&1 &
|
| 399 |
[[ $((M % 5)) -eq 2 ]] && bash ~/.surrogate/bin/work-queue-producer.sh >> "$LOG" 2>&1 &
|
| 400 |
+
# Auto-scaler β spawn/kill workers based on free memory tier (burst-but-don't-die)
|
| 401 |
+
[[ $((M % 5)) -eq 4 ]] && bash ~/.surrogate/bin/v2/auto-scaler.sh >> "$LOG" 2>&1 &
|
| 402 |
# push-training-to-hf gated by memory (loads big shard into RAM).
|
| 403 |
# Anchor (24GB) takes over when capacity arrives β see anchor cron-loop.
|
| 404 |
[[ $((M % 3)) -eq 1 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
|