Ashira Pitchayapakayakul commited on
Commit
ff2fbf3
Β·
1 Parent(s): d8d7a71

feat(burst-but-dont-die): adaptive auto-scaler + tighter discoverer cycle

Browse files

User: 'burst ΰΉ€ΰΈ₯ΰΈ’ ΰΉΰΈ•ΰΉˆΰΈ«ΰΉ‰ΰΈ²ΰΈ‘ΰΈ•ΰΈ²ΰΈ’'

bin/v2/auto-scaler.sh β€” adaptive worker scaling by memory tier:
β‰₯10GB free β†’ BURST: 4 streaming + 1 bulk (max throughput)
β‰₯6GB free β†’ MID: 3 streaming
β‰₯4GB free β†’ SAFE: 2 streaming
β‰₯3GB free β†’ MIN: 1 streaming
<3GB free β†’ CRISIS: kill all workers (let cron finish)

Cron M%5==4 every 5 min: read /proc/meminfo MemAvailable, count active
workers via pgrep, compute target, spawn or kill the diff.

Discord notify on tier transition (BURST/MID/SAFE/MIN/CRISIS).

continuous-discoverer:
per-step sleep: 8s β†’ 4s
cycle gap: 30s β†’ 10s
β†’ ~2x faster discovery cadence (was ~5min cycle, now ~2.5min)

Net effect on cpu-basic 16GB:
β€’ Memory guard (d8d7a71) prevents heavy cron tasks from OOM
β€’ Auto-scaler (this commit) maxes worker throughput when safe
β€’ Stagger fix (3234167) prevents minute=0 collision
β€’ 3 layers of protection + adaptive scaling = burst-but-dont-die

Expected: in BURST tier (10GB free, no cron tasks running) we run
4 streamers + 1 bulk = ~5x current 1-streamer baseline. When cron
tasks fire and consume memory, scaler drops to MIN/CRISIS and pauses
workers; tasks complete; scaler ramps back up. No more OOM crashes.

bin/v2/auto-scaler.sh ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # Surrogate-1 v2 β€” adaptive worker auto-scaler.
3
+ #
4
+ # User: 'burst ΰΉ€ΰΈ₯ΰΈ’ ΰΉΰΈ•ΰΉˆΰΈ«ΰΉ‰ΰΈ²ΰΈ‘ΰΈ•ΰΈ²ΰΈ’'
5
+ #
6
+ # Strategy: monitor MemAvailable every minute. Spawn streaming workers UP
7
+ # when memory is plentiful (burst mode); kill workers when memory tight.
8
+ # No fixed BULK/STREAM_WORKERS β€” fully dynamic.
9
+ #
10
+ # Memory tiers (cpu-basic 16 GB Space):
11
+ # β‰₯10 GB free β†’ BURST: target 4 streaming + 1 bulk worker (max)
12
+ # β‰₯6 GB free β†’ MID: target 3 streaming
13
+ # β‰₯4 GB free β†’ SAFE: target 2 streaming
14
+ # β‰₯3 GB free β†’ MIN: target 1 streaming (current default)
15
+ # <3 GB free β†’ CRISIS: kill all workers, let cron tasks finish
16
+ #
17
+ # Each tick: read avail mem, count current workers, spawn/kill the diff.
18
+ # Kill order: bulk first (heaviest), then youngest streaming.
19
+ # Spawn always streaming first (lightest).
20
+ #
21
+ # Cron: M%5 (every 5 min, no minute=0 collide).
22
+ set -uo pipefail
23
+ [[ -f "$HOME/.hermes/.env" ]] && { set -a; source "$HOME/.hermes/.env" 2>/dev/null; set +a; }
24
+
25
+ LOG="$HOME/.surrogate/logs/auto-scaler.log"
26
+ mkdir -p "$(dirname "$LOG")"
27
+
28
+ # Read MemAvailable
29
+ if [[ -r /proc/meminfo ]]; then
30
+ AVAIL_MB=$(awk '/^MemAvailable:/{print int($2/1024)}' /proc/meminfo)
31
+ else
32
+ AVAIL_MB=99999
33
+ fi
34
+
35
+ # Count current workers
36
+ N_BULK=$(pgrep -cf "bulk-mirror-worker.sh" 2>/dev/null || echo 0)
37
+ N_STREAM=$(pgrep -cf "streaming-mirror-worker.sh" 2>/dev/null || echo 0)
38
+
39
+ # Determine target by memory tier
40
+ if (( AVAIL_MB >= 10000 )); then T_STREAM=4; T_BULK=1; TIER="BURST"
41
+ elif (( AVAIL_MB >= 6000 )); then T_STREAM=3; T_BULK=0; TIER="MID"
42
+ elif (( AVAIL_MB >= 4000 )); then T_STREAM=2; T_BULK=0; TIER="SAFE"
43
+ elif (( AVAIL_MB >= 3000 )); then T_STREAM=1; T_BULK=0; TIER="MIN"
44
+ else T_STREAM=0; T_BULK=0; TIER="CRISIS"
45
+ fi
46
+
47
+ ACTION=""
48
+
49
+ # CRISIS: kill everything
50
+ if [[ "$TIER" == "CRISIS" ]]; then
51
+ pkill -f "bulk-mirror-worker.sh" 2>/dev/null && ACTION="${ACTION}killed-bulk "
52
+ pkill -f "streaming-mirror-worker.sh" 2>/dev/null && ACTION="${ACTION}killed-stream "
53
+ fi
54
+
55
+ # Spawn streaming up to target
56
+ DIFF=$((T_STREAM - N_STREAM))
57
+ if (( DIFF > 0 )); then
58
+ for _ in $(seq 1 "$DIFF"); do
59
+ wid="autoscale-stream-$(date +%s)-$$-${RANDOM}"
60
+ nohup bash "$HOME/.surrogate/bin/v2/streaming-mirror-worker.sh" "$wid" \
61
+ > "$HOME/.surrogate/logs/stream-$wid.log" 2>&1 &
62
+ ACTION="${ACTION}+stream "
63
+ done
64
+ elif (( DIFF < 0 )); then
65
+ # Kill youngest streaming first
66
+ KILLN=$(( -DIFF ))
67
+ pgrep -f "streaming-mirror-worker.sh" | tail -"$KILLN" | xargs -r kill 2>/dev/null
68
+ ACTION="${ACTION}-${KILLN}stream "
69
+ fi
70
+
71
+ # Spawn bulk up to target
72
+ DIFF_B=$((T_BULK - N_BULK))
73
+ if (( DIFF_B > 0 )); then
74
+ for _ in $(seq 1 "$DIFF_B"); do
75
+ wid="autoscale-bulk-$(date +%s)-$$-${RANDOM}"
76
+ nohup bash "$HOME/.surrogate/bin/v2/bulk-mirror-worker.sh" "$wid" \
77
+ > "$HOME/.surrogate/logs/bulk-$wid.log" 2>&1 &
78
+ ACTION="${ACTION}+bulk "
79
+ done
80
+ elif (( DIFF_B < 0 )); then
81
+ KILLN=$(( -DIFF_B ))
82
+ pgrep -f "bulk-mirror-worker.sh" | tail -"$KILLN" | xargs -r kill 2>/dev/null
83
+ ACTION="${ACTION}-${KILLN}bulk "
84
+ fi
85
+
86
+ [[ -z "$ACTION" ]] && ACTION="steady"
87
+
88
+ echo "[$(date '+%H:%M:%S')] tier=$TIER avail=${AVAIL_MB}MB stream=$N_STREAM/$T_STREAM bulk=$N_BULK/$T_BULK action=$ACTION" >> "$LOG"
89
+
90
+ # Discord notify on tier change (compare to last)
91
+ LAST_TIER_FILE="$HOME/.surrogate/logs/.auto-scaler-last-tier"
92
+ LAST_TIER=$(cat "$LAST_TIER_FILE" 2>/dev/null || echo "")
93
+ if [[ "$TIER" != "$LAST_TIER" ]]; then
94
+ echo "$TIER" > "$LAST_TIER_FILE"
95
+ if [[ -n "${DISCORD_WEBHOOK:-}" ]]; then
96
+ case "$TIER" in
97
+ BURST) emoji="πŸš€" ;;
98
+ MID) emoji="⚑" ;;
99
+ SAFE) emoji="βœ…" ;;
100
+ MIN) emoji="⚠️" ;;
101
+ CRISIS) emoji="πŸ”΄" ;;
102
+ esac
103
+ curl -s -X POST -H "Content-Type: application/json" \
104
+ -d "{\"content\":\"$emoji auto-scaler: tier $LAST_TIER β†’ **$TIER** (avail ${AVAIL_MB}MB) β†’ stream=$T_STREAM bulk=$T_BULK\"}" \
105
+ "$DISCORD_WEBHOOK" >/dev/null 2>&1 || true
106
+ fi
107
+ fi
bin/v2/continuous-discoverer.sh CHANGED
@@ -198,7 +198,7 @@ while true; do
198
  CYCLE=$((CYCLE+1))
199
  for fn in "${SOURCE_FNS[@]}"; do
200
  $fn 2>>"$LOG" || true
201
- sleep 8 # polite per-step throttle
202
  done
203
 
204
  # Re-seed coordinator every 10 cycles (~50 min) so newly-added rows enter queue
@@ -209,5 +209,5 @@ while true; do
209
  notify "cycle $CYCLE done β€” queue total $TOTAL sources"
210
  fi
211
 
212
- sleep 30 # cycle gap (each full cycle ~5 min counting per-step sleeps)
213
  done
 
198
  CYCLE=$((CYCLE+1))
199
  for fn in "${SOURCE_FNS[@]}"; do
200
  $fn 2>>"$LOG" || true
201
+ sleep 4 # tighter β€” burst mode (was 8s)
202
  done
203
 
204
  # Re-seed coordinator every 10 cycles (~50 min) so newly-added rows enter queue
 
209
  notify "cycle $CYCLE done β€” queue total $TOTAL sources"
210
  fi
211
 
212
+ sleep 10 # tighter cycle gap β€” burst mode (was 30s)
213
  done
start.sh CHANGED
@@ -397,6 +397,8 @@ while true; do
397
  # Each major task picks a unique M%X==N offset so no two fire together.
398
  [[ $((M % 2)) -eq 1 ]] && bash ~/.surrogate/bin/surrogate-dev-loop.sh 1 >> "$LOG" 2>&1 &
399
  [[ $((M % 5)) -eq 2 ]] && bash ~/.surrogate/bin/work-queue-producer.sh >> "$LOG" 2>&1 &
 
 
400
  # push-training-to-hf gated by memory (loads big shard into RAM).
401
  # Anchor (24GB) takes over when capacity arrives β€” see anchor cron-loop.
402
  [[ $((M % 3)) -eq 1 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \
 
397
  # Each major task picks a unique M%X==N offset so no two fire together.
398
  [[ $((M % 2)) -eq 1 ]] && bash ~/.surrogate/bin/surrogate-dev-loop.sh 1 >> "$LOG" 2>&1 &
399
  [[ $((M % 5)) -eq 2 ]] && bash ~/.surrogate/bin/work-queue-producer.sh >> "$LOG" 2>&1 &
400
+ # Auto-scaler β€” spawn/kill workers based on free memory tier (burst-but-don't-die)
401
+ [[ $((M % 5)) -eq 4 ]] && bash ~/.surrogate/bin/v2/auto-scaler.sh >> "$LOG" 2>&1 &
402
  # push-training-to-hf gated by memory (loads big shard into RAM).
403
  # Anchor (24GB) takes over when capacity arrives β€” see anchor cron-loop.
404
  [[ $((M % 3)) -eq 1 ]] && bash ~/.surrogate/bin/v2/memory-guard.sh \