Ashira Pitchayapakayakul committed on
Commit
fbea511
·
1 Parent(s): ff4b1b7

fix: HF 128-commits/hr cap — 5x retry-with-backoff + slower ticker + adaptive watchdog

Browse files

Audit found three compounding issues triggering '429 Too Many Requests' on
the dataset repo (HF cap is 128 commits/hr per repo):

1. push-training-to-hf.sh: previous behavior on 429 was give-up-and-leave-
slice-on-disk. The slice would be retried next cron tick (3 min later),
but if the cap was still saturated, it'd 429 again and pile up local
disk. Added 5-attempt exponential backoff (30s, 90s, 240s, 600s, 1200s
= ~36 min worst-case before giving up). On non-429 hub errors, retries
once then fails.

2. gh-actions-ticker.sh: dispatch interval 60s -> 120s. Halves the burst
rate that's reaching HF when GH runners complete and upload.

3. self-heal-watchdog.sh: kill_count was hardcoded to 1. It is now adaptive:
* pct >= 85%: kill 1 youngest shard
* pct >= 90%: kill 2 youngest
* pct >= 95%: kill 3 youngest + parquet-direct (the second-biggest hog)
We saw mem stick at 94% even after single-shard kills — adaptive
escalation lets the watchdog catch up to a runaway spike.

bin/gh-actions-ticker.sh CHANGED
@@ -17,7 +17,7 @@ set -uo pipefail
17
  LOG="$HOME/.surrogate/logs/gh-actions-ticker.log"
18
  mkdir -p "$(dirname "$LOG")"
19
 
20
- TICK_SEC="${GH_TICK_SEC:-60}"
21
 
22
  dispatch() {
23
  local repo="$1"
 
17
  LOG="$HOME/.surrogate/logs/gh-actions-ticker.log"
18
  mkdir -p "$(dirname "$LOG")"
19
 
20
+ TICK_SEC="${GH_TICK_SEC:-120}" # 60s -> 120s after HF rate-limit at 128 commits/hr
21
 
22
  dispatch() {
23
  local repo="$1"
bin/push-training-to-hf.sh CHANGED
@@ -60,24 +60,45 @@ hf_auth = os.environ["HF_AUTH"]
60
 
61
  try:
62
  from huggingface_hub import HfApi
 
63
  except ImportError:
64
  print(f"[{time.strftime('%H:%M:%S')}] ERR: huggingface_hub not installed")
65
  sys.exit(2)
66
 
67
  api = HfApi(token=hf_auth)
68
- try:
69
- api.upload_file(
70
- path_or_fileobj=slice_path,
71
- path_in_repo=remote,
72
- repo_id="axentx/surrogate-1-training-pairs",
73
- repo_type="dataset",
74
- commit_message=f"chunk: +{n_lines} pairs ({time.strftime('%H:%M')})",
75
- )
76
- print(f"[{time.strftime('%H:%M:%S')}] ✅ uploaded → {remote}")
77
- sys.exit(0)
78
- except Exception as e:
79
- print(f"[{time.strftime('%H:%M:%S')}] ❌ {type(e).__name__}: {str(e)[:300]}")
80
- sys.exit(3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  PYEOF
82
  then
83
  echo "$NEW_OFFSET" > "$OFFSET_FILE"
 
60
 
61
  try:
62
  from huggingface_hub import HfApi
63
+ from huggingface_hub.errors import HfHubHTTPError
64
  except ImportError:
65
  print(f"[{time.strftime('%H:%M:%S')}] ERR: huggingface_hub not installed")
66
  sys.exit(2)
67
 
68
  api = HfApi(token=hf_auth)
69
+
70
+ # 5-attempt retry with exponential backoff. The repo has a hard 128
71
+ # commits/hr ceiling that gets hit when 40+ GH Actions runners + Space
72
+ # shards all push at once. Backoff lets that ceiling drift back down.
73
+ delays = [30, 90, 240, 600, 1200] # 30s, 1.5m, 4m, 10m, 20m (~36 min total worst-case)
74
+ for attempt, delay in enumerate([0] + delays):
75
+ if delay:
76
+ time.sleep(delay)
77
+ try:
78
+ api.upload_file(
79
+ path_or_fileobj=slice_path,
80
+ path_in_repo=remote,
81
+ repo_id="axentx/surrogate-1-training-pairs",
82
+ repo_type="dataset",
83
+ commit_message=f"chunk: +{n_lines} pairs ({time.strftime('%H:%M')})",
84
+ )
85
+ print(f"[{time.strftime('%H:%M:%S')}] ✅ uploaded → {remote} (attempt {attempt + 1})")
86
+ sys.exit(0)
87
+ except HfHubHTTPError as e:
88
+ msg = str(e)
89
+ if "429" in msg or "rate limit" in msg.lower() or "Too Many Requests" in msg:
90
+ print(f"[{time.strftime('%H:%M:%S')}] ⚠ 429 on attempt {attempt + 1}/{len(delays)+1} — backing off {delays[attempt] if attempt < len(delays) else 0}s")
91
+ continue
92
+ # Non-429 hub errors: retry once with short delay then fail
93
+ print(f"[{time.strftime('%H:%M:%S')}] ❌ {type(e).__name__}: {msg[:200]}")
94
+ if attempt < 2: continue
95
+ sys.exit(3)
96
+ except Exception as e:
97
+ print(f"[{time.strftime('%H:%M:%S')}] ❌ {type(e).__name__}: {str(e)[:200]}")
98
+ if attempt < 2: continue
99
+ sys.exit(3)
100
+ print(f"[{time.strftime('%H:%M:%S')}] ❌ all retries exhausted — slice will be retried next cron tick")
101
+ sys.exit(3)
102
  PYEOF
103
  then
104
  echo "$NEW_OFFSET" > "$OFFSET_FILE"
bin/self-heal-watchdog.sh CHANGED
@@ -54,15 +54,23 @@ last_kept_age_min() {
54
  heal_memory() {
55
  local pct="$1"
56
  log "MEMORY ALERT pct=$pct% threshold=$MEM_THRESHOLD_PCT% — preempting OOM"
57
- # Find the youngest (highest PID) dataset-enrich shard process and SIGTERM.
58
- # The shard loop sleeps SHARD_COOLDOWN before respawning, so memory recovers.
59
- local victim
60
- victim=$(pgrep -f "dataset-enrich.sh" | sort -nr | head -1)
61
- if [[ -n "$victim" ]]; then
62
- log " -> kill youngest dataset-enrich pid=$victim"
63
- kill -TERM "$victim" 2>/dev/null || true
64
- else
65
- log " -> no dataset-enrich processes found; nothing to preempt"
 
 
 
 
 
 
 
 
66
  fi
67
  }
68
 
 
54
  heal_memory() {
55
  local pct="$1"
56
  log "MEMORY ALERT pct=$pct% threshold=$MEM_THRESHOLD_PCT% — preempting OOM"
57
+ # Adaptive aggressiveness kill more shards as pct gets dangerously close.
58
+ local kill_count=1
59
+ [[ "$pct" -ge 90 ]] && kill_count=2 # 90%+: kill 2 youngest
60
+ [[ "$pct" -ge 95 ]] && kill_count=3 # 95%+: kill 3 youngest, plus parquet-direct if running
61
+
62
+ local victims
63
+ victims=$(pgrep -f "dataset-enrich.sh" | sort -nr | head -"$kill_count")
64
+ if [[ -n "$victims" ]]; then
65
+ echo "$victims" | while read -r pid; do
66
+ log " -> SIGTERM dataset-enrich pid=$pid"
67
+ kill -TERM "$pid" 2>/dev/null || true
68
+ done
69
+ fi
70
+ # Also nuke parquet-direct if mem is critical — it's the second-biggest hog
71
+ if [[ "$pct" -ge 95 ]]; then
72
+ local pq=$(pgrep -f "parquet-direct-ingest.sh" | head -1)
73
+ [[ -n "$pq" ]] && { log " -> CRITICAL: SIGTERM parquet-direct pid=$pq"; kill -TERM "$pq" 2>/dev/null || true; }
74
  fi
75
  }
76