Ashira Pitchayapakayakul committed on
Commit
fbea511
·
1 Parent(s): ff4b1b7

fix: HF 128-commits/hr cap — 5x retry-with-backoff + slower ticker + adaptive watchdog

Browse files

Audit found three compounding issues triggering '429 Too Many Requests' on
the dataset repo (HF cap is 128 commits/hr per repo):

1. push-training-to-hf.sh: previous behavior on 429 was give-up-and-leave-
slice-on-disk. The slice would be retried next cron tick (3 min later),
but if the cap was still saturated, it'd 429 again and pile up local
disk. Added 5-attempt exponential backoff (30s, 90s, 240s, 600s, 1200s
= ~36 min worst-case before giving up). On non-429 hub errors, retries
once then fails.

2. gh-actions-ticker.sh: dispatch interval 60s -> 120s. Halves the burst
rate that's reaching HF when GH runners complete and upload.

3. self-heal-watchdog.sh: kill_count was hardcoded to 1. It is now adaptive:
* pct >= 85%: kill 1 youngest shard
* pct >= 90%: kill 2 youngest
* pct >= 95%: kill 3 youngest + parquet-direct (the second-biggest hog)
We saw mem stick at 94% even after single-shard kills — adaptive
escalation lets the watchdog catch up to a runaway spike.

bin/gh-actions-ticker.sh CHANGED
@@ -17,7 +17,7 @@ set -uo pipefail
17
  LOG="$HOME/.surrogate/logs/gh-actions-ticker.log"
18
  mkdir -p "$(dirname "$LOG")"
19
 
20
- TICK_SEC="${GH_TICK_SEC:-60}"
21
 
22
  dispatch() {
23
  local repo="$1"
 
17
  LOG="$HOME/.surrogate/logs/gh-actions-ticker.log"
18
  mkdir -p "$(dirname "$LOG")"
19
 
20
+ TICK_SEC="${GH_TICK_SEC:-120}" # 60s -> 120s after HF rate-limit at 128 commits/hr
21
 
22
  dispatch() {
23
  local repo="$1"
bin/push-training-to-hf.sh CHANGED
@@ -60,24 +60,45 @@ hf_auth = os.environ["HF_AUTH"]
60
 
61
  try:
62
  from huggingface_hub import HfApi
 
63
  except ImportError:
64
  print(f"[{time.strftime('%H:%M:%S')}] ERR: huggingface_hub not installed")
65
  sys.exit(2)
66
 
67
  api = HfApi(token=hf_auth)
68
- try:
69
- api.upload_file(
70
- path_or_fileobj=slice_path,
71
- path_in_repo=remote,
72
- repo_id="axentx/surrogate-1-training-pairs",
73
- repo_type="dataset",
74
- commit_message=f"chunk: +{n_lines} pairs ({time.strftime('%H:%M')})",
75
- )
76
- print(f"[{time.strftime('%H:%M:%S')}] ✅ uploaded → {remote}")
77
- sys.exit(0)
78
- except Exception as e:
79
- print(f"[{time.strftime('%H:%M:%S')}] ❌ {type(e).__name__}: {str(e)[:300]}")
80
- sys.exit(3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  PYEOF
82
  then
83
  echo "$NEW_OFFSET" > "$OFFSET_FILE"
 
60
 
61
  try:
62
  from huggingface_hub import HfApi
63
+ from huggingface_hub.errors import HfHubHTTPError
64
  except ImportError:
65
  print(f"[{time.strftime('%H:%M:%S')}] ERR: huggingface_hub not installed")
66
  sys.exit(2)
67
 
68
  api = HfApi(token=hf_auth)
69
+
70
+ # 5-attempt retry with exponential backoff. The repo has a hard 128
71
+ # commits/hr ceiling that gets hit when 40+ GH Actions runners + Space
72
+ # shards all push at once. Backoff lets that ceiling drift back down.
73
+ delays = [30, 90, 240, 600, 1200] # 30s, 1.5m, 4m, 10m, 20m (~36 min total worst-case)
74
+ for attempt, delay in enumerate([0] + delays):
75
+ if delay:
76
+ time.sleep(delay)
77
+ try:
78
+ api.upload_file(
79
+ path_or_fileobj=slice_path,
80
+ path_in_repo=remote,
81
+ repo_id="axentx/surrogate-1-training-pairs",
82
+ repo_type="dataset",
83
+ commit_message=f"chunk: +{n_lines} pairs ({time.strftime('%H:%M')})",
84
+ )
85
+ print(f"[{time.strftime('%H:%M:%S')}] ✅ uploaded → {remote} (attempt {attempt + 1})")
86
+ sys.exit(0)
87
+ except HfHubHTTPError as e:
88
+ msg = str(e)
89
+ if "429" in msg or "rate limit" in msg.lower() or "Too Many Requests" in msg:
90
+ print(f"[{time.strftime('%H:%M:%S')}] ⚠ 429 on attempt {attempt + 1}/{len(delays)+1} — backing off {delays[attempt] if attempt < len(delays) else 0}s")
91
+ continue
92
+ # Non-429 hub errors: retry once with short delay then fail
93
+ print(f"[{time.strftime('%H:%M:%S')}] ❌ {type(e).__name__}: {msg[:200]}")
94
+ if attempt < 2: continue
95
+ sys.exit(3)
96
+ except Exception as e:
97
+ print(f"[{time.strftime('%H:%M:%S')}] ❌ {type(e).__name__}: {str(e)[:200]}")
98
+ if attempt < 2: continue
99
+ sys.exit(3)
100
+ print(f"[{time.strftime('%H:%M:%S')}] ❌ all retries exhausted — slice will be retried next cron tick")
101
+ sys.exit(3)
102
  PYEOF
103
  then
104
  echo "$NEW_OFFSET" > "$OFFSET_FILE"
bin/self-heal-watchdog.sh CHANGED
@@ -54,15 +54,23 @@ last_kept_age_min() {
54
  heal_memory() {
55
  local pct="$1"
56
  log "MEMORY ALERT pct=$pct% threshold=$MEM_THRESHOLD_PCT% — preempting OOM"
57
- # Find the youngest (highest PID) dataset-enrich shard process and SIGTERM.
58
- # The shard loop sleeps SHARD_COOLDOWN before respawning, so memory recovers.
59
- local victim
60
- victim=$(pgrep -f "dataset-enrich.sh" | sort -nr | head -1)
61
- if [[ -n "$victim" ]]; then
62
- log " -> kill youngest dataset-enrich pid=$victim"
63
- kill -TERM "$victim" 2>/dev/null || true
64
- else
65
- log " -> no dataset-enrich processes found; nothing to preempt"
 
 
 
 
 
 
 
 
66
  fi
67
  }
68
 
 
54
  heal_memory() {
55
  local pct="$1"
56
  log "MEMORY ALERT pct=$pct% threshold=$MEM_THRESHOLD_PCT% — preempting OOM"
57
+ # Adaptive aggressiveness kill more shards as pct gets dangerously close.
58
+ local kill_count=1
59
+ [[ "$pct" -ge 90 ]] && kill_count=2 # 90%+: kill 2 youngest
60
+ [[ "$pct" -ge 95 ]] && kill_count=3 # 95%+: kill 3 youngest, plus parquet-direct if running
61
+
62
+ local victims
63
+ victims=$(pgrep -f "dataset-enrich.sh" | sort -nr | head -"$kill_count")
64
+ if [[ -n "$victims" ]]; then
65
+ echo "$victims" | while read -r pid; do
66
+ log " -> SIGTERM dataset-enrich pid=$pid"
67
+ kill -TERM "$pid" 2>/dev/null || true
68
+ done
69
+ fi
70
+ # Also nuke parquet-direct if mem is critical — it's the second-biggest hog
71
+ if [[ "$pct" -ge 95 ]]; then
72
+ local pq=$(pgrep -f "parquet-direct-ingest.sh" | head -1)
73
+ [[ -n "$pq" ]] && { log " -> CRITICAL: SIGTERM parquet-direct pid=$pq"; kill -TERM "$pq" 2>/dev/null || true; }
74
  fi
75
  }
76