fix: HF 128-commits/hr cap — 5x retry-with-backoff + slower ticker + adaptive watchdog
Audit found three compounding issues triggering '429 Too Many Requests' on
the dataset repo (HF cap is 128 commits/hr per repo):
1. push-training-to-hf.sh: previous behavior on 429 was give-up-and-leave-
   slice-on-disk. The slice would be retried on the next cron tick (3 min
   later), but if the cap was still saturated it'd 429 again, and slices
   piled up on local disk. Now the upload gets up to five backed-off
   retries after the initial attempt (30s, 90s, 240s, 600s, 1200s; ~36 min
   of cumulative backoff before giving up; arithmetic check below).
   Non-429 hub errors are retried twice, on the 30s and 90s delays, then
   fail.
2. gh-actions-ticker.sh: dispatch interval 60s -> 120s. Halves the burst
rate that's reaching HF when GH runners complete and upload.
3. self-heal-watchdog.sh: kill_count was hardcoded to 1. Now adaptive:
* pct >= 85%: kill 1 youngest shard
* pct >= 90%: kill 2 youngest
* pct >= 95%: kill 3 youngest + parquet-direct (the second-biggest hog)
We saw mem stick at 94% even after single-shard kills — adaptive
escalation lets the watchdog catch up to a runaway spike.
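
Quick arithmetic check on the figures in items 1 and 2 (standalone shell,
not part of the commit; runs anywhere):

echo $(( 30 + 90 + 240 + 600 + 1200 ))  # 2160 s of backoff = 36 min worst-case
echo $(( 3600 / 120 ))                  # 30 ticker dispatches/hr, down from 60
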
- bin/gh-actions-ticker.sh +1 -1
- bin/push-training-to-hf.sh +34 -13
- bin/self-heal-watchdog.sh +17 -9

bin/gh-actions-ticker.sh
@@ -17,7 +17,7 @@ set -uo pipefail
 LOG="$HOME/.surrogate/logs/gh-actions-ticker.log"
 mkdir -p "$(dirname "$LOG")"
 
-TICK_SEC="${GH_TICK_SEC:-60}"
+TICK_SEC="${GH_TICK_SEC:-120}"  # 60s -> 120s after HF rate-limit at 128 commits/hr
 
 dispatch() {
   local repo="$1"
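
Since the interval reads through "${GH_TICK_SEC:-120}", the new 120s default
can still be overridden per invocation without editing the script, e.g. to
back off even harder during an incident (hypothetical invocation):

GH_TICK_SEC=300 bin/gh-actions-ticker.sh
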
bin/push-training-to-hf.sh
@@ -60,24 +60,45 @@ hf_auth = os.environ["HF_AUTH"]
 
 try:
     from huggingface_hub import HfApi
+    from huggingface_hub.errors import HfHubHTTPError
 except ImportError:
     print(f"[{time.strftime('%H:%M:%S')}] ERR: huggingface_hub not installed")
     sys.exit(2)
 
 api = HfApi(token=hf_auth)
-# ... previous single-attempt upload block (13 lines, elided) ...
+
+# 5-attempt retry with exponential backoff. The repo has a hard 128
+# commits/hr ceiling that gets hit when 40+ GH Actions runners + Space
+# shards all push at once. Backoff lets that ceiling drift back down.
+delays = [30, 90, 240, 600, 1200]  # 30s, 1.5m, 4m, 10m, 20m (~36 min total worst-case)
+for attempt, delay in enumerate([0] + delays):
+    if delay:
+        time.sleep(delay)
+    try:
+        api.upload_file(
+            path_or_fileobj=slice_path,
+            path_in_repo=remote,
+            repo_id="axentx/surrogate-1-training-pairs",
+            repo_type="dataset",
+            commit_message=f"chunk: +{n_lines} pairs ({time.strftime('%H:%M')})",
+        )
+        print(f"[{time.strftime('%H:%M:%S')}] ✅ uploaded → {remote} (attempt {attempt + 1})")
+        sys.exit(0)
+    except HfHubHTTPError as e:
+        msg = str(e)
+        if "429" in msg or "rate limit" in msg.lower() or "Too Many Requests" in msg:
+            print(f"[{time.strftime('%H:%M:%S')}] ⚠ 429 on attempt {attempt + 1}/{len(delays) + 1} — backing off {delays[attempt] if attempt < len(delays) else 0}s")
+            continue
+        # Non-429 hub errors: retry twice on the short delays, then fail
+        print(f"[{time.strftime('%H:%M:%S')}] ❌ {type(e).__name__}: {msg[:200]}")
+        if attempt < 2: continue
+        sys.exit(3)
+    except Exception as e:
+        print(f"[{time.strftime('%H:%M:%S')}] ❌ {type(e).__name__}: {str(e)[:200]}")
+        if attempt < 2: continue
+        sys.exit(3)
+print(f"[{time.strftime('%H:%M:%S')}] ❌ all retries exhausted — slice will be retried next cron tick")
+sys.exit(3)
 PYEOF
 then
   echo "$NEW_OFFSET" > "$OFFSET_FILE"
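
For context, the trailing PYEOF / then lines show the uploader runs as a
heredoc inside an if, so the offset file only advances on exit 0. A rough
sketch of that wrapper (everything outside the diff's own lines is assumed,
including how slice_path, remote, and n_lines get bound above line 60):

if python3 - <<PYEOF
# ... embedded uploader from the hunk above ...
PYEOF
then
  echo "$NEW_OFFSET" > "$OFFSET_FILE"  # success: advance past the uploaded slice
else
  # exit 2 (missing dep) or exit 3 (retries exhausted): offset stays put,
  # so the same slice is retried on the next cron tick
  :
fi
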
bin/self-heal-watchdog.sh
@@ -54,15 +54,23 @@ last_kept_age_min() {
 heal_memory() {
   local pct="$1"
   log "MEMORY ALERT pct=$pct% threshold=$MEM_THRESHOLD_PCT% — preempting OOM"
-  # ... previous fixed single-kill block (9 lines, elided) ...
+  # Adaptive aggressiveness — kill more shards as pct gets dangerously close.
+  local kill_count=1
+  [[ "$pct" -ge 90 ]] && kill_count=2  # 90%+: kill 2 youngest
+  [[ "$pct" -ge 95 ]] && kill_count=3  # 95%+: kill 3 youngest, plus parquet-direct if running
+
+  local victims
+  victims=$(pgrep -f "dataset-enrich.sh" | sort -nr | head -"$kill_count")
+  if [[ -n "$victims" ]]; then
+    echo "$victims" | while read -r pid; do
+      log "  -> SIGTERM dataset-enrich pid=$pid"
+      kill -TERM "$pid" 2>/dev/null || true
+    done
+  fi
+  # Also nuke parquet-direct if mem is critical — it's the second-biggest hog
+  if [[ "$pct" -ge 95 ]]; then
+    local pq=$(pgrep -f "parquet-direct-ingest.sh" | head -1)
+    [[ -n "$pq" ]] && { log "  -> CRITICAL: SIGTERM parquet-direct pid=$pq"; kill -TERM "$pq" 2>/dev/null || true; }
   fi
 }
 
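
The escalation ladder in heal_memory is pure threshold arithmetic, so it can
be sanity-checked in isolation (standalone snippet, not part of the commit):

for pct in 85 89 90 94 95 99; do
  kill_count=1
  [[ "$pct" -ge 90 ]] && kill_count=2
  [[ "$pct" -ge 95 ]] && kill_count=3
  echo "pct=$pct -> kill_count=$kill_count"
done
# 85-89 -> 1, 90-94 -> 2, 95+ -> 3 (heal_memory only fires at/above MEM_THRESHOLD_PCT)

One caveat on victim selection: pgrep | sort -nr approximates "youngest first"
by numeric PID, which is cheap but can mis-order after PID wraparound on
long-lived hosts.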