Spaces:
Runtime error
Runtime error
feat(train-ready): single-file pusher train-ready/latest.jsonl.gz for CDN-only training fetches — eliminates HF API listing during Lightning/Kaggle training
- bin/train-ready-pusher.sh  +84 −0
- start.sh  +7 −0
bin/train-ready-pusher.sh
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/usr/bin/env bash
# Train-ready pusher — every 5 min, gzip the latest MAX_LINES samples of the
# local training-pairs file and upload them to a FIXED path in the dataset repo:
#   axentx/surrogate-1-training-pairs/train-ready/latest.jsonl.gz
#
# Why a fixed path: the Lightning training script can curl this URL directly
# via the CDN without ANY HF API calls (no list_repo_files, no rate limit).
# Solves the 1000-req/5min token contention between HF Space daemons +
# training jobs.
#
# Format: each line = {"prompt": "...", "response": "..."} — same as live file.

# NOTE: deliberately no `set -e` — this is a long-lived daemon; a transient
# failure in one iteration must not kill the loop.
set -uo pipefail
set -a; source "$HOME/.hermes/.env" 2>/dev/null; set +a

LOG="$HOME/.surrogate/logs/train-ready-pusher.log"
mkdir -p "$(dirname "$LOG")"
SOURCE="${HOME}/.surrogate/training-pairs.jsonl"
TARGET_REPO="axentx/surrogate-1-training-pairs"
TARGET_PATH="train-ready/latest.jsonl.gz"
MAX_LINES="${MAX_LINES:-200000}"   # 200K samples — Lightning can sample down

if [[ -z "${HF_TOKEN:-}" ]]; then
  echo "[$(date +%H:%M:%S)] train-ready-pusher: HF_TOKEN not set" | tee -a "$LOG"
  exit 0
fi

# Temp-file hygiene: mktemp instead of the predictable
# /tmp/train-ready-$(date +%s).jsonl.gz (race/symlink-prone), and an EXIT trap
# so the snapshot is cleaned up even if the daemon is killed mid-iteration.
TMP=""
cleanup() { [[ -n "$TMP" ]] && rm -f -- "$TMP"; }
trap cleanup EXIT

while true; do
  if [[ ! -f "$SOURCE" ]]; then
    echo "[$(date +%H:%M:%S)] source not found: $SOURCE" | tee -a "$LOG"
    sleep 300
    continue
  fi

  # Take the latest N lines (most-recent samples = most-curated by the
  # self-improvement loop).
  TMP=$(mktemp /tmp/train-ready-XXXXXX) || { echo "[$(date +%H:%M:%S)] mktemp failed" | tee -a "$LOG"; sleep 300; continue; }
  tail -n "$MAX_LINES" -- "$SOURCE" 2>>"$LOG" | gzip -c > "$TMP" 2>>"$LOG"
  # wc -c < file is portable; stat -c %s is GNU-only (breaks on BSD/macOS).
  BYTES=$(wc -c < "$TMP" 2>/dev/null || echo 0)

  if [[ "$BYTES" -lt 1000 ]]; then
    echo "[$(date +%H:%M:%S)] file too small ($BYTES B), skip" | tee -a "$LOG"
    rm -f -- "$TMP"; TMP=""
    sleep 300
    continue
  fi

  # "up to": the source file may hold fewer than MAX_LINES lines.
  echo "[$(date +%H:%M:%S)] pushing ${BYTES} bytes (up to ${MAX_LINES} lines) → ${TARGET_PATH}" | tee -a "$LOG"

  # Upload via huggingface_hub with bounded backoff on HTTP 429; any other
  # error is logged and skipped so the daemon keeps running.
  HF_TOKEN="$HF_TOKEN" python3 - "$TMP" "$TARGET_REPO" "$TARGET_PATH" <<'PYEOF' 2>>"$LOG"
import sys, os, time
local_path, repo, remote = sys.argv[1], sys.argv[2], sys.argv[3]
from huggingface_hub import HfApi
from huggingface_hub.errors import HfHubHTTPError

api = HfApi(token=os.environ["HF_TOKEN"])
for attempt in range(5):
    try:
        api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=remote,
            repo_id=repo,
            repo_type="dataset",
            commit_message=f"train-ready pusher: latest snapshot {time.strftime('%H:%M')}",
        )
        print(f" ✅ pushed → {repo}/{remote}")
        sys.exit(0)
    except HfHubHTTPError as e:
        if "429" in str(e):
            wait = 60 * (attempt + 1)
            print(f" rate-limit; wait {wait}s")
            time.sleep(wait)
        else:
            print(f" ❌ {type(e).__name__}: {str(e)[:200]}")
            sys.exit(1)
    except Exception as e:
        print(f" ❌ {type(e).__name__}: {str(e)[:200]}")
        sys.exit(1)
print(" ❌ all retries exhausted")
sys.exit(1)
PYEOF

  rm -f -- "$TMP"; TMP=""
  # Push every 5 min — keeps Lightning's view fresh without burning commits.
  sleep 300
done
start.sh
CHANGED
|
@@ -314,6 +314,13 @@ echo "[$(date +%H:%M:%S)] parquet-direct-ingest started (2 parallel DLs)" >> "$L
|
|
| 314 |
nohup bash ~/.surrogate/bin/skill-synthesis-daemon.sh > "$LOG_DIR/skill-synthesis.log" 2>&1 &
|
| 315 |
echo "[$(date +%H:%M:%S)] skill-synthesis daemon started" >> "$LOG_DIR/boot.log"
|
| 316 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 317 |
# ── 7b. Cron loop — non-scrape daemons (scrape now runs continuously above) ─
|
| 318 |
cat > /tmp/hermes-cron.sh <<'CRONSH'
|
| 319 |
#!/bin/bash
|
|
|
|
nohup bash ~/.surrogate/bin/skill-synthesis-daemon.sh > "$LOG_DIR/skill-synthesis.log" 2>&1 &
echo "[$(date +%H:%M:%S)] skill-synthesis daemon started" >> "$LOG_DIR/boot.log"

# ── 7d. Train-ready pusher — pushes /data/training-pairs.jsonl as a SINGLE
# fixed-path file (train-ready/latest.jsonl.gz) every 5 min so Lightning /
# Kaggle / Modal training scripts can curl one URL via CDN without any
# HF API calls (avoids the 1000-req/5min token contention).
# NOTE(review): the pusher script itself reads $HOME/.surrogate/training-pairs.jsonl —
# confirm /data is mounted or symlinked there, otherwise this comment is stale.
nohup bash ~/.surrogate/bin/train-ready-pusher.sh > "$LOG_DIR/train-ready-pusher.log" 2>&1 &
echo "[$(date +%H:%M:%S)] train-ready-pusher daemon started" >> "$LOG_DIR/boot.log"
| 324 |
# ── 7b. Cron loop — non-scrape daemons (scrape now runs continuously above) ─
|
| 325 |
cat > /tmp/hermes-cron.sh <<'CRONSH'
|
| 326 |
#!/bin/bash
|