ashirato committed on
Commit
2fd0435
·
1 Parent(s): 21c6a6e

feat(train-ready): single-file pusher train-ready/latest.jsonl.gz for CDN-only training fetches — eliminates HF API listing during Lightning/Kaggle training

Browse files
Files changed (2) hide show
  1. bin/train-ready-pusher.sh +84 -0
  2. start.sh +7 -0
bin/train-ready-pusher.sh ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env bash
# Train-ready pusher daemon.
#
# Every 5 minutes, gzip the latest MAX_LINES samples of the local
# training-pairs file and upload them to a FIXED path in the dataset repo:
#   axentx/surrogate-1-training-pairs/train-ready/latest.jsonl.gz
#
# Why a fixed path: training jobs (Lightning / Kaggle / Modal) can curl this
# one URL straight off the CDN with no HF API calls (no list_repo_files, no
# rate limit), avoiding the 1000-req/5min token contention between the HF
# Space daemons and the training jobs.
#
# Format: one JSON object per line, {"prompt": "...", "response": "..."} —
# same as the live file.

set -uo pipefail
set -a; source "$HOME/.hermes/.env" 2>/dev/null; set +a

LOG="$HOME/.surrogate/logs/train-ready-pusher.log"
mkdir -p "$(dirname "$LOG")"
# NOTE(review): the commit message refers to /data/training-pairs.jsonl —
# presumably $HOME resolves under /data on the Space; confirm against deploy env.
SOURCE="${HOME}/.surrogate/training-pairs.jsonl"
TARGET_REPO="axentx/surrogate-1-training-pairs"
TARGET_PATH="train-ready/latest.jsonl.gz"
MAX_LINES="${MAX_LINES:-200000}" # 200K samples — training side can sample down
INTERVAL=300                     # push cadence in seconds (5 min)

if [[ -z "${HF_TOKEN:-}" ]]; then
  echo "[$(date +%H:%M:%S)] train-ready-pusher: HF_TOKEN not set" | tee -a "$LOG"
  exit 0
fi

# Don't leak snapshot files in /tmp if the daemon is killed mid-cycle.
TMP=""
cleanup() { [[ -n "$TMP" ]] && rm -f "$TMP"; }
trap cleanup EXIT

while true; do
  if [[ ! -f "$SOURCE" ]]; then
    echo "[$(date +%H:%M:%S)] source not found: $SOURCE" | tee -a "$LOG"
    sleep "$INTERVAL"
    continue
  fi

  # Take latest N lines (most-recent samples = most-curated by the
  # self-improvement loop). mktemp gives an unpredictable, collision-free
  # name — the old "$(date +%s)" scheme could collide across instances.
  TMP="$(mktemp /tmp/train-ready.XXXXXX)"
  tail -n "$MAX_LINES" "$SOURCE" 2>>"$LOG" | gzip -c > "$TMP" 2>>"$LOG"
  # NOTE: GNU stat syntax; BSD/macOS would need `stat -f %z` — fine on the Space.
  BYTES=$(stat -c %s "$TMP" 2>/dev/null || echo 0)

  if [[ "$BYTES" -lt 1000 ]]; then
    echo "[$(date +%H:%M:%S)] file too small ($BYTES B), skip" | tee -a "$LOG"
    rm -f "$TMP"; TMP=""
    sleep "$INTERVAL"
    continue
  fi

  # "up to": the source file may hold fewer than MAX_LINES lines.
  echo "[$(date +%H:%M:%S)] pushing ${BYTES} bytes (up to ${MAX_LINES} lines) → ${TARGET_PATH}" | tee -a "$LOG"

  # Quoted heredoc ('PYEOF') — no shell expansion inside the Python source.
  HF_TOKEN="$HF_TOKEN" python3 - "$TMP" "$TARGET_REPO" "$TARGET_PATH" <<'PYEOF' 2>>"$LOG"
import sys, os, time

# argv: local gzip snapshot path, target dataset repo, fixed remote path
local_path, repo, remote = sys.argv[1], sys.argv[2], sys.argv[3]
from huggingface_hub import HfApi
from huggingface_hub.errors import HfHubHTTPError

api = HfApi(token=os.environ["HF_TOKEN"])
for attempt in range(5):
    try:
        api.upload_file(
            path_or_fileobj=local_path,
            path_in_repo=remote,
            repo_id=repo,
            repo_type="dataset",
            commit_message=f"train-ready pusher: latest snapshot {time.strftime('%H:%M')}",
        )
        print(f" ✅ pushed → {repo}/{remote}")
        sys.exit(0)
    except HfHubHTTPError as e:
        if "429" in str(e):
            # Rate-limited: linear backoff (60s, 120s, ... up to 5 tries).
            wait = 60 * (attempt + 1)
            print(f" rate-limit; wait {wait}s")
            time.sleep(wait)
        else:
            print(f" ❌ {type(e).__name__}: {str(e)[:200]}")
            sys.exit(1)
    except Exception as e:
        print(f" ❌ {type(e).__name__}: {str(e)[:200]}")
        sys.exit(1)
print(" ❌ all retries exhausted")
sys.exit(1)
PYEOF

  rm -f "$TMP"; TMP=""
  # Push every 5 min — keeps the training side's view fresh without burning commits.
  sleep "$INTERVAL"
done
start.sh CHANGED
@@ -314,6 +314,13 @@ echo "[$(date +%H:%M:%S)] parquet-direct-ingest started (2 parallel DLs)" >> "$L
314
  nohup bash ~/.surrogate/bin/skill-synthesis-daemon.sh > "$LOG_DIR/skill-synthesis.log" 2>&1 &
315
  echo "[$(date +%H:%M:%S)] skill-synthesis daemon started" >> "$LOG_DIR/boot.log"
316
 
 
 
 
 
 
 
 
317
  # ── 7b. Cron loop — non-scrape daemons (scrape now runs continuously above) ─
318
  cat > /tmp/hermes-cron.sh <<'CRONSH'
319
  #!/bin/bash
 
314
  nohup bash ~/.surrogate/bin/skill-synthesis-daemon.sh > "$LOG_DIR/skill-synthesis.log" 2>&1 &
315
  echo "[$(date +%H:%M:%S)] skill-synthesis daemon started" >> "$LOG_DIR/boot.log"
316
 
317
+ # ── 7d. Train-ready pusher — pushes /data/training-pairs.jsonl as a SINGLE
318
+ # fixed-path file (train-ready/latest.jsonl.gz) every 5 min so Lightning /
319
+ # Kaggle / Modal training scripts can curl one URL via CDN without any
320
+ # HF API calls (avoids the 1000-req/5min token contention).
321
+ nohup bash ~/.surrogate/bin/train-ready-pusher.sh > "$LOG_DIR/train-ready-pusher.log" 2>&1 &
322
+ echo "[$(date +%H:%M:%S)] train-ready-pusher daemon started" >> "$LOG_DIR/boot.log"
323
+
324
  # ── 7b. Cron loop — non-scrape daemons (scrape now runs continuously above) ─
325
  cat > /tmp/hermes-cron.sh <<'CRONSH'
326
  #!/bin/bash