Ashira Pitchayapakayakul committed on
Commit ff1eae4 · 1 Parent(s): ea561c8

fix: 4 critical bugs + incremental push (gentle ingest, no crashes)


ROOT CAUSES:
1. push-training-to-hf.sh: tried to upload the entire 35K-pair file in one HfApi call
   → it kept failing → the offset never advanced → infinite retry of the same massive blob.
   FIX: chunk-based upload (1500 pairs/call, ~700KB/upload, ~17min to drain the backlog);
   see the sketch after this list. Each chunk goes to a unique remote path
   'batches/YYYY-MM-DD/chunk-HHMMSS-N.jsonl', and the offset advances on each successful chunk.

2. surrogate-self-ingest.sh: the filter required prompt+response >= 50 chars
   → all 35K agentic-crawler pairs were skipped (placeholder responses ~40 chars)
   → 'total indexed: 0' despite processing 35K pairs.
   FIX: relax the filter to 'both fields non-empty' and batch-process 5000 pairs/run.

3. dataset-enrich.sh: hardcoded ~/.claude/venv/bin/python (a Mac path, missing on Linux).
   FIX: use bare 'python3' (PATH-based).

4. scrape-keyword-tuner.sh + perf-watchdog.sh: used vm_stat (macOS only) with no fallback.
   FIX: check /proc/meminfo first (Linux), fall back to vm_stat (macOS).
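
The heart of fix 1 is the offset bookkeeping: slice exactly one chunk past the last pushed line, upload only that, and move the offset only when the upload succeeds. A minimal sketch of that loop body, using the same variable names as the script, with the actual HfApi call stubbed out as a hypothetical upload_chunk function:

PREV_OFFSET=$(cat "$OFFSET_FILE" 2>/dev/null || echo 0)
QUEUED=$(( $(wc -l < "$SRC") - PREV_OFFSET ))
TAKE=$(( QUEUED < CHUNK_SIZE ? QUEUED : CHUNK_SIZE ))    # never more than one chunk per run
sed -n "$((PREV_OFFSET + 1)),$((PREV_OFFSET + TAKE))p" "$SRC" > "$SLICE"
if upload_chunk "$SLICE"; then                           # hypothetical stand-in for the HfApi upload
  echo "$(( PREV_OFFSET + TAKE ))" > "$OFFSET_FILE"      # advance only after a confirmed upload
fi

Because the remote path embeds the date, time, and new offset, a retried chunk can never clobber an earlier upload.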

USER REQUESTED CONSTRAINT (honored):
'Don't ingest it all in one go or it dies; do it bit by bit' (translated from Thai) →
- self-ingest: BATCH_SIZE=5000 (configurable via the SELF_INGEST_BATCH env var)
- push-training: CHUNK_SIZE=1500 (configurable via the TRAINING_PUSH_CHUNK env var)
- Both advance their offsets incrementally; safe to interrupt and resume (see the usage sketch below)
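
For reference, a single gentler-than-default pass can be run by hand with the env vars above (the smaller sizes here are purely illustrative):

SELF_INGEST_BATCH=1000 bin/surrogate-self-ingest.sh    # index 1000 pairs, then stop
TRAINING_PUSH_CHUNK=500 bin/push-training-to-hf.sh     # push 500 pairs, then stop

Each run advances its offset file, so a later run resumes where the last successful batch ended.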

bin/dataset-enrich.sh CHANGED
@@ -23,7 +23,7 @@ mkdir -p "$WORK" "$(dirname "$LOG")"
 
 echo "[$(date +%H:%M:%S)] dataset enrich start" | tee "$LOG"
 
-~/.claude/venv/bin/python <<'PYEOF' 2>&1 | tee -a "$LOG"
+python3 <<'PYEOF' 2>&1 | tee -a "$LOG"
 from huggingface_hub import HfApi
 from pathlib import Path
 from datasets import load_dataset
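
Since the script now relies on whichever python3 is on PATH, a quick sanity check that the interpreter can see the packages imported above (nothing beyond those imports is assumed):

python3 -c "import huggingface_hub, datasets; print('deps ok')"
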
bin/perf-watchdog.sh CHANGED
@@ -25,10 +25,16 @@ PROC_CAP=30
 DISK_WARN_GB=2
 
 get_load() {
-  uptime | awk -F'load averages:' '{print $2}' | awk '{print int($1)}'
+  uptime | sed -E 's/.*load average[s]?:[[:space:]]*//' | awk -F',' '{print int($1)}'
 }
 get_free_pages() {
-  vm_stat | awk '/Pages free/{gsub("[.]","",$3); print $3}'
+  if [[ -r /proc/meminfo ]]; then
+    awk '/MemAvailable/{print int($2/4)}' /proc/meminfo
+  elif command -v vm_stat >/dev/null 2>&1; then
+    vm_stat | awk '/Pages free/{gsub("[.]","",$3); print $3}'
+  else
+    echo 999999
+  fi
 }
 get_scrape_procs() {
   pgrep -f "fs-to-jsonl\|github-bulk-train\|chroma-to-training\|bulk-scrape-burst" 2>/dev/null | wc -l | tr -d ' '
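
To sanity-check the new fallback on a given box, the two probes can be run directly. On Linux, MemAvailable is reported in kB, so dividing by 4 approximates the 4 KiB-page unit that vm_stat reports on macOS (a rough equivalence, not exact parity):

awk '/MemAvailable/{print int($2/4)}' /proc/meminfo            # Linux: free pages (approx.)
vm_stat | awk '/Pages free/{gsub("[.]","",$3); print $3}'      # macOS: free pages
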
bin/push-training-to-hf.sh CHANGED
@@ -1,42 +1,61 @@
 #!/usr/bin/env bash
-# Push accumulated training pairs from local jsonl → axentx/surrogate-1-training-pairs.
-# Uses python HfApi only (CLI syntax changed across versions; not reliable).
-# Idempotent: tracks last-pushed line offset so duplicates are skipped.
-# Only updates offset if push actually succeeded.
+# Push training pairs → HF dataset, INCREMENTALLY in small batches.
+#
+# Strategy: never upload the whole file. Each cron run pushes ONE chunk of
+# CHUNK_SIZE pairs to a date-stamped file (one per day). Small uploads = fast,
+# resilient, avoid timeouts on large blobs.
+#
+# Idempotent: tracks last-pushed line offset. Only advances on success.
+# If 35K pairs queued, drains over ~17 min (CHUNK_SIZE=1500 every 3 min).
 set -uo pipefail
 set -a; source "$HOME/.hermes/.env" 2>/dev/null; set +a
 
 SRC="$HOME/.surrogate/training-pairs.jsonl"
 OFFSET_FILE="$HOME/.surrogate/.training-push-offset"
 LOG="$HOME/.surrogate/logs/training-push.log"
+CHUNK_SIZE="${TRAINING_PUSH_CHUNK:-1500}"
 mkdir -p "$(dirname "$LOG")"
 
 [[ ! -f "$SRC" ]] && { echo "[$(date +%H:%M:%S)] no source $SRC" | tee -a "$LOG"; exit 0; }
 
 CUR_LINES=$(wc -l < "$SRC" | tr -d ' ')
 PREV_OFFSET=$(cat "$OFFSET_FILE" 2>/dev/null || echo 0)
-NEW_LINES=$(( CUR_LINES - PREV_OFFSET ))
+QUEUED=$(( CUR_LINES - PREV_OFFSET ))
 
-echo "[$(date +%H:%M:%S)] training push: $NEW_LINES new pairs (offset=$PREV_OFFSET, total=$CUR_LINES)" | tee -a "$LOG"
-[[ $NEW_LINES -le 0 ]] && exit 0
+echo "[$(date +%H:%M:%S)] queued=$QUEUED (offset=$PREV_OFFSET total=$CUR_LINES chunk=$CHUNK_SIZE)" | tee -a "$LOG"
+[[ $QUEUED -le 0 ]] && exit 0
 
-# Resolve token from any HF env var name
+# Take just one chunk (don't try to push everything at once — that's why it kept failing)
+TAKE=$QUEUED
+[[ $TAKE -gt $CHUNK_SIZE ]] && TAKE=$CHUNK_SIZE
+
+# Resolve token
 HF_AUTH="${HF_TOKEN:-${HUGGING_FACE_HUB_TOKEN:-${HUGGINGFACE_TOKEN:-}}}"
 if [[ -z "$HF_AUTH" ]]; then
-  echo "[$(date +%H:%M:%S)] ERR: no HF_TOKEN env — cannot upload" | tee -a "$LOG"
+  echo "[$(date +%H:%M:%S)] ERR: no HF_TOKEN — cannot upload" | tee -a "$LOG"
   exit 1
 fi
 
-# Slice new pairs to a date-stamped file
+# Slice this chunk to a unique-per-cron-fire file (no overwrite)
 DATE_TAG=$(date +%Y-%m-%d)
-SLICE="$HOME/.surrogate/.push-slice-${DATE_TAG}.jsonl"
-tail -n "$NEW_LINES" "$SRC" >> "$SLICE"
+TIME_TAG=$(date +%H%M%S)
+SLICE_DIR="$HOME/.surrogate/.push-slices"
+mkdir -p "$SLICE_DIR"
+SLICE="$SLICE_DIR/${DATE_TAG}_${TIME_TAG}.jsonl"
+
+# Take TAKE lines starting AFTER prev offset
+sed -n "$((PREV_OFFSET + 1)),$((PREV_OFFSET + TAKE))p" "$SRC" > "$SLICE"
+SLICE_LINES=$(wc -l < "$SLICE" | tr -d ' ')
+SLICE_BYTES=$(wc -c < "$SLICE" | tr -d ' ')
+echo "[$(date +%H:%M:%S)] uploading slice: $SLICE_LINES lines / $((SLICE_BYTES/1024)) KB" | tee -a "$LOG"
 
-# Upload via python HfApi (explicit token, explicit error handling)
-if HF_AUTH="$HF_AUTH" python3 - "$SLICE" "$NEW_LINES" "$DATE_TAG" >> "$LOG" 2>&1 <<'PYEOF'
-import sys, os, json, hashlib, time
-from pathlib import Path
-slice_path, n_pairs, date_tag = sys.argv[1], int(sys.argv[2]), sys.argv[3]
+# Upload to a chunk-specific filename — never overwrites, just adds new files
+NEW_OFFSET=$(( PREV_OFFSET + TAKE ))
+REMOTE_PATH="batches/${DATE_TAG}/chunk-${TIME_TAG}-${NEW_OFFSET}.jsonl"
+
+if HF_AUTH="$HF_AUTH" python3 - "$SLICE" "$REMOTE_PATH" "$SLICE_LINES" >> "$LOG" 2>&1 <<'PYEOF'
+import sys, os, time
+slice_path, remote, n_lines = sys.argv[1], sys.argv[2], sys.argv[3]
 hf_auth = os.environ["HF_AUTH"]
 
 try:
@@ -45,47 +64,27 @@ except ImportError:
     print(f"[{time.strftime('%H:%M:%S')}] ERR: huggingface_hub not installed")
     sys.exit(2)
 
-# Append to a daily file rather than overwrite — accumulate across pushes
 api = HfApi(token=hf_auth)
-remote_path = f"auto-orchestrate-{date_tag}.jsonl"
 try:
-    # Check if remote file exists; if yes, fetch + concat to avoid losing prior pushes
-    try:
-        existing = api.hf_hub_download(
-            repo_id="axentx/surrogate-1-training-pairs",
-            filename=remote_path,
-            repo_type="dataset",
-            local_dir="/tmp/hf-push-cache",
-            local_dir_use_symlinks=False,
-        )
-        # Concat: existing + slice → new payload
-        merged = Path("/tmp/hf-push-cache") / f"merged-{remote_path}"
-        with open(merged, "wb") as out:
-            out.write(Path(existing).read_bytes())
-            out.write(Path(slice_path).read_bytes())
-        upload_path = str(merged)
-    except Exception:
-        upload_path = slice_path
-
     api.upload_file(
-        path_or_fileobj=upload_path,
-        path_in_repo=remote_path,
+        path_or_fileobj=slice_path,
+        path_in_repo=remote,
         repo_id="axentx/surrogate-1-training-pairs",
         repo_type="dataset",
-        commit_message=f"auto-orchestrate: +{n_pairs} pairs ({time.strftime('%H:%M')})",
+        commit_message=f"chunk: +{n_lines} pairs ({time.strftime('%H:%M')})",
    )
-    print(f"[{time.strftime('%H:%M:%S')}] ✅ uploaded {n_pairs} new pairs to {remote_path}")
+    print(f"[{time.strftime('%H:%M:%S')}] ✅ uploaded → {remote}")
    sys.exit(0)
 except Exception as e:
    print(f"[{time.strftime('%H:%M:%S')}] ❌ {type(e).__name__}: {str(e)[:300]}")
    sys.exit(3)
 PYEOF
 then
-    # Only advance offset on actual upload success
-    echo "$CUR_LINES" > "$OFFSET_FILE"
+    echo "$NEW_OFFSET" > "$OFFSET_FILE"
    rm -f "$SLICE"
-    echo "[$(date +%H:%M:%S)] push complete · offset → $CUR_LINES" | tee -a "$LOG"
+    REMAINING=$(( CUR_LINES - NEW_OFFSET ))
+    echo "[$(date +%H:%M:%S)] offset → $NEW_OFFSET · remaining=$REMAINING (next run)" | tee -a "$LOG"
 else
-    echo "[$(date +%H:%M:%S)] push failed — offset unchanged ($PREV_OFFSET), slice retained for retry" | tee -a "$LOG"
+    echo "[$(date +%H:%M:%S)] push failed — offset still $PREV_OFFSET, slice retained: $SLICE" | tee -a "$LOG"
    exit 1
 fi
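
The cron cadence drains the backlog one chunk per run; a hand-driven drain is just a loop around the script until the offset catches up with the source file (a sketch, assuming the same SRC and OFFSET_FILE paths the script uses, run from the repo root):

while [[ "$(cat "$HOME/.surrogate/.training-push-offset" 2>/dev/null || echo 0)" -lt \
         "$(wc -l < "$HOME/.surrogate/training-pairs.jsonl")" ]]; do
  bin/push-training-to-hf.sh || break    # stop on the first failed chunk
  sleep 5                                # brief pause between uploads
done

Because the offset only moves after a successful upload, the loop can be interrupted and resumed at any point.
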
bin/surrogate-self-ingest.sh CHANGED
@@ -36,40 +36,57 @@ NEW=$(( CUR - PREV ))
 
 echo "[$(date +%H:%M:%S)] ingesting $NEW new pairs into FTS index" | tee -a "$LOG"
 
-tail -n "$NEW" "$SRC" | python3 - "$INDEX" >> "$LOG" 2>&1 <<'PYEOF'
+# Process in batches of 5000 — gentle, doesn't blow memory
+BATCH_SIZE="${SELF_INGEST_BATCH:-5000}"
+TAKE=$NEW
+[[ $TAKE -gt $BATCH_SIZE ]] && TAKE=$BATCH_SIZE
+echo "[$(date +%H:%M:%S)] processing $TAKE / $NEW (batch_size=$BATCH_SIZE)" | tee -a "$LOG"
+
+sed -n "$((PREV + 1)),$((PREV + TAKE))p" "$SRC" | python3 - "$INDEX" >> "$LOG" 2>&1 <<'PYEOF'
 import sys, json, sqlite3
-from datetime import datetime
 db = sys.argv[1]
 con = sqlite3.connect(db)
 con.execute("BEGIN")
-n = 0
+n = skipped_short = skipped_parse = 0
 for line in sys.stdin:
     try:
         d = json.loads(line)
-        src = d.get("source", "?")
-        role = src.replace("orchestrate-", "") if src.startswith("orchestrate-") else src
-        ts = d.get("ts", 0)
-        prompt = (d.get("prompt") or "")[:4000]
-        response = (d.get("response") or "")[:8000]
-        if len(prompt) < 50 or len(response) < 50:
-            continue
+    except Exception:
+        skipped_parse += 1
+        continue
+    src = d.get("source", "?")
+    role = src.replace("orchestrate-", "") if src.startswith("orchestrate-") else src
+    ts = d.get("ts", 0)
+    prompt = (d.get("prompt") or "")[:4000]
+    response = (d.get("response") or "")[:8000]
+    # Relaxed filter: index anything with both fields present (was 50-char min)
+    # Even short pairs are useful for tag-based retrieval
+    if not prompt or not response:
+        skipped_short += 1
+        continue
+    try:
         con.execute(
             "INSERT INTO pairs(source,role,prompt,response,ts) VALUES (?,?,?,?,?)",
             (src, role, prompt, response, str(ts))
        )
        n += 1
    except Exception as e:
-        print(f" skip line: {type(e).__name__}", file=sys.stderr)
+        print(f" insert err: {type(e).__name__}: {str(e)[:80]}", file=sys.stderr)
 con.commit()
-print(f" ingested {n} pairs (FTS index)", flush=True)
+print(f" inserted={n} skipped_parse={skipped_parse} skipped_empty={skipped_short}", flush=True)
 PYEOF
 
-echo "$CUR" > "$OFFSET_FILE"
-echo "[$(date +%H:%M:%S)] ingest done · offset → $CUR" | tee -a "$LOG"
+# Advance offset by what we actually processed
+NEW_OFFSET=$(( PREV + TAKE ))
+echo "$NEW_OFFSET" > "$OFFSET_FILE"
+echo "[$(date +%H:%M:%S)] ingest batch done · offset → $NEW_OFFSET (remaining: $((CUR - NEW_OFFSET)))" | tee -a "$LOG"
 
-# Print quick stats
+# Quick stats
 TOTAL=$(sqlite3 "$INDEX" "SELECT COUNT(*) FROM pairs" 2>/dev/null)
+TOTAL=${TOTAL:-0}
 BY_ROLE=$(sqlite3 "$INDEX" "SELECT role, COUNT(*) FROM pairs GROUP BY role ORDER BY 2 DESC LIMIT 5" 2>/dev/null)
 echo " total indexed: $TOTAL" | tee -a "$LOG"
-echo " top roles:" | tee -a "$LOG"
-echo "$BY_ROLE" | sed 's/^/ /' | tee -a "$LOG"
+[[ -n "$BY_ROLE" ]] && {
+  echo " top roles:" | tee -a "$LOG"
  echo "$BY_ROLE" | sed 's/^/ /' | tee -a "$LOG"
+}
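
If 'total indexed: 0' ever shows up again, a quick diagnostic is to count how many queued pairs the old 50-char filter would have dropped versus the new non-empty filter (a sketch; the training-pairs path matches the script, the counting logic itself is ad hoc):

python3 - "$HOME/.surrogate/training-pairs.jsonl" <<'PY'
import json, sys
old_ok = new_ok = total = 0
for line in open(sys.argv[1], encoding="utf-8", errors="replace"):
    try:
        d = json.loads(line)
    except Exception:
        continue
    p, r = (d.get("prompt") or ""), (d.get("response") or "")
    total += 1
    old_ok += len(p) >= 50 and len(r) >= 50   # old strict filter
    new_ok += bool(p) and bool(r)             # new relaxed filter
print(f"total={total} pass_old={old_ok} pass_new={new_ok}")
PY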