surrogate-1 / bin /domain-scrape-loop.sh
Ashira Pitchayapakayakul
fix: scrape pause threshold load>10 \u2192 load>50 (consistent with auto-orchestrate)
e40f7ec
#!/usr/bin/env bash
# Continuous domain-scrape loop β€” runs until taxonomy exhausted or duration hit
# Uses rotating tokens from pool. Respects 60 search/min combined (1 call/2s avg).
#
# Usage:
# domain-scrape-loop.sh # default 15 min
# domain-scrape-loop.sh 1800 # 30 min
set -u
DUR="${1:-900}"
PARALLEL="${2:-3}"
LOG="$HOME/.surrogate/logs/domain-scrape-loop.log"
START=$(date +%s)
# Initialize ledger if missing (creates the 'scraped' table)
LEDGER_DB="$HOME/.surrogate/state/scrape-ledger.db"
if [[ ! -f "$LEDGER_DB" ]] || ! sqlite3 "$LEDGER_DB" "SELECT 1 FROM scraped LIMIT 1" >/dev/null 2>&1; then
bash "$HOME/.surrogate/bin/scrape-ledger-init.sh" 2>>"$LOG"
fi
# Default 0 if query fails (was causing empty arithmetic in iter logs)
BEFORE_PAIRS=$(wc -l "$HOME/.surrogate/training-pairs.jsonl" 2>/dev/null | awk '{print $1}')
BEFORE_PAIRS=${BEFORE_PAIRS:-0}
BEFORE_LEDGER=$(sqlite3 "$LEDGER_DB" "SELECT COUNT(*) FROM scraped" 2>/dev/null)
BEFORE_LEDGER=${BEFORE_LEDGER:-0}
echo "═══ LOOP START $(date +%H:%M:%S) duration=${DUR}s parallel=$PARALLEL" | tee -a "$LOG"
echo " before: pairs=$BEFORE_PAIRS ledger_repos=$BEFORE_LEDGER" | tee -a "$LOG"
ITER=0
while true; do
NOW=$(date +%s)
[[ $((NOW - START)) -gt $DUR ]] && break
ITER=$((ITER + 1))
# Health check β€” pause only on real saturation (load>50 on HF Space CPU)
LOAD=$(uptime | sed -E 's/.*load average[s]?:[[:space:]]*//' | awk -F',' '{print int($1)}')
LOAD=${LOAD:-0}
if [[ $LOAD -gt 50 ]]; then
echo " [iter=$ITER] load=$LOAD pause 30s" | tee -a "$LOG"
sleep 30
continue
fi
# Fire N parallel instances, each picks different domain via ledger
for i in $(seq 1 $PARALLEL); do
(
~/.surrogate/bin/github-domain-scrape.sh >> "$LOG" 2>&1
) &
done
wait # wait all parallel to finish (30-60s typical)
# Pause 10s to let rate limit breathe
sleep 10
# Progress every 5 iters
if (( ITER % 5 == 0 )); then
PAIRS=$(wc -l "$HOME/.surrogate/training-pairs.jsonl" 2>/dev/null | awk '{print $1}')
PAIRS=${PAIRS:-0}
LEDGER=$(sqlite3 "$LEDGER_DB" "SELECT COUNT(*) FROM scraped" 2>/dev/null)
LEDGER=${LEDGER:-0}
echo " [iter=$ITER $((NOW - START))s] pairs=$PAIRS (+$((PAIRS - BEFORE_PAIRS))) ledger=$LEDGER (+$((LEDGER - BEFORE_LEDGER)))" | tee -a "$LOG"
fi
done
AFTER_PAIRS=$(wc -l "$HOME/.surrogate/training-pairs.jsonl" 2>/dev/null | awk '{print $1}')
AFTER_PAIRS=${AFTER_PAIRS:-0}
AFTER_LEDGER=$(sqlite3 "$LEDGER_DB" "SELECT COUNT(*) FROM scraped" 2>/dev/null)
AFTER_LEDGER=${AFTER_LEDGER:-0}
echo "═══ LOOP DONE $(date +%H:%M:%S)" | tee -a "$LOG"
echo " iters: $ITER" | tee -a "$LOG"
echo " pairs added: $((AFTER_PAIRS - BEFORE_PAIRS))" | tee -a "$LOG"
echo " ledger added: $((AFTER_LEDGER - BEFORE_LEDGER)) repos" | tee -a "$LOG"