File size: 2,904 Bytes
f7e1070
 
 
 
 
 
 
 
 
 
e36381e
f7e1070
ecd4593
 
 
 
 
 
 
 
 
 
 
 
f7e1070
 
 
 
 
 
 
 
 
 
e40f7ec
ecd4593
 
e40f7ec
f7e1070
 
 
 
 
 
 
 
e36381e
f7e1070
 
 
 
 
 
 
 
 
ecd4593
 
 
 
f7e1070
 
 
 
ecd4593
 
 
 
f7e1070
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env bash
# Continuous domain-scrape loop — runs until taxonomy exhausted or duration hit.
# Uses rotating tokens from pool. Respects 60 search/min combined (1 call/2s avg).
#
# Usage:
#   domain-scrape-loop.sh              # default: 15 min, 3 parallel scrapers
#   domain-scrape-loop.sh 1800         # 30 min
#   domain-scrape-loop.sh 1800 5       # 30 min, 5 parallel scrapers
# Abort on use of an unset variable. Command failures are tolerated on purpose
# (no -e): the counters below fall back to 0 when a file or query is unavailable.
set -u

# Positional arguments:
#   $1  DUR       total run time in seconds        (default 900 = 15 min)
#   $2  PARALLEL  scraper instances per iteration  (default 3)
DUR="${1:-900}"
PARALLEL="${2:-3}"

# Accept only plain non-negative integers. A value such as "15m" would make the
# numeric time-limit test in the main loop error out instead of ever becoming
# true, so the loop would not stop on its own.
for _arg in "$DUR" "$PARALLEL"; do
    if [[ ! "$_arg" =~ ^[0-9]+$ ]]; then
        echo "usage: ${0##*/} [duration_seconds] [parallel]  (non-negative integers; got '$_arg')" >&2
        exit 2
    fi
done
unset _arg
# Force base 10 so a leading zero ("010") is not interpreted as octal later on.
DUR=$((10#$DUR))
PARALLEL=$((10#$PARALLEL))

LOG="$HOME/.surrogate/logs/domain-scrape-loop.log"
START=$(date +%s)

# Everything below appends to $LOG. Without its directory the '2>>' redirect on
# the ledger-init call fails (so the init is not run at all) and every
# 'tee -a' reports an error, hence create it up front.
mkdir -p -- "${LOG%/*}" || echo "warning: cannot create log directory ${LOG%/*}" >&2

# Initialize ledger if missing (creates the 'scraped' table).
# "Missing" means: no DB file, or the probe query against 'scraped' fails.
LEDGER_DB="$HOME/.surrogate/state/scrape-ledger.db"
if [[ ! -f "$LEDGER_DB" ]] || ! sqlite3 "$LEDGER_DB" "SELECT 1 FROM scraped LIMIT 1" >/dev/null 2>&1; then
    bash "$HOME/.surrogate/bin/scrape-ledger-init.sh" 2>>"$LOG"
fi

# Baseline counters for the "+N" deltas printed later.
# Default 0 if query fails (was causing empty arithmetic in iter logs)
BEFORE_PAIRS=$(wc -l "$HOME/.surrogate/training-pairs.jsonl" 2>/dev/null | awk '{print $1}')
BEFORE_PAIRS=${BEFORE_PAIRS:-0}
BEFORE_LEDGER=$(sqlite3 "$LEDGER_DB" "SELECT COUNT(*) FROM scraped" 2>/dev/null)
BEFORE_LEDGER=${BEFORE_LEDGER:-0}

echo "═══ LOOP START $(date +%H:%M:%S) duration=${DUR}s parallel=$PARALLEL" | tee -a "$LOG"
echo "   before: pairs=$BEFORE_PAIRS ledger_repos=$BEFORE_LEDGER" | tee -a "$LOG"

# Main loop: each pass launches $PARALLEL scraper processes, waits for all of
# them, then sleeps 10s. The only exit condition in this loop is the time
# limit: it stops once more than $DUR seconds have passed since $START.
# The limit is checked at the top of a pass, so a pass that has already
# started always runs to completion.
ITER=0
while true; do
    NOW=$(date +%s)
    (( NOW - START > DUR )) && break
    ITER=$((ITER + 1))

    # Health check — pause only on real saturation (load>50 on HF Space CPU)
    # Takes the first load-average figure from `uptime` (the sed pattern accepts
    # both "load average:" and "load averages:") and truncates it to an integer.
    LOAD=$(uptime | sed -E 's/.*load average[s]?:[[:space:]]*//' | awk -F',' '{print int($1)}')
    LOAD=${LOAD:-0}
    if (( LOAD > 50 )); then
        echo "  [iter=$ITER] load=$LOAD pause 30s" | tee -a "$LOG"
        sleep 30
        continue
    fi

    # Fire N parallel instances, each picks different domain via ledger
    # (arithmetic loop: no `seq` fork, no unquoted expansion).
    for (( i = 0; i < PARALLEL; i++ )); do
        "$HOME/.surrogate/bin/github-domain-scrape.sh" >> "$LOG" 2>&1 &
    done
    wait  # wait all parallel to finish (30-60s typical)

    # Pause 10s to let rate limit breathe
    sleep 10

    # Progress every 5 iters
    if (( ITER % 5 == 0 )); then
        PAIRS=$(wc -l "$HOME/.surrogate/training-pairs.jsonl" 2>/dev/null | awk '{print $1}')
        PAIRS=${PAIRS:-0}
        LEDGER=$(sqlite3 "$LEDGER_DB" "SELECT COUNT(*) FROM scraped" 2>/dev/null)
        LEDGER=${LEDGER:-0}
        # Re-read the clock: NOW was taken before the scrape and the sleep, so
        # using it here would under-report the elapsed time.
        ELAPSED=$(( $(date +%s) - START ))
        echo "  [iter=$ITER ${ELAPSED}s] pairs=$PAIRS (+$((PAIRS - BEFORE_PAIRS))) ledger=$LEDGER (+$((LEDGER - BEFORE_LEDGER)))" | tee -a "$LOG"
    fi
done

# Final tally: re-read both counters (falling back to 0 when the file or the
# query yields nothing) and report what this run added over the baseline.
AFTER_PAIRS=$(wc -l "$HOME/.surrogate/training-pairs.jsonl" 2>/dev/null | awk '{print $1}')
: "${AFTER_PAIRS:=0}"
AFTER_LEDGER=$(sqlite3 "$LEDGER_DB" "SELECT COUNT(*) FROM scraped" 2>/dev/null)
: "${AFTER_LEDGER:=0}"

# Build the report lines first, then send each to both stdout and the log.
SUMMARY_LINES=(
    "═══ LOOP DONE $(date +%H:%M:%S)"
    "   iters: $ITER"
    "   pairs added:  $((AFTER_PAIRS - BEFORE_PAIRS))"
    "   ledger added: $((AFTER_LEDGER - BEFORE_LEDGER)) repos"
)
for SUMMARY_LINE in "${SUMMARY_LINES[@]}"; do
    echo "$SUMMARY_LINE" | tee -a "$LOG"
done