#!/usr/bin/env bash
# Agentic crawler — URL frontier with visited stamps + link discovery (BFS).
# Runs continuously: pop URL → fetch → extract links → score → push back to frontier.
# Stamps every visited URL in SQLite so we never revisit. Persists across restarts.
#
# Seeds (re-injected nightly): GitHub trending, arxiv recent, HF trending, MoC pages.
# Filtering: only follow links matching domain allowlist + minimum relevance.
# Output: raw page records appended to ~/.surrogate/state/agentic-crawl-raw.jsonl;
# training pairs (page → summary) are NOT written by this script.
# Treat unset variables as errors and make a pipeline fail if any stage fails.
# errexit is not enabled, so a failing command does not abort the script.
set -u
set -o pipefail

# Auto-export every variable assigned while the env file is sourced.
# Errors from sourcing (including a missing file) are discarded.
set -a
source "$HOME/.hermes/.env" 2>/dev/null
set +a

# Persistent state locations.
DB="$HOME/.surrogate/state/agentic-frontier.db"
LOG="$HOME/.surrogate/logs/agentic-crawler.log"
PAIRS="$HOME/.surrogate/training-pairs.jsonl"

# Make sure the parent directory of each state file exists.
mkdir -p "${DB%/*}" "${LOG%/*}" "${PAIRS%/*}"
# ── Schema ──────────────────────────────────────────────────────────────────
# Every statement uses IF NOT EXISTS, so this runs safely on each start.
sqlite3 "$DB" <<'SQL'
-- One row per URL that has been processed (fetched, skipped or failed).
CREATE TABLE IF NOT EXISTS visited (
    url        TEXT PRIMARY KEY,
    fetched_ts INTEGER NOT NULL,   -- unix seconds
    status     INTEGER NOT NULL,   -- -2 host not allowlisted, -1 fetch failed, else HTTP status
    title      TEXT,
    domain     TEXT,
    depth      INTEGER DEFAULT 0,
    bytes      INTEGER DEFAULT 0
);

-- URLs waiting to be fetched; highest score is popped first.
CREATE TABLE IF NOT EXISTS frontier (
    url      TEXT PRIMARY KEY,
    score    REAL NOT NULL,
    depth    INTEGER NOT NULL,
    parent   TEXT,                 -- page the link was found on (NULL for seeds)
    added_ts INTEGER NOT NULL      -- unix seconds
);

CREATE INDEX IF NOT EXISTS idx_frontier_score ON frontier(score DESC, added_ts);
CREATE INDEX IF NOT EXISTS idx_visited_domain ON visited(domain);
SQL
# ── Seed if empty ───────────────────────────────────────────────────────────
# When fewer than five URLs are queued, insert the built-in seed list.
# INSERT OR IGNORE leaves already-queued URLs untouched.
COUNT=$(sqlite3 "$DB" "SELECT COUNT(*) FROM frontier;")
if (( COUNT < 5 )); then
    echo "[$(date +%H:%M:%S)] seeding frontier" | tee -a "$LOG"
    python3 - "$DB" <<'PYEOF'
import sqlite3
import sys
import time

# (url, score, depth) — every seed starts at depth 0.
SEEDS = (
    # AI agent / coding
    ("https://github.com/trending?since=daily", 1.0, 0),
    ("https://github.com/trending/python?since=daily", 0.9, 0),
    ("https://github.com/trending/typescript?since=daily", 0.9, 0),
    ("https://github.com/trending/rust?since=daily", 0.85, 0),
    ("https://github.com/trending/go?since=daily", 0.85, 0),
    ("https://huggingface.co/models?sort=trending", 0.95, 0),
    ("https://huggingface.co/datasets?sort=trending", 0.85, 0),
    ("https://arxiv.org/list/cs.AI/recent", 0.95, 0),
    ("https://arxiv.org/list/cs.SE/recent", 0.9, 0),
    ("https://arxiv.org/list/cs.CR/recent", 0.85, 0),
    ("https://news.ycombinator.com/", 0.8, 0),
    ("https://lobste.rs/", 0.75, 0),
    # DevSecOps / SRE / cloud
    ("https://aws.amazon.com/blogs/devops/", 0.7, 0),
    ("https://cloud.google.com/blog/products/devops-sre", 0.7, 0),
    ("https://kubernetes.io/blog/", 0.7, 0),
    ("https://www.cncf.io/blog/", 0.7, 0),
    # Awesome lists (rich link sources)
    ("https://github.com/sindresorhus/awesome", 0.9, 0),
    ("https://github.com/stevenjoezhang/awesome-llm-agents", 0.95, 0),
    ("https://github.com/e2b-dev/awesome-ai-agents", 0.95, 0),
    ("https://github.com/Hannibal046/Awesome-LLM", 0.9, 0),
    ("https://github.com/punkpeye/awesome-mcp-servers", 0.95, 0),
)

db_path = sys.argv[1]
conn = sqlite3.connect(db_path)
stamp = int(time.time())
rows = [(seed_url, seed_score, seed_depth, stamp)
        for seed_url, seed_score, seed_depth in SEEDS]
conn.executemany(
    "INSERT OR IGNORE INTO frontier(url,score,depth,parent,added_ts) VALUES (?,?,?,NULL,?)",
    rows,
)
conn.commit()
print(f" seeded {len(SEEDS)} URLs")
PYEOF
fi
# ── Worker: fetch one URL, extract links, score, push back to frontier ──────
# Usage: fetch_one URL DEPTH
#   URL    absolute http(s) URL taken from the frontier
#   DEPTH  integer link distance from a seed (seeds are 0)
# Prints one status line ([skip-visited], [skip-domain], [skip-non-html],
# [fail] or [ok <status>]); the worker's stderr is merged into stdout.
# Side effects: stamps URL in `visited`, queues discovered links in `frontier`,
# and appends the page text to ~/.surrogate/state/agentic-crawl-raw.jsonl.
# visited.status: -2 = host not allowlisted, -1 = fetch raised, else HTTP status.
fetch_one() {
    local url="$1" depth="$2"
    # Only the values the worker reads are passed. Secrets such as HF_TOKEN are
    # intentionally kept off the command line, where `ps` would expose them.
    python3 - "$url" "$depth" "$DB" <<'PYEOF' 2>&1
import json
import os
import re
import sqlite3
import sys
import time
import urllib.parse
import urllib.request

MAX_DEPTH = 4
ALLOW = {"github.com","huggingface.co","arxiv.org","news.ycombinator.com","lobste.rs",
         "aws.amazon.com","cloud.google.com","azure.microsoft.com","kubernetes.io","cncf.io",
         "anthropic.com","openai.com","mistral.ai","meta.com","ai.google.dev",
         "datadog.com","newrelic.com","dynatrace.com","grafana.com","prometheus.io",
         "redhat.com","docker.com","hashicorp.com","github.io","medium.com",
         "dev.to","substack.com","blogspot.com"}
KEYWORDS_HIGH = ("agent","llm","rag","mcp","claude","gpt","coder","devops","sre","kubernetes","terraform")
KEYWORDS_MID = ("ai","ml","cloud","devsec","security","python","typescript","go","rust","blog","paper")

url, depth, db = sys.argv[1], int(sys.argv[2]), sys.argv[3]

# Several workers and the sqlite3 CLI write to this file concurrently, so
# wait for a busy lock instead of failing quickly.
con = sqlite3.connect(db, timeout=30)


def is_allowed(host):
    """Return True when host is an allowlisted domain or a subdomain of one."""
    return host in ALLOW or any(host.endswith("." + a) for a in ALLOW)


def record_visit(status, title, size):
    """Insert or replace the visited row for this URL (caller commits)."""
    con.execute("INSERT OR REPLACE INTO visited VALUES (?,?,?,?,?,?,?)",
                (url, int(time.time()), status, title, domain, depth, size))


# Skip if already visited
if con.execute("SELECT 1 FROM visited WHERE url=?", (url,)).fetchone():
    print(f" [skip-visited] {url[:80]}")
    sys.exit(0)

domain = urllib.parse.urlparse(url).netloc
if not is_allowed(domain):
    record_visit(-2, None, 0)
    con.commit()
    print(f" [skip-domain] {domain}")
    sys.exit(0)

# Fetch. Any failure here (DNS, TLS, HTTP error, timeout, ...) is deliberately
# treated as a best-effort miss and stamped with status -1. Only the network
# call lives in the try body, so database errors are not recorded as fetch
# failures.
try:
    req = urllib.request.Request(url, headers={
        "User-Agent": "Mozilla/5.0 Surrogate-1/agentic-crawler",
        "Accept": "text/html,application/xhtml+xml"})
    with urllib.request.urlopen(req, timeout=20) as r:
        # Read at most 2,000,000 bytes; undecodable bytes are dropped.
        body = r.read(2_000_000).decode("utf-8", errors="ignore")
        status = r.status
        ctype = (r.headers.get("Content-Type") or "").lower()
except Exception as e:
    record_visit(-1, None, 0)
    con.commit()
    print(f" [fail] {url[:80]} :: {type(e).__name__}")
    sys.exit(0)

# Skip non-HTML responses (DNS records, raw zone files, etc. were crashing parser)
if "html" not in ctype and "<html" not in body[:1000].lower():
    record_visit(status, "", len(body))
    con.commit()
    print(f" [skip-non-html] {ctype[:30]} {url[:80]}")
    sys.exit(0)

# Title
m = re.search(r"<title[^>]*>([^<]+)</title>", body, re.IGNORECASE)
title = (m.group(1) if m else "").strip()[:200]
record_visit(status, title, len(body))

# Extract links + score. Fragments and query strings are cut off by the regex.
links = re.findall(r'href=["\'](https?://[^"\'#?\s<>]+)', body, re.IGNORECASE)
child_depth = depth + 1
added = 0
# Pages already at max depth contribute no links, so skip the loop entirely.
if child_depth <= MAX_DEPTH:
    seen_set = set()
    for link in links:
        if link in seen_set:
            continue
        seen_set.add(link)
        if len(link) > 500:
            continue
        try:
            ldomain = urllib.parse.urlparse(link).netloc
        except ValueError:
            # Malformed authority (e.g. an unbalanced "[" in the host): drop
            # this link instead of aborting the whole page before commit.
            continue
        if not ldomain:
            continue
        if con.execute("SELECT 1 FROM visited WHERE url=?", (link,)).fetchone():
            continue
        if con.execute("SELECT 1 FROM frontier WHERE url=?", (link,)).fetchone():
            continue
        # Score: domain relevance + keyword bonus + depth penalty
        score = 0.5
        low = link.lower()
        if any(k in low for k in KEYWORDS_HIGH):
            score += 0.3
        elif any(k in low for k in KEYWORDS_MID):
            score += 0.1
        if is_allowed(ldomain):
            score += 0.2
        score -= 0.05 * child_depth
        if score < 0.3:
            continue
        con.execute("INSERT OR IGNORE INTO frontier VALUES (?,?,?,?,?)",
                    (link, score, child_depth, url, int(time.time())))
        added += 1
        if added > 30:
            break
con.commit()
print(f" [ok {status}] {title[:60]} β {url[:60]} (+{added} new links)")

# Save fetched page metadata to a SEPARATE crawl log — NOT to training-pairs.jsonl.
# (Placeholder responses pollute training data; only insert when we have real summary.)
crawl_log = os.path.expanduser("~/.surrogate/state/agentic-crawl-raw.jsonl")
text_only = re.sub(r"<[^>]+>", " ", body)
text_only = re.sub(r"\s+", " ", text_only).strip()[:6000]
if len(text_only) > 200:
    raw_record = {
        "ts": time.time(),
        "source": "agentic-crawler",
        "url": url,
        "title": title,
        "domain": domain,
        "depth": depth,
        "text": text_only,
    }
    # ensure_ascii=False emits raw non-ASCII text, so pin the file encoding
    # rather than relying on the locale default.
    with open(crawl_log, "a", encoding="utf-8") as f:
        f.write(json.dumps(raw_record, ensure_ascii=False) + "\n")
PYEOF
}
# ── Main loop: parallel workers ─────────────────────────────────────────────
# Repeats forever: take the top-scoring batch from the frontier, run fetch_one
# on each URL with at most $PARALLEL workers at a time, log counters, sleep.
# When the frontier is empty, a fixed seed list is re-queued instead.
PARALLEL="${1:-4}"   # first CLI argument: max concurrent workers (default 4)
BATCH_SIZE=20
SQ="'"               # single quote, used to SQL-escape URLs below
echo "[$(date +%H:%M:%S)] crawler start (parallel=$PARALLEL)" | tee -a "$LOG"
while true; do
    # Pop top-scoring URLs from frontier. Depth is emitted FIRST so that a URL
    # which itself contains '|' can still be split unambiguously.
    BATCH=$(sqlite3 "$DB" "SELECT depth||'|'||url FROM frontier ORDER BY score DESC, added_ts ASC LIMIT $BATCH_SIZE;")
    if [[ -z "$BATCH" ]]; then
        echo "[$(date +%H:%M:%S)] frontier empty β re-seeding from awesome lists" >> "$LOG"
        # Re-seed: re-fetch awesome lists to pick up new repos added since last seed.
        # The DB path travels via argv so quotes in the path cannot break the Python source.
        python3 - "$DB" <<'PYEOF'
import sqlite3
import sys
import time

con = sqlite3.connect(sys.argv[1])
# Drop visited stamps for awesome list pages so they get re-fetched
seeds = [
    'https://github.com/trending?since=daily',
    'https://github.com/trending/python?since=daily',
    'https://github.com/trending/typescript?since=daily',
    'https://github.com/sindresorhus/awesome',
    'https://github.com/e2b-dev/awesome-ai-agents',
    'https://github.com/Hannibal046/Awesome-LLM',
    'https://github.com/punkpeye/awesome-mcp-servers',
    'https://github.com/dastergon/awesome-sre',
    'https://huggingface.co/models?sort=trending',
    'https://huggingface.co/datasets?sort=trending',
    'https://arxiv.org/list/cs.AI/recent',
    'https://arxiv.org/list/cs.SE/recent',
    'https://news.ycombinator.com/',
]
for url in seeds:
    con.execute('DELETE FROM visited WHERE url=?', (url,))
    con.execute('INSERT OR IGNORE INTO frontier(url,score,depth,parent,added_ts) VALUES (?,?,?,NULL,?)',
                (url, 0.95, 0, int(time.time())))
con.commit()
print(f' re-seeded {len(seeds)} URLs')
PYEOF
        sleep 30
        continue
    fi

    # Process in parallel
    JOBS=0
    while IFS= read -r ROW; do
        [[ -z "$ROW" ]] && continue
        DEPTH="${ROW%%|*}"   # text before the first '|'
        URL="${ROW#*|}"      # everything after it, further '|' included
        [[ -z "$URL" ]] && continue
        # Remove from frontier before fetching. Single quotes are doubled so
        # the URL cannot terminate the SQL string literal.
        ESC_URL=${URL//$SQ/$SQ$SQ}
        sqlite3 "$DB" "DELETE FROM frontier WHERE url='$ESC_URL';" 2>/dev/null
        # Spawn fetch
        fetch_one "$URL" "$DEPTH" >> "$LOG" 2>&1 &
        JOBS=$((JOBS + 1))
        if [[ $JOBS -ge $PARALLEL ]]; then
            # `wait -n` is non-zero both when unsupported and when the reaped
            # worker failed; the fallback then waits for every job. Recount
            # the running jobs so the counter is correct in both cases.
            wait -n 2>/dev/null || wait
            JOBS=$(( $(jobs -rp | wc -l) ))
        fi
    done <<< "$BATCH"
    wait   # finish remaining

    # Batch statistics
    VISITED=$(sqlite3 "$DB" "SELECT COUNT(*) FROM visited;")
    PENDING=$(sqlite3 "$DB" "SELECT COUNT(*) FROM frontier;")
    echo "[$(date +%H:%M:%S)] batch done Β· visited=$VISITED Β· pending=$PENDING" >> "$LOG"

    # Sleep adaptively: short if frontier full, longer if empty/rate-limit risk
    if [[ $PENDING -gt 100 ]]; then
        sleep 5
    elif [[ $PENDING -gt 20 ]]; then
        sleep 15
    else
        sleep 30
    fi
done
|