#!/usr/bin/env bash
# GitHub domain-systematic scraper v2 — token-rotating, full-taxonomy, ledger-driven
# Fixes from v1: proper error logging, extraction logic copied from working bulk-train
set -u
set -a; source "$HOME/.hermes/.env" 2>/dev/null; set +a
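# The sourced .env is expected to provide GITHUB_TOKEN and/or GITHUB_TOKEN_POOL
# (comma-separated PATs); the Python below falls back to the single token when no pool is set.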
LEDGER="$HOME/.surrogate/state/scrape-ledger.db"
LOG="$HOME/.surrogate/logs/github-domain-scrape.log"
DATE=$(date +%Y-%m-%d)
OUT="$HOME/axentx/surrogate/data/training-jsonl/github-domain-${DATE}.jsonl"
mkdir -p "$(dirname "$LOG")" "$(dirname "$OUT")"
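# First run bootstraps the ledger via scrape-ledger-init.sh (assumed to create the
# domain_taxonomy and scraped tables queried below).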
[[ ! -f "$LEDGER" ]] && bash "$HOME/.surrogate/bin/scrape-ledger-init.sh"
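# Optional arg pins the run to one taxonomy slot, passed as "domain/subdomain";
# otherwise the highest-priority, least-filled slot is picked from the ledger.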
TARGET="${1:-}"
export LEDGER OUT GITHUB_TOKEN GITHUB_TOKEN_POOL TARGET
python3 <<'PYEOF' 2>&1 | tee -a "$LOG"
import os, json, urllib.request, urllib.parse, re, time, base64, sqlite3, random
from datetime import datetime
from pathlib import Path
LEDGER = os.environ['LEDGER']
OUT = Path(os.environ['OUT'])
POOL = [t.strip() for t in os.environ.get('GITHUB_TOKEN_POOL','').split(',') if t.strip()]
if not POOL: POOL = [os.environ.get('GITHUB_TOKEN','')]
TARGET = os.environ.get('TARGET', '')
print(f"[{datetime.now().strftime('%H:%M:%S')}] token pool: {len(POOL)} tokens")
_tok_idx = [0]
def next_token():
    """Round-robin token rotation."""
    t = POOL[_tok_idx[0] % len(POOL)]
    _tok_idx[0] += 1
    return t
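# The rotation index lives in a one-element list so next_token() can mutate it without
# a `global`; consecutive requests therefore go out on different tokens in the pool.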
def gh_req(url, timeout=15, retry_on_403=True):
    # Try each token in the pool at most once; rotate to the next token on 403.
    for attempt in range(len(POOL)):
        t = next_token()
        req = urllib.request.Request(url, headers={
            'Accept': 'application/vnd.github+json',
            'Authorization': f'Bearer {t}'
        })
        try:
            return urllib.request.urlopen(req, timeout=timeout)
        except urllib.error.HTTPError as e:
            if e.code == 403 and retry_on_403 and attempt < len(POOL) - 1:
                print(f"    [403 tok#{_tok_idx[0] % len(POOL)}] retrying next token")
                continue
            raise
    raise RuntimeError("all tokens exhausted")
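# Only HTTPError 403 triggers token rotation; every other failure (404, timeout,
# network error) propagates to the caller, which decides whether to log, skip, or stop.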
conn = sqlite3.connect(LEDGER, timeout=30)
conn.execute('PRAGMA journal_mode=WAL')
conn.execute('PRAGMA busy_timeout=15000')
cur = conn.cursor()
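# Ledger schema assumed here (created by scrape-ledger-init.sh; definition not shown):
#   domain_taxonomy(domain, subdomain, search_keywords, target_repos, priority)
#   scraped(source, identifier, domain, subdomain, language, stars, scraped_at, pairs_written, status)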
# Pick domain slot (priority + least scraped)
if TARGET and '/' in TARGET:
    domain, sub = TARGET.split('/', 1)
    cur.execute("SELECT domain, subdomain, search_keywords, target_repos FROM domain_taxonomy WHERE domain=? AND subdomain=?", (domain, sub))
else:
    cur.execute("""
        SELECT t.domain, t.subdomain, t.search_keywords, t.target_repos
        FROM domain_taxonomy t
        LEFT JOIN (SELECT domain, subdomain, COUNT(*) AS n FROM scraped GROUP BY domain, subdomain) s
          ON t.domain=s.domain AND t.subdomain=s.subdomain
        WHERE COALESCE(s.n, 0) < t.target_repos
        ORDER BY t.priority ASC, COALESCE(s.n, 0) ASC, RANDOM()
        LIMIT 1
    """)
row = cur.fetchone()
if not row:
    print("[done] no domain needs scraping"); exit()
domain, sub, keywords, target = row
cur.execute("SELECT COUNT(*) FROM scraped WHERE domain=? AND subdomain=?", (domain, sub))
already = cur.fetchone()[0]
remaining = max(0, target - already)
print(f"[slot] {domain}/{sub} already={already} target={target} remaining={remaining}")
if remaining <= 0: exit()
SCRUB = [
    (r'\bAshira[^\s@]*\b', 'user'), (r'\bอชิระ[^\s]*\b', 'user'),
    (r'\bPitchaya\w*\b', ''), (r'\bTHINKBIT\w*\b', 'company'),
    (r'\bCIMB\b', 'company'), (r'\bSwiftlet\b', 'company'), (r'\bKMITL\b', 'school'),
]
def scrub(t):
    if not t: return t
    for p, r in SCRUB: t = re.sub(p, r, t, flags=re.IGNORECASE)
    return t
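# scrub() strips personal/employer identifiers from generated text so names from the
# maintainer's own environment do not leak into the training pairs.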
def write_pair(instr, resp, tag, lang, path):
    if len(instr) < 20 or len(resp) < 100: return 0
    pair = {
        'instruction': scrub(instr[:500]),
        'response': scrub(resp[:3500]),
        'source': f'github-domain:{domain}/{sub}',
        'domain': domain, 'subdomain': sub,
        'language': lang, 'path': path[:200], 'repo': tag,
        'timestamp': datetime.utcnow().isoformat(),
    }
    with open(OUT, 'a') as f:
        f.write(json.dumps(pair, ensure_ascii=False) + '\n')
    return 1
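# Each accepted pair becomes one JSONL line in OUT, roughly:
#   {"instruction": "...", "response": "...", "source": "github-domain:<domain>/<sub>",
#    "domain": ..., "subdomain": ..., "language": ..., "path": ..., "repo": ..., "timestamp": ...}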
# Build up to 3 searches from random two-keyword combos
kw_list = keywords.split()
searches = set()
for _ in range(4):
    if len(kw_list) >= 2: searches.add(' '.join(random.sample(kw_list, 2)))
    else: searches.add(kw_list[0])
searches = list(searches)[:3]
total_pairs = 0
repos_done = 0
budget = min(remaining, 50)  # per-run repo cap (bumped from 12; the multi-token pool absorbs the extra API calls)
for kw in searches:
    if repos_done >= budget: break
    # DEEP scan: pages 1-5 = top 150 by stars, instead of just the top 30
    for page in range(1, 6):
        if repos_done >= budget: break
        q = urllib.parse.quote(f"{kw} stars:>30 pushed:>2024-01-01")
        try:
            with gh_req(f'https://api.github.com/search/repositories?q={q}&sort=stars&order=desc&per_page=30&page={page}') as r:
                d = json.load(r)
        except Exception as e:
            print(f"  [search err '{kw}' p{page}] {e}"); break
        items = d.get('items', [])
        if not items: break
        print(f"  [search '{kw}' p{page}] {len(items)} items")
        for repo in items:
            if repos_done >= budget: break
            full = repo['full_name']
            stars = repo.get('stargazers_count', 0)
            desc = repo.get('description') or ''
            lang = repo.get('language') or ''
            # Ledger dedup: skip anything scraped on a previous run
            cur.execute("SELECT 1 FROM scraped WHERE source=? AND identifier=?", ('github', full))
            if cur.fetchone(): continue
            pairs_this = 0
            readme_ok = False
            # README: one overview pair plus up to 2 code-block pairs
            try:
                with gh_req(f'https://api.github.com/repos/{full}/readme') as r:
                    rd = json.load(r)
                raw = rd.get('content', '') or ''
                if raw:
                    readme = base64.b64decode(raw).decode('utf-8', errors='replace')
                    if len(readme) > 200:
                        readme_ok = True
                        # Instruction prompt is intentionally in Thai: "Explain repository '<full>' ..."
                        pairs_this += write_pair(
                            f"อธิบาย repository '{full}' ({stars}⭐, {domain}/{sub}). Description: {desc[:150]}",
                            readme[:3000], full, lang, 'README.md'
                        )
                        blocks = re.findall(r'```(\w*)\n(.*?)```', readme, re.DOTALL)
                        for blang, bcode in blocks[:2]:
                            if len(bcode) > 80:
                                pairs_this += write_pair(
                                    f"Show a minimal working example from {full} ({domain}/{sub})",
                                    bcode[:1500], full, blang or lang, 'README.md:code'
                                )
            except urllib.error.HTTPError as e:
                print(f"    [readme err {full}] {e.code}")
            except Exception as e:
                print(f"    [readme err {full}] {type(e).__name__}: {str(e)[:60]}")
            # Top source files: up to 5 mid-sized, non-test, non-vendored blobs
            try:
                with gh_req(f'https://api.github.com/repos/{full}/git/trees/HEAD?recursive=1') as r:
                    tree = json.load(r)
                files = tree.get('tree', [])
                code_files = [f for f in files
                              if f.get('type') == 'blob'
                              and 200 < f.get('size', 0) < 40000
                              and f['path'].endswith(('.py','.go','.rs','.ts','.tsx','.js','.yaml','.yml','.tf','.hcl','.sh','.md','.java','.kt'))
                              and not any(sk in f['path'].lower() for sk in ['test_','_test.','spec.','tests/','vendor/','node_modules','.lock','.min.js'])][:5]
                for f_ent in code_files:
                    path = f_ent['path']
                    try:
                        with urllib.request.urlopen(f'https://raw.githubusercontent.com/{full}/HEAD/{path}', timeout=10) as r2:
                            content = r2.read().decode('utf-8', errors='replace')[:8000]
                        if len(content) > 200:
                            ext = path.split('.')[-1] if '.' in path else ''
                            pairs_this += write_pair(f"Explain '{path}' from {full} ({domain}/{sub}).", content, full, ext, path)
                    except Exception: continue
            except Exception as e:
                print(f"    [tree err {full}] {type(e).__name__}")
            # Ledger write (always, even if pairs=0, so we don't retry the repo immediately)
            try:
                cur.execute(
                    "INSERT OR IGNORE INTO scraped (source, identifier, domain, subdomain, language, stars, scraped_at, pairs_written, status) VALUES (?,?,?,?,?,?,?,?,?)",
                    ('github', full, domain, sub, lang, stars, datetime.utcnow().isoformat(), pairs_this, 'ok' if pairs_this else 'empty')
                )
                conn.commit()
            except Exception as e:
                print(f"    [ledger err] {e}")
            total_pairs += pairs_this
            repos_done += 1
            if pairs_this:
                print(f"  ✓ p{page} {full} ({stars}⭐) → {pairs_this} pairs")
            else:
                print(f"  ∅ p{page} {full} ({stars}⭐) → 0 pairs (readme_ok={readme_ok})")
            time.sleep(0.1)
conn.close()
print(f"[done] {domain}/{sub}: {repos_done} repos, {total_pairs} pairs")
PYEOF