#!/usr/bin/env bash # GitHub domain-systematic scraper v2 — token-rotating, full-taxonomy, ledger-driven # Fixes from v1: proper error logging, extraction logic copied from working bulk-train set -u set -a; source "$HOME/.hermes/.env" 2>/dev/null; set +a LEDGER="$HOME/.surrogate/state/scrape-ledger.db" LOG="$HOME/.surrogate/logs/github-domain-scrape.log" DATE=$(date +%Y-%m-%d) OUT="$HOME/axentx/surrogate/data/training-jsonl/github-domain-${DATE}.jsonl" mkdir -p "$(dirname "$LOG")" "$(dirname "$OUT")" [[ ! -f "$LEDGER" ]] && bash "$HOME/.surrogate/bin/scrape-ledger-init.sh" TARGET="${1:-}" export LEDGER OUT GITHUB_TOKEN GITHUB_TOKEN_POOL TARGET python3 <<'PYEOF' 2>&1 | tee -a "$LOG" import os, json, urllib.request, urllib.parse, re, time, base64, sqlite3, random from datetime import datetime from pathlib import Path LEDGER = os.environ['LEDGER'] OUT = Path(os.environ['OUT']) POOL = [t.strip() for t in os.environ.get('GITHUB_TOKEN_POOL','').split(',') if t.strip()] if not POOL: POOL = [os.environ.get('GITHUB_TOKEN','')] TARGET = os.environ.get('TARGET', '') print(f"[{datetime.now().strftime('%H:%M:%S')}] token pool: {len(POOL)} tokens") _tok_idx = [0] def next_token(): """Round-robin token rotation.""" t = POOL[_tok_idx[0] % len(POOL)] _tok_idx[0] += 1 return t def gh_req(url, timeout=15, retry_on_403=True): for attempt in range(len(POOL) + 1): # try each token at most once t = next_token() req = urllib.request.Request(url, headers={ 'Accept':'application/vnd.github+json', 'Authorization': f'Bearer {t}' }) try: return urllib.request.urlopen(req, timeout=timeout) except urllib.error.HTTPError as e: if e.code == 403 and retry_on_403 and attempt < len(POOL): print(f" [403 tok#{_tok_idx[0]%len(POOL)}] retrying next token") continue raise except Exception: raise raise RuntimeError("all tokens exhausted") conn = sqlite3.connect(LEDGER, timeout=30) conn.execute('PRAGMA journal_mode=WAL') conn.execute('PRAGMA busy_timeout=15000') cur = conn.cursor() # Pick domain slot (priority + least scraped) if TARGET and '/' in TARGET: domain, sub = TARGET.split('/', 1) cur.execute("SELECT domain, subdomain, search_keywords, target_repos FROM domain_taxonomy WHERE domain=? AND subdomain=?", (domain, sub)) else: cur.execute(""" SELECT t.domain, t.subdomain, t.search_keywords, t.target_repos FROM domain_taxonomy t LEFT JOIN (SELECT domain, subdomain, COUNT(*) AS n FROM scraped GROUP BY domain, subdomain) s ON t.domain=s.domain AND t.subdomain=s.subdomain WHERE COALESCE(s.n, 0) < t.target_repos ORDER BY t.priority ASC, COALESCE(s.n, 0) ASC, RANDOM() LIMIT 1 """) row = cur.fetchone() if not row: print("[done] no domain needs scraping"); exit() domain, sub, keywords, target = row cur.execute("SELECT COUNT(*) FROM scraped WHERE domain=? AND subdomain=?", (domain, sub)) already = cur.fetchone()[0] remaining = max(0, target - already) print(f"[slot] {domain}/{sub} already={already} target={target} remaining={remaining}") if remaining <= 0: exit() SCRUB = [ (r'\bAshira[^\s@]*\b', 'user'), (r'\bอชิระ[^\s]*\b', 'user'), (r'\bPitchaya\w*\b', ''), (r'\bTHINKBIT\w*\b', 'company'), (r'\bCIMB\b', 'company'), (r'\bSwiftlet\b', 'company'), (r'\bKMITL\b', 'school'), ] def scrub(t): if not t: return t for p, r in SCRUB: t = re.sub(p, r, t, flags=re.IGNORECASE) return t def write_pair(instr, resp, tag, lang, path): if len(instr) < 20 or len(resp) < 100: return 0 pair = { 'instruction': scrub(instr[:500]), 'response': scrub(resp[:3500]), 'source': f'github-domain:{domain}/{sub}', 'domain': domain, 'subdomain': sub, 'language': lang, 'path': path[:200], 'repo': tag, 'timestamp': datetime.utcnow().isoformat(), } with open(OUT, 'a') as f: f.write(json.dumps(pair, ensure_ascii=False) + '\n') return 1 # Do 2 searches with different keyword combos kw_list = keywords.split() searches = set() for _ in range(4): if len(kw_list) >= 2: searches.add(' '.join(random.sample(kw_list, 2))) else: searches.add(kw_list[0]) searches = list(searches)[:3] total_pairs = 0 repos_done = 0 budget = min(remaining, 50) # 30 repos per run (bumped from 12 — 3-token pool handles it) for kw in searches: if repos_done >= budget: break # DEEP scan: pages 1-5 = top 150 by stars, instead of just top 30 for page in range(1, 6): if repos_done >= budget: break q = urllib.parse.quote(f"{kw} stars:>30 pushed:>2024-01-01") try: with gh_req(f'https://api.github.com/search/repositories?q={q}&sort=stars&order=desc&per_page=30&page={page}') as r: d = json.load(r) except Exception as e: print(f" [search err '{kw}' p{page}] {e}"); break items = d.get('items', []) if not items: break print(f" [search '{kw}' p{page}] {len(items)} items") # Inline the inner loop body (indent carefully) for repo in items: if repos_done >= budget: break full = repo['full_name'] stars = repo.get('stargazers_count', 0) desc = repo.get('description') or '' lang = repo.get('language') or '' cur.execute("SELECT 1 FROM scraped WHERE source=? AND identifier=?", ('github', full)) if cur.fetchone(): continue pairs_this = 0 readme_ok = False try: with gh_req(f'https://api.github.com/repos/{full}/readme') as r: rd = json.load(r) raw = rd.get('content','') or '' if raw: readme = base64.b64decode(raw).decode('utf-8', errors='replace') if len(readme) > 200: readme_ok = True pairs_this += write_pair(f"อธิบาย repository '{full}' ({stars}⭐, {domain}/{sub}). Description: {desc[:150]}", readme[:3000], full, lang, 'README.md') blocks = re.findall(r'```(\w*)\n(.*?)```', readme, re.DOTALL) for blang, bcode in blocks[:2]: if len(bcode) > 80: pairs_this += write_pair(f"Show a minimal working example from {full} ({domain}/{sub})", bcode[:1500], full, blang or lang, 'README.md:code') except Exception: pass try: with gh_req(f'https://api.github.com/repos/{full}/git/trees/HEAD?recursive=1') as r: tree = json.load(r) files = tree.get('tree', []) code_files = [f for f in files if f.get('type')=='blob' and 200 200: ext = path.split('.')[-1] if '.' in path else '' pairs_this += write_pair(f"Explain '{path}' from {full} ({domain}/{sub}).", content, full, ext, path) except Exception: continue except Exception: pass try: cur.execute("INSERT OR IGNORE INTO scraped (source, identifier, domain, subdomain, language, stars, scraped_at, pairs_written, status) VALUES (?,?,?,?,?,?,?,?,?)", ('github', full, domain, sub, lang, stars, datetime.utcnow().isoformat(), pairs_this, 'ok' if pairs_this else 'empty')) conn.commit() except Exception as e: print(f" [ledger err] {e}") total_pairs += pairs_this repos_done += 1 if pairs_this: print(f" ✓ p{page} {full} ({stars}⭐) → {pairs_this}") time.sleep(0.1) continue # skip old single-page block below for repo in items: if repos_done >= budget: break full = repo['full_name'] stars = repo.get('stargazers_count', 0) desc = repo.get('description') or '' lang = repo.get('language') or '' # Ledger dedup cur.execute("SELECT 1 FROM scraped WHERE source=? AND identifier=?", ('github', full)) if cur.fetchone(): continue pairs_this = 0 readme_ok = False # README try: with gh_req(f'https://api.github.com/repos/{full}/readme') as r: rd = json.load(r) raw = rd.get('content','') or '' if raw: readme = base64.b64decode(raw).decode('utf-8', errors='replace') if len(readme) > 200: readme_ok = True pairs_this += write_pair( f"อธิบาย repository '{full}' ({stars}⭐, {domain}/{sub}). Description: {desc[:150]}", readme[:3000], full, lang, 'README.md' ) # Code block from README blocks = re.findall(r'```(\w*)\n(.*?)```', readme, re.DOTALL) for blang, bcode in blocks[:2]: if len(bcode) > 80: pairs_this += write_pair( f"Show a minimal working example from {full} ({domain}/{sub})", bcode[:1500], full, blang or lang, 'README.md:code' ) except urllib.error.HTTPError as e: print(f" [readme err {full}] {e.code}") except Exception as e: print(f" [readme err {full}] {type(e).__name__}: {str(e)[:60]}") # Top source files try: with gh_req(f'https://api.github.com/repos/{full}/git/trees/HEAD?recursive=1') as r: tree = json.load(r) files = tree.get('tree', []) code_files = [f for f in files if f.get('type') == 'blob' and 200 < f.get('size', 0) < 40000 and f['path'].endswith(('.py','.go','.rs','.ts','.tsx','.js','.yaml','.yml','.tf','.hcl','.sh','.md','.java','.kt')) and not any(sk in f['path'].lower() for sk in ['test_','_test.','spec.','tests/','vendor/','node_modules','.lock','.min.js'])][:5] for f_ent in code_files: path = f_ent['path'] try: url = f'https://raw.githubusercontent.com/{full}/HEAD/{path}' with urllib.request.urlopen(url, timeout=10) as r2: content = r2.read().decode('utf-8', errors='replace')[:8000] if len(content) > 200: ext = path.split('.')[-1] if '.' in path else '' pairs_this += write_pair( f"Explain '{path}' from {full} ({domain}/{sub}).", content, full, ext, path ) except Exception: continue except Exception as e: print(f" [tree err {full}] {type(e).__name__}") # Ledger write (always, even if pairs=0 so we don't retry immediately) try: cur.execute( "INSERT OR IGNORE INTO scraped (source, identifier, domain, subdomain, language, stars, scraped_at, pairs_written, status) VALUES (?,?,?,?,?,?,?,?,?)", ('github', full, domain, sub, lang, stars, datetime.utcnow().isoformat(), pairs_this, 'ok' if pairs_this else 'empty') ) conn.commit() except Exception as e: print(f" [ledger err] {e}") total_pairs += pairs_this repos_done += 1 if pairs_this: print(f" ✓ {full} ({stars}⭐) → {pairs_this} pairs") else: print(f" ∅ {full} ({stars}⭐) → 0 pairs (readme_ok={readme_ok})") time.sleep(0.1) conn.close() print(f"[done] {domain}/{sub}: {repos_done} repos, {total_pairs} pairs") PYEOF