# GitHub domain-systematic scraper v2: token-rotating, full-taxonomy, ledger-driven.
# Fixes from v1: proper error logging; extraction logic copied from the working bulk-train script.
set -u
set -a; source "$HOME/.hermes/.env" 2>/dev/null; set +a

LEDGER="$HOME/.surrogate/state/scrape-ledger.db"
LOG="$HOME/.surrogate/logs/github-domain-scrape.log"
DATE=$(date +%Y-%m-%d)
OUT="$HOME/axentx/surrogate/data/training-jsonl/github-domain-${DATE}.jsonl"
mkdir -p "$(dirname "$LOG")" "$(dirname "$OUT")"
[[ ! -f "$LEDGER" ]] && bash "$HOME/.surrogate/bin/scrape-ledger-init.sh"

# Optional positional argument "domain/subdomain" pins the run to one taxonomy slot.
TARGET="${1:-}"

export LEDGER OUT GITHUB_TOKEN GITHUB_TOKEN_POOL TARGET
python3 <<'PYEOF' 2>&1 | tee -a "$LOG"
import os, json, urllib.request, urllib.parse, re, time, base64, sqlite3, random
from datetime import datetime
from pathlib import Path

LEDGER = os.environ['LEDGER']
OUT = Path(os.environ['OUT'])
POOL = [t.strip() for t in os.environ.get('GITHUB_TOKEN_POOL', '').split(',') if t.strip()]
if not POOL:
    POOL = [os.environ.get('GITHUB_TOKEN', '')]
TARGET = os.environ.get('TARGET', '')
print(f"[{datetime.now().strftime('%H:%M:%S')}] token pool: {len(POOL)} tokens")

_tok_idx = [0]  # mutable cell so next_token() can advance the cursor

def next_token():
    """Round-robin token rotation."""
    t = POOL[_tok_idx[0] % len(POOL)]
    _tok_idx[0] += 1
    return t
def gh_req(url, timeout=15, retry_on_403=True):
    """GET a GitHub API URL, rotating through the token pool on 403s."""
    for attempt in range(len(POOL)):  # try each token at most once
        t = next_token()
        req = urllib.request.Request(url, headers={
            'Accept': 'application/vnd.github+json',
            'Authorization': f'Bearer {t}',
        })
        try:
            return urllib.request.urlopen(req, timeout=timeout)
        except urllib.error.HTTPError as e:
            # 403 is how GitHub signals rate limiting; rotate to the next token.
            if e.code == 403 and retry_on_403 and attempt < len(POOL) - 1:
                print(f" [403 tok#{_tok_idx[0] % len(POOL)}] retrying next token")
                continue
            raise
    raise RuntimeError("all tokens exhausted")
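
# Rate-limit note (from GitHub's documentation, not this script): authenticated
# callers get roughly 5,000 core REST requests/hour per token, and the search
# endpoint is limited separately at about 30 requests/minute per token, so an
# N-token pool multiplies both budgets.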
conn = sqlite3.connect(LEDGER, timeout=30)
conn.execute('PRAGMA journal_mode=WAL')    # WAL lets concurrent runs share the ledger
conn.execute('PRAGMA busy_timeout=15000')  # wait up to 15s on a locked database
cur = conn.cursor()
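
# Assumed ledger schema, reconstructed from the queries below (the authoritative
# DDL lives in scrape-ledger-init.sh):
#   domain_taxonomy(domain, subdomain, search_keywords, target_repos, priority)
#   scraped(source, identifier, domain, subdomain, language, stars,
#           scraped_at, pairs_written, status)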
# Pick a domain slot: explicit TARGET if given, else priority order, least-scraped first.
if TARGET and '/' in TARGET:
    domain, sub = TARGET.split('/', 1)
    cur.execute("SELECT domain, subdomain, search_keywords, target_repos FROM domain_taxonomy WHERE domain=? AND subdomain=?", (domain, sub))
else:
    cur.execute("""
        SELECT t.domain, t.subdomain, t.search_keywords, t.target_repos
        FROM domain_taxonomy t
        LEFT JOIN (SELECT domain, subdomain, COUNT(*) AS n FROM scraped GROUP BY domain, subdomain) s
          ON t.domain = s.domain AND t.subdomain = s.subdomain
        WHERE COALESCE(s.n, 0) < t.target_repos
        ORDER BY t.priority ASC, COALESCE(s.n, 0) ASC, RANDOM()
        LIMIT 1
    """)
row = cur.fetchone()
if not row:
    print("[done] no domain needs scraping"); raise SystemExit
domain, sub, keywords, target = row
cur.execute("SELECT COUNT(*) FROM scraped WHERE domain=? AND subdomain=?", (domain, sub))
already = cur.fetchone()[0]
remaining = max(0, target - already)
print(f"[slot] {domain}/{sub} already={already} target={target} remaining={remaining}")
if remaining <= 0:
    raise SystemExit
# PII scrub: replace the author's personal and employer names in scraped text.
SCRUB = [
    (r'\bAshira[^\s@]*\b', 'user'), (r'\bอชิระ[^\s]*\b', 'user'),
    (r'\bPitchaya\w*\b', ''), (r'\bTHINKBIT\w*\b', 'company'),
    (r'\bCIMB\b', 'company'), (r'\bSwiftlet\b', 'company'), (r'\bKMITL\b', 'school'),
]

def scrub(t):
    if not t:
        return t
    for p, r in SCRUB:
        t = re.sub(p, r, t, flags=re.IGNORECASE)
    return t
def write_pair(instr, resp, tag, lang, path):
    # Skip degenerate pairs: trivially short instructions or responses.
    if len(instr) < 20 or len(resp) < 100:
        return 0
    pair = {
        'instruction': scrub(instr[:500]),
        'response': scrub(resp[:3500]),
        'source': f'github-domain:{domain}/{sub}',
        'domain': domain, 'subdomain': sub,
        'language': lang, 'path': path[:200], 'repo': tag,
        'timestamp': datetime.utcnow().isoformat(),
    }
    with open(OUT, 'a') as f:
        f.write(json.dumps(pair, ensure_ascii=False) + '\n')
    return 1
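
# Illustrative output record (hypothetical values, real field shape):
# {"instruction": "อธิบาย repository 'owner/repo' ...", "response": "...",
#  "source": "github-domain:devops/kubernetes", "domain": "devops",
#  "subdomain": "kubernetes", "language": "Go", "path": "README.md",
#  "repo": "owner/repo", "timestamp": "2025-01-01T00:00:00"}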
# Build up to 3 distinct searches from random 2-keyword combos.
kw_list = keywords.split()
searches = set()
for _ in range(4):
    if len(kw_list) >= 2:
        searches.add(' '.join(random.sample(kw_list, 2)))
    elif kw_list:
        searches.add(kw_list[0])
searches = list(searches)[:3]

total_pairs = 0
repos_done = 0
budget = min(remaining, 50)  # up to 50 repos per run (bumped from 12; the 3-token pool handles it)
for kw in searches:
    if repos_done >= budget:
        break
    # DEEP scan: pages 1-5 = top 150 repos by stars, instead of just the top 30.
    for page in range(1, 6):
        if repos_done >= budget:
            break
        # Qualifiers: >30 stars, pushed since 2024, sorted by stars descending.
        q = urllib.parse.quote(f"{kw} stars:>30 pushed:>2024-01-01")
        try:
            with gh_req(f'https://api.github.com/search/repositories?q={q}&sort=stars&order=desc&per_page=30&page={page}') as r:
                d = json.load(r)
        except Exception as e:
            print(f" [search err '{kw}' p{page}] {e}")
            break
        items = d.get('items', [])
        if not items:
            break
        print(f" [search '{kw}' p{page}] {len(items)} items")
        for repo in items:
            if repos_done >= budget:
                break
            full = repo['full_name']
            stars = repo.get('stargazers_count', 0)
            desc = repo.get('description') or ''
            lang = repo.get('language') or ''
            # Ledger dedup: skip anything scraped in a previous run or search.
            cur.execute("SELECT 1 FROM scraped WHERE source=? AND identifier=?", ('github', full))
            if cur.fetchone():
                continue
            pairs_this = 0
            readme_ok = False
            # README: one explain-the-repo pair plus up to 2 fenced-code-block pairs.
            try:
                with gh_req(f'https://api.github.com/repos/{full}/readme') as r:
                    rd = json.load(r)
                raw = rd.get('content', '') or ''
                if raw:
                    readme = base64.b64decode(raw).decode('utf-8', errors='replace')
                    if len(readme) > 200:
                        readme_ok = True
                        # Thai instruction: "Explain repository '<name>' ..."
                        pairs_this += write_pair(
                            f"อธิบาย repository '{full}' ({stars}⭐, {domain}/{sub}). Description: {desc[:150]}",
                            readme[:3000], full, lang, 'README.md'
                        )
                        blocks = re.findall(r'```(\w*)\n(.*?)```', readme, re.DOTALL)
                        for blang, bcode in blocks[:2]:
                            if len(bcode) > 80:
                                pairs_this += write_pair(
                                    f"Show a minimal working example from {full} ({domain}/{sub})",
                                    bcode[:1500], full, blang or lang, 'README.md:code'
                                )
            except urllib.error.HTTPError as e:
                print(f" [readme err {full}] {e.code}")
            except Exception as e:
                print(f" [readme err {full}] {type(e).__name__}: {str(e)[:60]}")
            # Top source files: explain-this-file pairs (tests/vendored/minified excluded).
            try:
                with gh_req(f'https://api.github.com/repos/{full}/git/trees/HEAD?recursive=1') as r:
                    tree = json.load(r)
                files = tree.get('tree', [])
                code_files = [f for f in files
                              if f.get('type') == 'blob'
                              and 200 < f.get('size', 0) < 40000
                              and f['path'].endswith(('.py', '.go', '.rs', '.ts', '.tsx', '.js', '.yaml', '.yml', '.tf', '.hcl', '.sh', '.md', '.java', '.kt'))
                              and not any(sk in f['path'].lower() for sk in ['test_', '_test.', 'spec.', 'tests/', 'vendor/', 'node_modules', '.lock', '.min.js'])][:5]
                for f_ent in code_files:
                    path = f_ent['path']
                    try:
                        url = f'https://raw.githubusercontent.com/{full}/HEAD/{path}'
                        with urllib.request.urlopen(url, timeout=10) as r2:
                            content = r2.read().decode('utf-8', errors='replace')[:8000]
                        if len(content) > 200:
                            ext = path.split('.')[-1] if '.' in path else ''
                            pairs_this += write_pair(
                                f"Explain '{path}' from {full} ({domain}/{sub}).",
                                content, full, ext, path
                            )
                    except Exception:
                        continue
            except Exception as e:
                print(f" [tree err {full}] {type(e).__name__}")
            # Ledger write: record the repo even when pairs=0 so it isn't retried immediately.
            try:
                cur.execute(
                    "INSERT OR IGNORE INTO scraped (source, identifier, domain, subdomain, language, stars, scraped_at, pairs_written, status) VALUES (?,?,?,?,?,?,?,?,?)",
                    ('github', full, domain, sub, lang, stars, datetime.utcnow().isoformat(), pairs_this, 'ok' if pairs_this else 'empty')
                )
                conn.commit()
            except Exception as e:
                print(f" [ledger err] {e}")
            total_pairs += pairs_this
            repos_done += 1
            if pairs_this:
                print(f" ✓ p{page} {full} ({stars}⭐) → {pairs_this} pairs")
            else:
                print(f" ∅ p{page} {full} ({stars}⭐) → 0 pairs (readme_ok={readme_ok})")
            time.sleep(0.1)
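
# Each run consumes at most `budget` repos from one taxonomy slot; because every
# repo lands in the ledger, repeated invocations simply continue the sweep.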
conn.close()
print(f"[done] {domain}/{sub}: {repos_done} repos, {total_pairs} pairs")
PYEOF
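
# Usage (assumed invocation; the filename here is illustrative):
#   bash github-domain-scrape.sh                      # auto-pick the neediest slot
#   bash github-domain-scrape.sh devops/kubernetes    # pin a specific domain/subdomain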