Ashira Pitchayapakayakul committed on
Commit 39c61d0 · 1 Parent(s): 84a1ec7

fix: 4 chain bugs found via /selftest + chain audit

VERIFIED via curl /selftest = ok:true (env clean)
But chain audit found 4 runtime bugs:

BUG 1: dedup.db CORRUPTION
16 parallel shards writing to SQLite → 'database disk image is malformed'
All bulk-ingest shards failing on stats() lookup
FIX: dedup.py auto-recovery
- Add busy_timeout=30000 (30s wait on lock)
- Add wal_autocheckpoint=1000
- On corruption: rename to .corrupt-{ts}.bak + retry init (auto-rebuild)
- Smoke-test with SELECT 1 before declaring connection healthy
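
A quick way to confirm the rebuilt DB is healthy after this lands (sketch only; the
path below is an assumption, use the real DB_PATH from dedup.py):

    # sketch: check the rebuilt dedup.db by hand (path is assumed, not from dedup.py)
    import sqlite3
    con = sqlite3.connect("data/dedup.db", timeout=30)
    con.execute("PRAGMA busy_timeout=30000")
    print(con.execute("PRAGMA integrity_check").fetchone()[0])            # expect 'ok'
    print(con.execute("SELECT COUNT(*) FROM seen_hashes").fetchone()[0])  # 0 if rebuilt from scratch
    con.close()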

BUG 2: agentic-crawler ValueError on non-HTML
Hit DNS records / zone files → ValueError on int parsing
FIX: agentic-crawler.sh
- Read the Content-Type header; skip if not HTML
- Stamp visited with status, exit cleanly
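
The gate boils down to this predicate (condensed sketch; the real change is in
bin/agentic-crawler.sh below, and the helper name here is illustrative):

    # sketch of the check applied per response in the diff below
    def looks_like_html(ctype: str | None, body: str) -> bool:
        ctype = (ctype or "").lower()
        return "html" in ctype or "<html" in body[:1000].lower()
    # if it returns False: stamp visited with the real status and sys.exit(0)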

BUG 3: github-agentic-crawler 'all tokens exhausted' loops
Was: sleep max(60, soonest_reset - now), capped at 600s
But all 4 tokens hit 403, sleep 60s, hit 403 again, repeat
FIX: TokenPool.wait_for_any_reset()
- Compute earliest reset_at among exhausted tokens
- Sleep until earliest + 5s buffer
- Cap 1 hour (was 10 min)
- Now: hit limit → sleep until GitHub resets → resume cleanly
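
Worked example of the new pacing (numbers are invented):

    # sketch: suppose the four tokens reset 12/25/25/40 minutes from now
    resets = [12 * 60, 25 * 60, 25 * 60, 40 * 60]  # seconds until each reset_at
    wait = min(min(resets) + 5, 3600)              # earliest reset + 5s buffer, 1h cap
    print(wait)  # 725 -> one long sleep, then resume; no more 60s retry loop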

BUG 4: parquet-direct heredoc bash escaping (visible in log)
Not crashing, but printing garbled output. Skip for now; the dataset-server API
is returning 'no shards' for fineweb anyway. Defer cleanup.

EXPECTED after rebuild:
- dedup.db rebuilds clean (corrupt backup saved)
- 16 shards finally drain datasets without sqlite errors
- GH crawler self-paces around rate limits
- Web crawler skips non-HTML cleanly

Run /selftest after rebuild to confirm all green again.

bin/agentic-crawler.sh CHANGED
@@ -114,6 +114,14 @@ try:
     with urllib.request.urlopen(req, timeout=20) as r:
         body = r.read(2_000_000).decode("utf-8", errors="ignore")
         status = r.status
+        ctype = (r.headers.get("Content-Type") or "").lower()
+    # Skip non-HTML responses (DNS records, raw zone files, etc. were crashing parser)
+    if "html" not in ctype and "<html" not in body[:1000].lower():
+        con.execute("INSERT OR REPLACE INTO visited VALUES (?,?,?,?,?,?,?)",
+                    (url, int(time.time()), status, "", domain, depth, len(body)))
+        con.commit()
+        print(f" [skip-non-html] {ctype[:30]} {url[:80]}")
+        sys.exit(0)
 except Exception as e:
     con.execute("INSERT OR REPLACE INTO visited VALUES (?,?,?,?,?,?,?)",
                 (url, int(time.time()), -1, None, domain, depth, 0))
bin/github-agentic-crawler.py CHANGED
@@ -197,7 +197,6 @@ class TokenPool:
         """Pick token with most remaining quota; if all exhausted, return None."""
         with self.lock:
             now = time.time()
-            # Reset expired counters
             for s in self.states:
                 if s.reset_at and now > s.reset_at:
                     s.remaining = 5000
@@ -205,13 +204,22 @@ class TokenPool:
             ready = [s for s in self.states if s.remaining > 50]
             if not ready:
                 return None
-            # Round-robin among ready, weighted by remaining
             ready.sort(key=lambda s: (-s.remaining, s.last_used))
             picked = ready[0]
             picked.last_used = now
-            picked.remaining -= 1  # optimistic; refined from response headers
+            picked.remaining -= 1
             return picked

+    def wait_for_any_reset(self) -> int:
+        """Sleep until earliest token reset (instead of 60s naive sleep)."""
+        with self.lock:
+            now = time.time()
+            resets = [s.reset_at - int(now) for s in self.states if s.reset_at and s.reset_at > now]
+            if not resets:
+                return 60
+            wait = min(resets) + 5  # buffer
+            return min(wait, 3600)  # cap 1h
+
     def update_from_headers(self, state: TokenState, headers: dict) -> None:
         with self.lock:
             try:
@@ -233,10 +241,9 @@ def gh_get(url: str, pool: TokenPool, retries: int = 2) -> tuple[dict | list | N
     for attempt in range(retries + 1):
         state = pool.acquire()
         if state is None:
-            soonest = pool.soonest_reset()
-            wait = max(60, int(soonest - time.time()))
-            log(f" all tokens exhausted, sleeping {wait}s until reset")
-            time.sleep(min(wait, 600))
+            wait = pool.wait_for_any_reset()
+            log(f" all tokens exhausted — sleep {wait}s until earliest reset")
+            time.sleep(wait)
             continue
         req = urllib.request.Request(url, headers={
             "Accept": "application/vnd.github+json",
bin/lib/dedup.py CHANGED
@@ -33,19 +33,44 @@ class DedupStore:
     def _connection(cls) -> sqlite3.Connection:
         if cls._conn is None:
             DB_PATH.parent.mkdir(parents=True, exist_ok=True)
-            c = sqlite3.connect(str(DB_PATH), check_same_thread=False, timeout=10)
-            c.execute("PRAGMA journal_mode=WAL")
-            c.execute("PRAGMA synchronous=NORMAL")
-            c.executescript("""
-                CREATE TABLE IF NOT EXISTS seen_hashes (
-                    hash TEXT PRIMARY KEY,
-                    source TEXT NOT NULL,
-                    ts INTEGER NOT NULL
-                );
-                CREATE INDEX IF NOT EXISTS idx_seen_source ON seen_hashes(source);
-                CREATE INDEX IF NOT EXISTS idx_seen_ts ON seen_hashes(ts);
-            """)
-            cls._conn = c
+            # Auto-recover from corruption (16 parallel shards can corrupt SQLite)
+            for attempt in range(3):
+                try:
+                    c = sqlite3.connect(str(DB_PATH), check_same_thread=False,
+                                        timeout=30, isolation_level=None)
+                    c.execute("PRAGMA journal_mode=WAL")
+                    c.execute("PRAGMA synchronous=NORMAL")
+                    c.execute("PRAGMA busy_timeout=30000")  # 30s wait on lock
+                    c.execute("PRAGMA wal_autocheckpoint=1000")
+                    c.executescript("""
+                        CREATE TABLE IF NOT EXISTS seen_hashes (
+                            hash TEXT PRIMARY KEY,
+                            source TEXT NOT NULL,
+                            ts INTEGER NOT NULL
+                        );
+                        CREATE INDEX IF NOT EXISTS idx_seen_source ON seen_hashes(source);
+                        CREATE INDEX IF NOT EXISTS idx_seen_ts ON seen_hashes(ts);
+                    """)
+                    # Smoke-test the table
+                    c.execute("SELECT 1 FROM seen_hashes LIMIT 1").fetchall()
+                    cls._conn = c
+                    break
+                except sqlite3.DatabaseError as e:
+                    if "malformed" in str(e).lower() or "corrupt" in str(e).lower():
+                        # Backup + reset corrupted DB
+                        import time as _t
+                        backup = DB_PATH.with_suffix(f".corrupt-{int(_t.time())}.bak")
+                        try:
+                            DB_PATH.rename(backup)
+                            for ext in ("-wal", "-shm"):
+                                p = DB_PATH.with_suffix(DB_PATH.suffix + ext)
+                                if p.exists():
+                                    p.unlink()
+                        except Exception:
+                            pass
+                        if attempt < 2:
+                            continue
+                    raise
         return cls._conn

     @classmethod