#!/usr/bin/env python3
"""Opt-in live URL checker for submission README links."""

from __future__ import annotations

import argparse
from http.client import HTTPException
import json
from pathlib import Path
import re
import ssl
from urllib.error import HTTPError, URLError
from urllib.parse import urlparse
from urllib.request import Request, urlopen


ROOT = Path(__file__).resolve().parents[1]
URL_RE = re.compile(r"https?://[^\s)>\]]+")
STORY_PATTERNS = ("huggingface.co/blog/", "youtube.com/", "youtu.be/")
LOCAL_HOSTS = {"127.0.0.1", "localhost", "::1"}

try:
    import certifi

    SSL_CONTEXT = ssl.create_default_context(cafile=certifi.where())
except Exception:  # noqa: BLE001 - fall back to system CA store when certifi is unavailable.
    SSL_CONTEXT = ssl.create_default_context()


def _clean_url(url: str) -> str:
    """Strip trailing punctuation that URL_RE captures but is not part of the link.

    The backtick is included so that links written as Markdown inline code
    (`` `https://example.com` ``) are not probed with a stray trailing character.
    """
    return url.rstrip(".,;:!\"'`")


def _read_urls(readme: Path) -> list[str]:
    """Return the cleaned http(s) URLs found in *readme*, de-duplicated in first-seen order."""
    text = readme.read_text(encoding="utf-8")
    urls = [_clean_url(match.group(0)) for match in URL_RE.finditer(text)]
    # dict.fromkeys keeps insertion order while dropping repeats.
    return list(dict.fromkeys(urls))


def _should_skip(url: str) -> bool:
    """Return True when *url* points at a host listed in LOCAL_HOSTS.

    A URL that ``urlparse`` rejects (for example an unbalanced ``[`` in the
    netloc) is not skipped, so it is probed and reported as broken instead of
    crashing the whole run.
    """
    try:
        hostname = urlparse(url).hostname
    except ValueError:
        return False
    return hostname in LOCAL_HOSTS


def _is_story_url(url: str) -> bool:
    """Return True when *url* contains any of the STORY_PATTERNS substrings."""
    return any(pattern in url for pattern in STORY_PATTERNS)


def _probe(url: str, timeout: float) -> dict[str, object]:
    """Request *url* with HEAD, falling back to GET, and describe the outcome.

    Args:
        url: The link to check.
        timeout: Timeout in seconds passed to ``urlopen`` for each attempt.

    Returns:
        A dict with ``url``, ``method``, ``status`` (0 when no HTTP status was
        obtained) and ``ok``; failures also carry an ``error`` string.

    This function never raises for a bad link: every failure, including
    malformed or non-ASCII URLs and low-level connection errors, is reported
    through the returned dict so one bad link cannot abort the whole check.
    """
    headers = {"User-Agent": "polyguard-submission-link-check/1.0"}
    for method in ("HEAD", "GET"):
        try:
            # Request() itself can raise ValueError for malformed URLs, so it
            # is built inside the try block.
            request = Request(url, headers=headers, method=method)
            with urlopen(request, timeout=timeout, context=SSL_CONTEXT) as response:  # noqa: S310 - explicit live submission link checker.
                return {"url": url, "method": method, "status": int(response.status), "ok": response.status < 400}
        except HTTPError as exc:
            # 403/405 on HEAD are retried with GET; any other HTTP error is final.
            if method == "HEAD" and exc.code in {403, 405}:
                continue
            return {"url": url, "method": method, "status": int(exc.code), "ok": False, "error": str(exc)}
        except URLError as exc:
            if method == "HEAD":
                continue
            return {"url": url, "method": method, "status": 0, "ok": False, "error": str(exc.reason)}
        except TimeoutError:
            if method == "HEAD":
                continue
            return {"url": url, "method": method, "status": 0, "ok": False, "error": "timeout"}
        except (OSError, HTTPException, ValueError) as exc:
            # Errors urllib does not wrap in URLError: connection resets while
            # reading the response, socket.timeout on Python < 3.10, protocol
            # errors from http.client, and ValueError/UnicodeEncodeError for
            # URLs that cannot be sent as-is.
            if method == "HEAD":
                continue
            return {
                "url": url,
                "method": method,
                "status": 0,
                "ok": False,
                "error": str(exc) or type(exc).__name__,
            }
    # Defensive fallback; every GET branch above already returns.
    return {"url": url, "method": "GET", "status": 0, "ok": False, "error": "unreachable"}


def main() -> None:
    """Check the README links and print a JSON report to stdout.

    Raises:
        SystemExit: with status 1 when any checked link is broken, or with an
            explanatory message when the README cannot be read or when no
            story (YouTube / Hugging Face blog) URL was checked.
    """
    parser = argparse.ArgumentParser(description="Check live README submission links.")
    parser.add_argument("--readme", default=str(ROOT / "README.md"))
    parser.add_argument("--timeout", type=float, default=12.0)
    parser.add_argument(
        "--story-only",
        action="store_true",
        help="Only check YouTube/Hugging Face blog story links.",
    )
    args = parser.parse_args()

    readme = Path(args.readme)
    try:
        all_urls = _read_urls(readme)
    except (OSError, UnicodeDecodeError) as exc:
        # Exit with a readable message instead of a traceback.
        raise SystemExit(f"Cannot read README {readme}: {exc}") from exc

    # Single pass: each URL is classified once as skipped or to-be-checked.
    skipped: list[str] = []
    urls: list[str] = []
    for url in all_urls:
        (skipped if _should_skip(url) else urls).append(url)
    if args.story_only:
        urls = [url for url in urls if _is_story_url(url)]

    results = [_probe(url, timeout=args.timeout) for url in urls]
    broken = [item for item in results if not item.get("ok")]
    story_urls = [item for item in results if _is_story_url(str(item.get("url", "")))]
    story_broken = [item for item in story_urls if not item.get("ok")]
    payload = {
        "readme": str(readme),
        "checked": len(results),
        "skipped_local_or_dev_urls": skipped,
        "broken": broken,
        "story_urls": story_urls,
        "story_broken": story_broken,
        "ok": not broken and bool(story_urls),
    }
    print(json.dumps(payload, ensure_ascii=True, indent=2))
    if broken:
        raise SystemExit(1)
    if not story_urls:
        raise SystemExit("No YouTube or Hugging Face blog story URL found in README.")


if __name__ == "__main__":
    main()