"""Opt-in live URL checker for submission README links."""

from __future__ import annotations

import argparse
import json
from pathlib import Path
import re
import ssl
from urllib.error import HTTPError, URLError
from urllib.parse import urlparse
from urllib.request import Request, urlopen
|
|
|
# Repository root: assumes this script lives one directory below the repo
# root (e.g. scripts/check_links.py) — TODO confirm against actual layout.
ROOT = Path(__file__).resolve().parents[1]
# Matches http(s) URLs; the character class stops at whitespace and at the
# markdown/HTML closers ")", ">", "]" so links inside markup do not swallow
# trailing syntax.
URL_RE = re.compile(r"https?://[^\s)>\]]+")
# Substrings identifying "story" links (blog post / video); the checker
# requires at least one such link to be present and reachable.
STORY_PATTERNS = ("huggingface.co/blog/", "youtube.com/", "youtu.be/")
# Hosts that indicate local/dev URLs; these are reported as skipped and
# never probed.
LOCAL_HOSTS = {"127.0.0.1", "localhost", "::1"}


# Prefer certifi's CA bundle when it is installed (more up to date than the
# OS trust store on some platforms); otherwise fall back to the system
# default context. Broad except is deliberate best-effort here.
try:
    import certifi

    SSL_CONTEXT = ssl.create_default_context(cafile=certifi.where())
except Exception:
    SSL_CONTEXT = ssl.create_default_context()
|
|
|
|
| def _clean_url(url: str) -> str: |
| return url.rstrip(".,;:!\"'") |
|
|
|
|
def _read_urls(readme: Path) -> list[str]:
    """Return cleaned, de-duplicated URLs from *readme* in first-seen order."""
    text = readme.read_text(encoding="utf-8")
    seen: dict[str, None] = {}
    for match in URL_RE.finditer(text):
        # dict insertion order preserves the first occurrence of each URL.
        seen.setdefault(_clean_url(match.group(0)), None)
    return list(seen)
|
|
|
|
def _should_skip(url: str) -> bool:
    """True when *url* targets a local/dev host that must not be probed."""
    host = urlparse(url).hostname
    return host in LOCAL_HOSTS
|
|
|
|
def _probe(url: str, timeout: float) -> dict[str, object]:
    """Check *url* with HEAD, falling back to GET when HEAD is rejected.

    Returns a result dict with keys ``url``, ``method``, ``status``, ``ok``
    and, on failure, ``error``. ``status`` is 0 when no HTTP response was
    received at all (DNS failure, connection error, timeout).
    """
    headers = {"User-Agent": "polyguard-submission-link-check/1.0"}
    for method in ("HEAD", "GET"):
        request = Request(url, headers=headers, method=method)
        try:
            with urlopen(request, timeout=timeout, context=SSL_CONTEXT) as response:
                return {"url": url, "method": method, "status": int(response.status), "ok": response.status < 400}
        except HTTPError as exc:
            # Some servers refuse HEAD (403/405) or don't implement it (501)
            # while serving GET just fine; retry with GET before failing.
            if method == "HEAD" and exc.code in {403, 405, 501}:
                exc.close()  # HTTPError wraps an open response; release it before retrying
                continue
            result = {"url": url, "method": method, "status": int(exc.code), "ok": False, "error": str(exc)}
            exc.close()
            return result
        except URLError as exc:
            # Connection-level failure (DNS, refused, TLS, ...): HEAD gets a
            # second chance via GET in case the failure was method-specific.
            if method == "HEAD":
                continue
            return {"url": url, "method": method, "status": 0, "ok": False, "error": str(exc.reason)}
        except TimeoutError:
            if method == "HEAD":
                continue
            return {"url": url, "method": method, "status": 0, "ok": False, "error": "timeout"}
    return {"url": url, "method": "GET", "status": 0, "ok": False, "error": "unreachable"}
|
|
|
|
def main() -> None:
    """CLI entry point: probe README links and print a JSON report.

    Exits with status 1 when any checked link is broken, and with an error
    message when no story (YouTube / Hugging Face blog) URL is found among
    the checked results.
    """
    parser = argparse.ArgumentParser(description="Check live README submission links.")
    parser.add_argument("--readme", default=str(ROOT / "README.md"))
    parser.add_argument("--timeout", type=float, default=12.0)
    parser.add_argument(
        "--story-only",
        action="store_true",
        help="Only check YouTube/Hugging Face blog story links.",
    )
    args = parser.parse_args()

    readme = Path(args.readme)
    # Partition in a single pass so _should_skip runs once per URL.
    skipped: list[str] = []
    urls: list[str] = []
    for url in _read_urls(readme):
        (skipped if _should_skip(url) else urls).append(url)
    if args.story_only:
        urls = [url for url in urls if any(pattern in url for pattern in STORY_PATTERNS)]

    results = [_probe(url, timeout=args.timeout) for url in urls]
    broken = [item for item in results if not item.get("ok")]
    # "story" results are full result dicts, not bare URLs.
    story_urls = [item for item in results if any(pattern in str(item.get("url", "")) for pattern in STORY_PATTERNS)]
    story_broken = [item for item in story_urls if not item.get("ok")]

    payload = {
        "readme": str(readme),
        "checked": len(results),
        "skipped_local_or_dev_urls": skipped,
        "broken": broken,
        "story_urls": story_urls,
        "story_broken": story_broken,
        "ok": not broken and bool(story_urls),
    }
    print(json.dumps(payload, ensure_ascii=True, indent=2))

    if broken:
        raise SystemExit(1)
    if not story_urls:
        raise SystemExit("No YouTube or Hugging Face blog story URL found in README.")
|
|
|
|
# Script entry point; importing this module performs no network activity.
if __name__ == "__main__":
    main()
|
|