#!/usr/bin/env python3
"""Opt-in live URL checker for submission README links."""
from __future__ import annotations
import argparse
import json
from pathlib import Path
import re
import ssl
from urllib.error import HTTPError, URLError
from urllib.parse import urlparse
from urllib.request import Request, urlopen
# Repository root: this script is expected to live one directory below it.
ROOT = Path(__file__).resolve().parents[1]
# Greedy http(s) URL matcher; stops at whitespace and common closing delimiters.
URL_RE = re.compile(r"https?://[^\s)>\]]+")
# Substrings identifying "story" links (HF blog post or YouTube video).
STORY_PATTERNS = ("huggingface.co/blog/", "youtube.com/", "youtu.be/")
# Hostnames of local/dev URLs that are never probed.
LOCAL_HOSTS = {"127.0.0.1", "localhost", "::1"}
def _build_ssl_context() -> ssl.SSLContext:
    """Build the TLS context used for all probes.

    Prefers certifi's CA bundle when available; any failure (missing package,
    unreadable bundle) falls back to the system CA store.
    """
    try:
        import certifi

        return ssl.create_default_context(cafile=certifi.where())
    except Exception:  # noqa: BLE001 - fall back to system CA store when certifi is unavailable.
        return ssl.create_default_context()


SSL_CONTEXT = _build_ssl_context()
def _clean_url(url: str) -> str:
return url.rstrip(".,;:!\"'")
def _read_urls(readme: Path) -> list[str]:
    """Return the unique cleaned URLs found in *readme*, in first-seen order."""
    seen: dict[str, None] = {}
    for match in URL_RE.finditer(readme.read_text(encoding="utf-8")):
        # dict insertion order gives stable, first-occurrence deduplication.
        seen.setdefault(_clean_url(match.group(0)), None)
    return list(seen)
def _should_skip(url: str) -> bool:
    """True when *url* points at a local/dev host that should not be probed."""
    return urlparse(url).hostname in LOCAL_HOSTS
def _probe(url: str, timeout: float) -> dict[str, object]:
    """Probe *url* with HEAD, falling back to GET when HEAD is not accepted.

    Returns a result dict with keys ``url``, ``method``, ``status``, ``ok`` and,
    on failure, ``error``. A ``status`` of 0 means no HTTP response was obtained
    (network error or timeout).
    """
    headers = {"User-Agent": "polyguard-submission-link-check/1.0"}
    # Servers that reject HEAD commonly answer 403, 405 (Method Not Allowed),
    # or 501 (Not Implemented); all three are worth retrying with GET before
    # declaring the link broken.
    head_retry_codes = {403, 405, 501}
    for method in ("HEAD", "GET"):
        request = Request(url, headers=headers, method=method)
        try:
            with urlopen(request, timeout=timeout, context=SSL_CONTEXT) as response:  # noqa: S310 - explicit live submission link checker.
                return {"url": url, "method": method, "status": int(response.status), "ok": response.status < 400}
        except HTTPError as exc:
            if method == "HEAD" and exc.code in head_retry_codes:
                continue
            return {"url": url, "method": method, "status": int(exc.code), "ok": False, "error": str(exc)}
        except URLError as exc:
            # Connection-level failure (DNS, refused, TLS, connect timeout).
            if method == "HEAD":
                continue
            return {"url": url, "method": method, "status": 0, "ok": False, "error": str(exc.reason)}
        except TimeoutError:
            # Read timeout raised directly by the socket layer.
            if method == "HEAD":
                continue
            return {"url": url, "method": method, "status": 0, "ok": False, "error": "timeout"}
    # Defensive: the GET iteration always returns above; kept so the function
    # has a total return even if the loop body changes.
    return {"url": url, "method": "GET", "status": 0, "ok": False, "error": "unreachable"}
def main() -> None:
    """CLI entry point: live-check README links and print a JSON report.

    Exit status: 0 when every checked link is healthy and at least one story
    link (HF blog / YouTube) was found; non-zero otherwise. With --story-only,
    only story links are probed.
    """
    parser = argparse.ArgumentParser(description="Check live README submission links.")
    parser.add_argument("--readme", default=str(ROOT / "README.md"))
    parser.add_argument("--timeout", type=float, default=12.0)
    parser.add_argument(
        "--story-only",
        action="store_true",
        help="Only check YouTube/Hugging Face blog story links.",
    )
    args = parser.parse_args()
    readme = Path(args.readme)
    # Partition into skipped/checkable in one pass so _should_skip (and its
    # urlparse call) runs once per URL instead of twice.
    skipped: list[str] = []
    urls: list[str] = []
    for url in _read_urls(readme):
        (skipped if _should_skip(url) else urls).append(url)
    if args.story_only:
        urls = [url for url in urls if any(pattern in url for pattern in STORY_PATTERNS)]
    results = [_probe(url, timeout=args.timeout) for url in urls]
    broken = [item for item in results if not item.get("ok")]
    story_urls = [item for item in results if any(pattern in str(item.get("url", "")) for pattern in STORY_PATTERNS)]
    story_broken = [item for item in story_urls if not item.get("ok")]
    payload = {
        "readme": str(readme),
        "checked": len(results),
        "skipped_local_or_dev_urls": skipped,
        "broken": broken,
        "story_urls": story_urls,
        "story_broken": story_broken,
        "ok": not broken and bool(story_urls),
    }
    print(json.dumps(payload, ensure_ascii=True, indent=2))
    if broken:
        raise SystemExit(1)
    if not story_urls:
        # SystemExit with a string prints the message and exits with status 1.
        raise SystemExit("No YouTube or Hugging Face blog story URL found in README.")
# Run the checker only when executed as a script, so the module can be
# imported without triggering any network calls.
if __name__ == "__main__":
    main()