Spaces:
Sleeping
Sleeping
Prasham.Jain
feat(branch-b): B2 GitHub Actions log mining — gh-CLI scraper, anonymizer, throttle, cache
a4ff035 | """Best-effort anonymizer for mined GitHub Actions logs. | |
| Replaces project-specific identifiers (usernames, emails, hostnames, paths, | |
| IPv4s, SHAs) with neutral tokens. Long random IDs (full + short SHAs) are | |
| hashed to a stable short token so distinct IDs remain distinguishable in the | |
| output without leaking the original. This is *best-effort* — the README's | |
| Limitations section warns that perfect anonymization is impossible. | |
| Order matters: emails must be replaced before ``@USER`` matches the local | |
| part, and full 40-char SHAs must be replaced before the more permissive | |
| 8-char hex pattern would clobber a substring. | |
| """ | |
| from __future__ import annotations | |
| import hashlib | |
| import re | |
| from re import Match | |
def hash_short(s: str) -> str:
    """Return the first 8 hex characters of the SHA-1 digest of ``s``.

    ``s`` is encoded with ``str.encode()`` defaults (UTF-8) before hashing,
    so equal inputs always map to the same short token.
    """
    digest = hashlib.sha1(s.encode()).hexdigest()
    return digest[:8]
def _sub_full_sha(m: Match[str]) -> str:
    """Build the ``sha-<token>`` replacement for a matched full SHA.

    The token is ``hash_short`` of the entire matched text, so the same
    input SHA always yields the same replacement.
    """
    token = hash_short(m.group())
    return "sha-" + token
def _sub_short_hex(m: Match[str]) -> str:
    """Build the ``hex-<token>`` replacement for a matched short hex run.

    The token is ``hash_short`` of the entire matched text, so the same
    input always yields the same replacement.
    """
    token = hash_short(m.group())
    return "hex-" + token
# Individual redaction rules, compiled once at import time. They are named
# here and assembled into ``_PATTERNS`` below; the order of that table is
# significant because each rule runs on the output of the previous one.

# Email addresses. Listed first so the @-mention rule never sees an
# address's ``@domain`` part.
_EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")

# Full 40-character lowercase hex SHAs; applied ahead of the 8-char hex rule.
_FULL_SHA_RE = re.compile(r"\b[a-f0-9]{40}\b")

# User home directories under ``/home/<name>/`` or ``/Users/<name>/``.
_HOME_DIR_RE = re.compile(r"/(?:home|Users)/[^/\s]+/")

# GitHub @-mentions: ``@`` followed by 1-39 name characters.
_MENTION_RE = re.compile(r"@[A-Za-z0-9][A-Za-z0-9_\-]{0,38}\b")

# Dotted-quad IPv4 addresses; kept ahead of the short-hex rule.
_IPV4_RE = re.compile(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b")

# Stand-alone 8-character lowercase hex tokens. The negative lookbehinds skip
# runs directly preceded by ``sha-`` or ``hex-``, i.e. the 8-hex tail of a
# token already emitted by ``_sub_full_sha`` / ``_sub_short_hex``.
_SHORT_HEX_RE = re.compile(r"(?<!sha-)(?<!hex-)\b[a-f0-9]{8}\b")

# Hostnames that start with a common internal prefix (``corp.`` / ``internal.``).
_INTERNAL_HOST_RE = re.compile(r"\b(?:corp|internal)\.[A-Za-z0-9.\-_]+")

# Ordered rule table: each entry is (pattern, replacement). The replacement is
# either a literal string (plain redaction) or a callable that receives the
# match and returns a hashed token.
_PATTERNS: list[tuple[re.Pattern[str], object]] = [
    (_EMAIL_RE, "EMAIL"),
    (_FULL_SHA_RE, _sub_full_sha),
    (_HOME_DIR_RE, "/PATH/USER/"),
    (_MENTION_RE, "@USER"),
    (_IPV4_RE, "IP"),
    (_SHORT_HEX_RE, _sub_short_hex),
    (_INTERNAL_HOST_RE, "HOST"),
]
def anonymize(text: str) -> str:
    """Return ``text`` with project-specific identifiers replaced by tokens.

    Every ``(pattern, replacement)`` rule in ``_PATTERNS`` is applied in
    table order, each one operating on the output of the previous rule.

    The function is intended to be idempotent on the documented patterns:
    running it over its own output should leave the string unchanged.
    """
    result = text
    for pattern, replacement in _PATTERNS:
        # ``Pattern.sub`` accepts either a literal string or a callable taking
        # the match object, so both kinds of replacement go through one call.
        # The table types replacements as ``object``, hence the ignore.
        result = pattern.sub(replacement, result)  # type: ignore[arg-type]
    return result