File size: 2,648 Bytes
a4ff035
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
"""Best-effort anonymizer for mined GitHub Actions logs.

Replaces project-specific identifiers (usernames, emails, hostnames, paths,
IPv4s, SHAs) with neutral tokens. Long random IDs (full + short SHAs) are
hashed to a stable short token so distinct IDs remain distinguishable in the
output without leaking the original. This is *best-effort* — the README's
Limitations section warns that perfect anonymization is impossible.

Order matters: emails must be replaced before ``@USER`` matches the local
part, and full 40-char SHAs must be replaced before the more permissive
8-char hex pattern would clobber a substring.
"""

from __future__ import annotations

import hashlib
import re
from re import Match


def hash_short(s: str) -> str:
    return hashlib.sha1(s.encode()).hexdigest()[:8]


def _sub_full_sha(m: Match[str]) -> str:
    return f"sha-{hash_short(m.group())}"


def _sub_short_hex(m: Match[str]) -> str:
    return f"hex-{hash_short(m.group())}"


# Compiled in-order; each tuple is (pattern, replacement). Replacement may be a
# callable for hashing or a literal string for redaction.
_PATTERNS: list[tuple[re.Pattern[str], object]] = [
    # Emails first (before the @USER pattern eats the local part).
    (re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"), "EMAIL"),
    # Full SHAs (40 hex) before short hex (8 hex) so the longer match wins.
    (re.compile(r"\b[a-f0-9]{40}\b"), _sub_full_sha),
    # User home dirs.
    (re.compile(r"/(?:home|Users)/[^/\s]+/"), "/PATH/USER/"),
    # GitHub @-mentions.
    (re.compile(r"@[A-Za-z0-9][A-Za-z0-9_\-]{0,38}\b"), "@USER"),
    # IPv4. Run before short-hex so 192.168.1.1 isn't mistaken for hex tokens.
    (re.compile(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b"), "IP"),
    # Short hex tokens (8 chars) — order after the long-SHA pattern. The
    # negative lookbehinds keep this from re-matching the 8-char hash inside
    # an already-substituted ``sha-<8hex>`` or ``hex-<8hex>`` token.
    (re.compile(r"(?<!sha-)(?<!hex-)\b[a-f0-9]{8}\b"), _sub_short_hex),
    # Common internal-hostname prefixes.
    (re.compile(r"\b(?:corp|internal)\.[A-Za-z0-9.\-_]+"), "HOST"),
]


def anonymize(text: str) -> str:
    """Return ``text`` with project-specific identifiers replaced by tokens.

    Idempotent on the documented patterns: applying ``anonymize`` twice gives
    the same string (the replacement tokens themselves don't match any
    pattern).
    """
    out = text
    for pat, repl in _PATTERNS:
        if callable(repl):
            out = pat.sub(repl, out)  # type: ignore[arg-type]
        else:
            out = pat.sub(repl, out)
    return out