File size: 2,847 Bytes
2a2e170
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
"""Secret scrubbing for session trajectories before upload.

Users frequently paste HF / API / GitHub tokens into the chat, or scripts echo
them via env dumps. This module applies regex-based redaction to any string
value found recursively in a trajectory payload. The goal is best-effort —
strict formats are matched; we won't catch free-form leaks like "my password
is hunter2".
"""

from __future__ import annotations

import re
from typing import Any

# Redaction table: a list of (compiled regex, replacement placeholder) pairs,
# applied in list order. Every pattern requires the provider's canonical
# prefix plus a minimum body length, so ordinary prose is left alone.
_PATTERNS: list[tuple[re.Pattern, str]] = [
    (re.compile(regex), placeholder)
    for regex, placeholder in (
        # Hugging Face tokens: "hf_" + 30 or more alphanumerics.
        (r"hf_[A-Za-z0-9]{30,}", "[REDACTED_HF_TOKEN]"),
        # Anthropic keys: "sk-ant-" + 20 or more of [A-Za-z0-9_-].
        (r"sk-ant-[A-Za-z0-9_\-]{20,}", "[REDACTED_ANTHROPIC_KEY]"),
        # OpenAI keys (legacy + project): "sk-" + 40 or more of [A-Za-z0-9_-].
        # The negative lookahead keeps "sk-ant-" keys out of this bucket.
        (r"sk-(?!ant-)[A-Za-z0-9_\-]{40,}", "[REDACTED_OPENAI_KEY]"),
        # GitHub classic tokens: ghp_/gho_/ghu_/ghs_/ghr_ + 36 or more
        # alphanumerics.
        (r"gh[pousr]_[A-Za-z0-9]{36,}", "[REDACTED_GITHUB_TOKEN]"),
        # GitHub fine-grained PATs: "github_pat_" + 36 or more of [A-Za-z0-9_].
        (r"github_pat_[A-Za-z0-9_]{36,}", "[REDACTED_GITHUB_TOKEN]"),
        # AWS access key IDs: AKIA / ASIA + exactly 16 uppercase alphanumerics,
        # delimited by word boundaries.
        (r"\b(?:AKIA|ASIA)[A-Z0-9]{16}\b", "[REDACTED_AWS_KEY_ID]"),
        # Generic "Bearer <token>" header values, matched case-insensitively;
        # the replacement always spells the scheme as "Bearer".
        (r"(?i)bearer\s+[A-Za-z0-9_\-\.=]{20,}", "Bearer [REDACTED]"),
    )
]

# Env-var-like assignments: we scrub the value but keep the name so callers
# can still see which secret was referenced. Covers `KEY=value` and
# `KEY: value` when the key looks secret-y. Matching is case-insensitive, and
# the leading `\b` means a name embedded in a longer identifier (for example
# `DB_PASSWORD`) is not matched.
#
# Group 1 is the variable name. Group 2 is the value, which is one of:
#   * a non-empty double-quoted string on a single line   KEY="some value"
#   * a non-empty single-quoted string on a single line   KEY='some value'
#   * a bare run of non-whitespace, non-quote characters  KEY=value
# The quoted forms matter: `.env` files and `export KEY="..."` lines quote
# their values, and a bare-only value pattern can never start on a quote, so
# those secrets would pass through unredacted. Quoted values may not be empty
# or span lines, so `KEY=""` and a stray unbalanced quote are left alone
# instead of swallowing unrelated text.
_SECRETY_NAMES = re.compile(
    r"(?i)\b(HF_TOKEN|HUGGINGFACEHUB_API_TOKEN|ANTHROPIC_API_KEY|OPENAI_API_KEY|"
    r"GITHUB_TOKEN|AWS_SECRET_ACCESS_KEY|AWS_ACCESS_KEY_ID|PASSWORD|SECRET|API_KEY)"
    r"\s*[:=]\s*"
    r"(\"[^\"\n]+\"|'[^'\n]+'|[^\s\"']+)"
)


def scrub_string(s: str) -> str:
    """Redact known secret formats from a single string.

    Each (regex, placeholder) pair in ``_PATTERNS`` is applied in list order.
    Afterwards every match of ``_SECRETY_NAMES`` is rewritten as
    ``<name>=[REDACTED]``, keeping the matched name and dropping the value.

    Non-string input and the empty string are returned unchanged.
    """
    # Anything that is not a non-empty string passes straight through.
    if not (isinstance(s, str) and s):
        return s

    def _mask_assignment(match):
        # Keep the variable name (group 1); replace everything after it.
        return f"{match.group(1)}=[REDACTED]"

    redacted = s
    for pattern, placeholder in _PATTERNS:
        redacted = pattern.sub(placeholder, redacted)
    return _SECRETY_NAMES.sub(_mask_assignment, redacted)


def scrub(obj: Any) -> Any:
    """Return a copy of ``obj`` with every nested string value scrubbed.

    Dicts, lists and tuples are rebuilt recursively. Dict keys are carried
    over as-is; only the values are scrubbed. Strings go through
    ``scrub_string``. Any other object is returned unchanged. The input
    containers are never modified in place.
    """
    if isinstance(obj, dict):
        cleaned = {}
        for key, value in obj.items():
            cleaned[key] = scrub(value)
        return cleaned

    if isinstance(obj, (list, tuple)):
        items = [scrub(element) for element in obj]
        # Lists come back as lists; tuples come back as plain tuples.
        return items if isinstance(obj, list) else tuple(items)

    return scrub_string(obj) if isinstance(obj, str) else obj