File size: 10,856 Bytes
1dfdc54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b772ad8
1dfdc54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b4668b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1dfdc54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b4668b2
 
 
 
 
 
 
 
1dfdc54
 
 
 
b4668b2
1dfdc54
 
 
b4668b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1dfdc54
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
"""Surrogate-1 training data sanitizer — drops rows that would leak internals.

Discovered 2026-04-29: v1 LoRA leaked path "/home/hermes/.surrogate/state/orchestrate/77426592/1-README.md"
and "# generated via cerebras:llama3.1-8b" tag in inference response. Root cause: dataset-mirror
+ dataset-enrich ingested rows where the response was an LLM-generated artifact still tagged
with provider attribution + internal filesystem context.

Apply this filter at row-level BEFORE the row is added to training-pairs.jsonl. Drop entire
row if either prompt or response matches any high-risk pattern.
"""
import re

# Patterns that indicate the row contains Surrogate-1 internal pollution.
# Order: most-specific first so re.search short-circuits on hits.
# NOTE: compiled below with MULTILINE (so "^" anchors match per-line) and
# IGNORECASE (token prefixes like "hf_" also match uppercased variants —
# deliberate over-matching, acceptable for a drop-filter).
POLLUTION_PATTERNS = [
    # 1. LLM provider attribution lines — often added by llm-burst-generator outputs
    r"^\s*#\s*generated\s+via\s+(cerebras|groq|openrouter|gemini|chutes|samba|kimi|nvidia|hf-router|hf-llama|hf-qwen|hf-mistral)",
    r"<<\s*generated by\s+(cerebras|groq|openrouter|gemini|chutes|samba|kimi)",
    r"\[(cerebras|groq|openrouter|gemini|chutes|samba|kimi):[a-z0-9.-]+\]",

    # 2. Internal filesystem paths — exposing host structure
    r"/home/hermes/(?:\.surrogate|\.hermes|\.codex|\.gemini|\.kaggle)/",
    r"/data/(?:state|logs|memory|skills|sessions|workspace|projects|ollama|training|reflexion|index|surrogate)/",
    r"~/\.surrogate/(?:state|logs|memory|bin|skills|sessions)/",
    r"/Users/Ashira/(?:\.surrogate|\.hermes|\.kaggle|\.oci|\.note|develope|axentx)/",

    # 3. Internal directory names (state-management dirs)
    r"\bstate/orchestrate/\d+/",
    r"\bagentic-discovery/",
    r"\braw-mirrors/[a-z0-9-]+/",
    r"\benriched/[a-z0-9-]+/",
    r"\bbatches/(?:public-merged|mirror-merged)/\d{4}-\d{2}-\d{2}/",

    # 4. Daemon names + commit messages from our pipeline
    # (adjacent string literals below concatenate into ONE regex entry)
    r"\b(?:dataset-mirror|dataset-enrich|llm-burst-generator|bulk-ingest-parallel|"
    r"agentic-crawler|github-agentic-crawler|skill-synthesis-daemon|push-training-to-hf|"
    r"surrogate-orchestrate|self-heal-watchdog|hermes-status-server|hermes-discord-bot)"
    r"(?:\.sh|\.py)?\b",

    # 5. Specific axentx repo identifiers — model shouldn't reproduce these
    r"axentx/surrogate-1-(?:training-pairs|pairs-[A-D]|coder-[a-zA-Z0-9-]+-v[\d.]+)",

    # 6. Token / secret-shaped strings (leaked credentials)
    r"\b(?:hf_[A-Za-z0-9]{30,}|sk-or-v\d-[A-Za-z0-9]{40,}|sk-ant-[A-Za-z0-9-]{30,}|"
    r"KGAT_[A-Za-z0-9]{30,}|csk-[A-Za-z0-9]{40,}|cpk_[A-Za-z0-9.]{40,}|"
    r"gsk_[A-Za-z0-9]{40,}|nvapi-[A-Za-z0-9_-]{40,}|fc-[a-f0-9]{32}|"
    r"ghp_[A-Za-z0-9]{30,}|sbp_[a-f0-9]{40}|cfut_[A-Za-z0-9]{30,}|"
    r"AIzaSy[A-Za-z0-9_-]{30,}|xai-[A-Za-z0-9]{40,}|r8_[A-Za-z0-9]{30,}|rnd_[A-Za-z0-9]{20,}|"
    r"sk-kimi-[A-Za-z0-9]{40,})\b",

    # 7. Common debug / introspection leakage (when LLM was asked to echo state)
    r"\b(?:LIGHTNING_USER_ID|LIGHTNING_API_KEY|HF_TOKEN|KAGGLE_API_TOKEN|KAGGLE_KEY|"
    r"OPENROUTER_API_KEY|CEREBRAS_API_KEY|GROQ_API_KEY|ANTHROPIC_API_KEY)\s*[=:]\s*['\"]?[A-Za-z0-9_-]{20,}",

    # 8. Discord webhook URLs
    r"https://discord\.com/api/webhooks/\d+/[A-Za-z0-9_-]+",

    # 9. Internal commit messages (from daemons pushing to HF)
    r"^(?:enriched|mirror|chunk):\s+",
    r"^train-ready pusher:",
    r"^clean mirror(?:\s+final)?:",

    # 10. JWT-shaped strings (NVIDIA Brev tokens, etc.)
    r"\beyJ[A-Za-z0-9_-]{50,}\.eyJ[A-Za-z0-9_-]{100,}\.[A-Za-z0-9_=-]{40,}",
]

# Single compiled alternation. Each pattern is wrapped in (?:...) so inner
# "|" alternations stay scoped to their own pattern.
POLLUTION_RE = re.compile("|".join(f"(?:{p})" for p in POLLUTION_PATTERNS),
                           re.MULTILINE | re.IGNORECASE)


def is_polluted(text: str) -> tuple[bool, str | None]:
    """Check *text* against the compiled pollution regex.

    Returns (polluted?, snippet) where snippet is the first matching
    substring truncated to 120 chars — log it to see which pollution
    class caused the drop (useful for tuning patterns later).
    """
    if not isinstance(text, str) or not text:
        return False, None
    match = POLLUTION_RE.search(text)
    if match is None:
        return False, None
    return True, match.group(0)[:120]


def is_polluted_pair(prompt: str, response: str) -> tuple[bool, str | None]:
    """Check both fields. Drop the row if either is polluted."""
    # Prompt is checked first so its match wins when both fields are dirty.
    for label, field in (("prompt", prompt), ("response", response)):
        bad, snippet = is_polluted(field)
        if bad:
            return True, f"{label}: {snippet}"
    return False, None


# Optional: PII regex set (apply alongside)
PII_PATTERNS = [
    # Email
    r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b",
    # Phone (US/intl basic)
    r"\b\+?\d{1,3}[\s.-]?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b",
    # SSN
    r"\b\d{3}-\d{2}-\d{4}\b",
    # AWS keys
    r"\bAKIA[0-9A-Z]{16}\b",
    # Stripe keys
    r"\bsk_(?:test|live)_[A-Za-z0-9]{32,}\b",
]
PII_RE = re.compile("|".join(PII_PATTERNS), re.IGNORECASE)


def has_pii(text: str) -> bool:
    """True when *text* matches any PII regex; empty/None input is safe."""
    return PII_RE.search(text or "") is not None


# ── Optional NER + secrets scanners (lazy, fail-soft) ──────────────────
# starpii (BigCode) — neural PII NER; better than regex for free-form text.
# detect-secrets (Yelp) — entropy + plugin-based secret detector.
# Both are optional dependencies; if unavailable we fall back to regex above.
#
# Sentinel protocol for both caches:
#   None  -> load not yet attempted
#   False -> load attempted and failed (never retried)
#   else  -> the loaded object(s)
_starpii_pipeline = None
_detect_secrets_collection = None


def _load_starpii():
    """Lazy-load the bigcode/starpii token-classification pipeline.

    Returns the pipeline object, or None when transformers is missing or
    the model cannot be loaded. A failed attempt is cached as False so we
    never pay the import/load cost again.
    """
    global _starpii_pipeline
    if _starpii_pipeline is None:  # first call — attempt the load once
        try:
            from transformers import pipeline  # type: ignore
            _starpii_pipeline = pipeline(
                "token-classification",
                model="bigcode/starpii",
                aggregation_strategy="simple",
            )
        except Exception:
            _starpii_pipeline = False  # sentinel: "tried, don't try again"
    return None if _starpii_pipeline is False else _starpii_pipeline


def starpii_pii_hits(text: str, threshold: float = 0.8) -> list[dict]:
    """Return [{type, score, span}] for confidently-detected PII spans.

    Empty list when starpii isn't installed, *text* is empty, the pipeline
    errors out, or no span clears *threshold*.
    """
    pipe = _load_starpii()
    if not pipe or not text:
        return []
    try:
        raw_hits = pipe(text[:4000])  # cap input for speed
    except Exception:
        return []
    results = []
    for hit in raw_hits:
        if hit.get("score", 0) >= threshold:
            results.append({
                "type": hit["entity_group"],
                "score": float(hit["score"]),
                "span": text[hit["start"]:hit["end"]][:120],
            })
    return results


def _load_detect_secrets():
    """Lazy-import detect-secrets.

    Returns the (SecretsCollection, default_settings) pair, or None when
    the package isn't installed. Failure is cached as False so the import
    is attempted at most once.
    """
    global _detect_secrets_collection
    if _detect_secrets_collection is None:  # first call — try the import
        try:
            from detect_secrets import SecretsCollection  # type: ignore
            from detect_secrets.settings import default_settings  # type: ignore
            _detect_secrets_collection = (SecretsCollection, default_settings)
        except Exception:
            _detect_secrets_collection = False  # remember the failure
    return None if _detect_secrets_collection is False else _detect_secrets_collection


def detect_secrets_hits(text: str) -> list[dict]:
    """Scan *text* with detect-secrets (if installed).

    Returns [{type, line, secret_hash}] for each finding; empty list when
    detect-secrets isn't available, *text* is empty, or scanning fails.
    detect-secrets only scans files, so the text (capped at 200 KB) is
    written to a temp file that is always removed afterwards.
    """
    loaded = _load_detect_secrets()
    if not loaded or not text:
        return []
    SecretsCollection, default_settings = loaded
    import tempfile, os
    fd, path = tempfile.mkstemp(suffix=".txt")
    try:
        # FIX: close the fd in a finally so it isn't leaked when os.write
        # raises (the original only closed it on the success path).
        try:
            os.write(fd, text.encode("utf-8", "ignore")[:200_000])
        finally:
            os.close(fd)
        with default_settings():
            sc = SecretsCollection()
            sc.scan_file(path)
        # sc.data maps filename -> iterable of PotentialSecret objects.
        return [{"type": s.type, "line": s.line_number,
                 "secret_hash": s.secret_hash[:16]}
                for secrets in sc.data.values()
                for s in secrets]
    except Exception:
        return []  # fail-soft: any scanner error means "no findings"
    finally:
        try: os.unlink(path)
        except OSError: pass


# Quality heuristics β€” drop if response is too short, identical to prompt, etc.
def is_low_quality(prompt: str, response: str) -> tuple[bool, str | None]:
    if not prompt or not response:
        return True, "empty"
    if len(prompt) < 20:
        return True, "prompt_too_short"
    if len(response) < 30:
        return True, "response_too_short"
    if response.strip().lower() == prompt.strip().lower():
        return True, "response_equals_prompt"
    # Detect when response is just an apology / refusal
    if re.match(r"^\s*i('?m| am)?\s+(sorry|afraid|unable|cannot|cant|can't)\b",
                response.strip(), re.IGNORECASE):
        return True, "refusal"
    # Repeated character spam
    if re.search(r"(.)\1{50,}", response):
        return True, "char_spam"
    return False, None


def filter_pair(prompt: str, response: str,
                deep_scan: bool = False) -> dict:
    """Return verdict: {'keep': bool, 'reason': str|None, 'matched': str|None}.

    deep_scan=True: also runs starpii NER + detect-secrets if installed.
    Slow (model load + per-row scan) — use for the final pre-train pass,
    not for every dedup row. Heuristic (regex) checks always run.
    """
    def _verdict(keep, reason=None, matched=None):
        return {"keep": keep, "reason": reason, "matched": matched}

    # Cheap regex gates first — these always run.
    polluted, snippet = is_polluted_pair(prompt, response)
    if polluted:
        return _verdict(False, "polluted", snippet)
    if has_pii(prompt) or has_pii(response):
        return _verdict(False, "pii_regex")
    bad_quality, why = is_low_quality(prompt, response)
    if bad_quality:
        return _verdict(False, f"low_quality:{why}")

    if deep_scan:
        # NER PII first, then entropy/plugin secrets — same order as the
        # regex gates: prompt checked before response within each scanner.
        for label, scanner in (("pii_ner", starpii_pii_hits),
                               ("secrets", detect_secrets_hits)):
            for field, txt in (("prompt", prompt), ("response", response)):
                hits = scanner(txt)
                if hits:
                    return _verdict(False, f"{label}:{field}",
                                    str(hits[:3])[:300])

    return _verdict(True)


# CLI helper for testing
if __name__ == "__main__":
    import sys, json
    sample = sys.stdin.read() if not sys.stdin.isatty() else """{"prompt": "fix bug", "response": "# generated via cerebras:llama3.1-8b\\nReadFile path /home/hermes/.surrogate/state/orchestrate/77426592/1-README.md"}"""
    obj = json.loads(sample) if sample.strip().startswith("{") else {"prompt": "test", "response": sample}
    v = filter_pair(obj.get("prompt", ""), obj.get("response", ""))
    print(json.dumps(v, indent=2))