ashirato committed on
Commit
1dfdc54
·
1 Parent(s): b532db8

feat(sanitize): training data leak filter — drop rows w/ FS paths, LLM-provider tags, secrets, PII

Browse files

Critical finding from v1 LoRA eval (2026-04-29): model leaked
'/home/hermes/.surrogate/state/orchestrate/77426592/1-README.md' and
'# generated via cerebras:llama3.1-8b' in inference response, exposing internal
file paths and LLM provider attribution to end users.

bin/lib/sanitize.py: 10 categories of POLLUTION_PATTERNS + PII detection +
low-quality heuristics (refusals, char spam). Integrated into both
dataset-mirror (community SFT mirror) and dataset-enrich (per-row stream)
ingest paths so v2 dataset is clean from the start.

Tested 7/7 cases pass: drops polluted/PII/refusal/token-leak; keeps legit
Dockerfile + daemon-name conceptual mention.

Files changed (3) hide show
  1. bin/dataset-enrich.sh +10 -0
  2. bin/dataset-mirror.sh +17 -0
  3. bin/lib/sanitize.py +155 -0
bin/dataset-enrich.sh CHANGED
@@ -802,6 +802,16 @@ with open(out_path, "w") as out:
802
  if not prompt or not response or len(prompt) < 20 or len(response) < 20:
803
  continue
804
 
 
 
 
 
 
 
 
 
 
 
805
  # Central dedup store — atomic, shared with every other writer
806
  if not DedupStore.is_new(prompt, source=f"enrich-{slug}"):
807
  dup += 1
 
802
  if not prompt or not response or len(prompt) < 20 or len(response) < 20:
803
  continue
804
 
805
+ # Sanitize: drop polluted (filesystem paths, LLM-provider tags, secrets, PII).
806
+ # Audit 2026-04-29: v1 LoRA leaked these in inference. Fix at ingest.
807
+ try:
808
+ from sanitize import filter_pair
809
+ _sv = filter_pair(prompt, response)
810
+ if not _sv["keep"]:
811
+ continue
812
+ except ImportError:
813
+ pass # sanitize lib not available — accept (LEAK RISK)
814
+
815
  # Central dedup store — atomic, shared with every other writer
816
  if not DedupStore.is_new(prompt, source=f"enrich-{slug}"):
817
  dup += 1
bin/dataset-mirror.sh CHANGED
@@ -79,6 +79,18 @@ except Exception as e:
79
  print(f"⚠ DedupStore not importable: {e}; running without central dedup", flush=True)
80
  HAS_DEDUP = False
81
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  # Top 30 community SFT mixes that are HUGE and immediately useful.
83
  # Each = 100K-10M pairs. License flag = OK to redistribute.
84
  SOURCES = [
@@ -256,6 +268,11 @@ for src_id, slug in SOURCES:
256
  if not is_relevant(p, r):
257
  irrelevant += 1
258
  continue
 
 
 
 
 
259
  if HAS_DEDUP and not DedupStore.is_new(p, source=f"mirror-{slug}"):
260
  duped += 1
261
  continue
 
79
  print(f"⚠ DedupStore not importable: {e}; running without central dedup", flush=True)
80
  HAS_DEDUP = False
81
 
82
+ # Sanitizer — drops rows that would leak Surrogate-1 internals into model output.
83
+ # Audit 2026-04-29: v1 LoRA leaked /home/hermes/.surrogate/state/... paths and
84
+ # "# generated via cerebras:..." tags into inference. Filter at ingestion to prevent.
85
+ try:
86
+ from sanitize import filter_pair as _sanitize_filter
87
+ HAS_SANITIZE = True
88
+ except Exception as e:
89
+ print(f"⚠ sanitize not importable: {e}; running without sanitization (LEAK RISK)", flush=True)
90
+ HAS_SANITIZE = False
91
+ def _sanitize_filter(p, r):
92
+ return {"keep": True, "reason": None, "matched": None}
93
+
94
  # Top 30 community SFT mixes that are HUGE and immediately useful.
95
  # Each = 100K-10M pairs. License flag = OK to redistribute.
96
  SOURCES = [
 
268
  if not is_relevant(p, r):
269
  irrelevant += 1
270
  continue
271
+ # Sanitize BEFORE dedup so we don't waste dedup capacity on rows we'll drop
272
+ _sv = _sanitize_filter(p, r)
273
+ if not _sv["keep"]:
274
+ # Track but don't spam: only log first few per slug
275
+ continue
276
  if HAS_DEDUP and not DedupStore.is_new(p, source=f"mirror-{slug}"):
277
  duped += 1
278
  continue
bin/lib/sanitize.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Surrogate-1 training data sanitizer — drops rows that would leak internals.
2
+
3
+ Discovered 2026-04-29: v1 LoRA leaked path "/home/hermes/.surrogate/state/orchestrate/77426592/1-README.md"
4
+ and "# generated via cerebras:llama3.1-8b" tag in inference response. Root cause: dataset-mirror
5
+ + dataset-enrich ingested rows where the response was an LLM-generated artifact still tagged
6
+ with provider attribution + internal filesystem context.
7
+
8
+ Apply this filter at row-level BEFORE the row is added to training-pairs.jsonl. Drop entire
9
+ row if either prompt or response matches any high-risk pattern.
10
+ """
11
+ import re
12
+
13
# Patterns that indicate the row contains Surrogate-1 internal pollution.
# Each entry is a regex source string. All entries are OR-ed into the single
# compiled POLLUTION_RE below, so a text is flagged as soon as ANY entry matches.
# Order: most-specific first so re.search short-circuits on hits.
# NOTE(review): inside one alternation the engine still reports the leftmost
# match in the text; list order only decides which alternative wins when
# several could match at the same position.
POLLUTION_PATTERNS = [
    # 1. LLM provider attribution lines — often added by llm-burst-generator outputs
    r"^\s*#\s*generated\s+via\s+(cerebras|groq|openrouter|gemini|chutes|samba|kimi|nvidia|hf-router|hf-llama|hf-qwen|hf-mistral)",
    r"<<\s*generated by\s+(cerebras|groq|openrouter|gemini|chutes|samba|kimi)",
    r"\[(cerebras|groq|openrouter|gemini|chutes|samba|kimi):[a-z0-9.-]+\]",

    # 2. Internal filesystem paths — exposing host structure
    r"/home/hermes/(?:\.surrogate|\.hermes|\.codex|\.gemini|\.kaggle)/",
    r"/data/(?:state|logs|memory|skills|sessions|workspace|projects|ollama|training|reflexion|index|surrogate)/",
    r"~/\.surrogate/(?:state|logs|memory|bin|skills|sessions)/",
    r"/Users/Ashira/(?:\.surrogate|\.hermes|\.kaggle|\.oci|\.note|develope|axentx)/",

    # 3. Internal directory names (state-management dirs)
    r"\bstate/orchestrate/\d+/",
    r"\bagentic-discovery/",
    r"\braw-mirrors/[a-z0-9-]+/",
    r"\benriched/[a-z0-9-]+/",
    r"\bbatches/(?:public-merged|mirror-merged)/\d{4}-\d{2}-\d{2}/",

    # 4. Daemon names + commit messages from our pipeline
    # The four adjacent string literals below have no commas between them, so
    # Python concatenates them into ONE pattern: a daemon name followed by an
    # optional ".sh" / ".py" suffix.
    r"\b(?:dataset-mirror|dataset-enrich|llm-burst-generator|bulk-ingest-parallel|"
    r"agentic-crawler|github-agentic-crawler|skill-synthesis-daemon|push-training-to-hf|"
    r"surrogate-orchestrate|self-heal-watchdog|hermes-status-server|hermes-discord-bot)"
    r"(?:\.sh|\.py)?\b",

    # 5. Specific axentx repo identifiers — model shouldn't reproduce these
    r"axentx/surrogate-1-(?:training-pairs|pairs-[A-D]|coder-[a-z0-9]+-lora-v\d+)",

    # 6. Token / secret-shaped strings (leaked credentials)
    # Again one pattern built from adjacent literals: a known key prefix
    # followed by a minimum-length run of key characters.
    r"\b(?:hf_[A-Za-z0-9]{30,}|sk-or-v\d-[A-Za-z0-9]{40,}|sk-ant-[A-Za-z0-9-]{30,}|"
    r"KGAT_[A-Za-z0-9]{30,}|csk-[A-Za-z0-9]{40,}|cpk_[A-Za-z0-9.]{40,}|"
    r"gsk_[A-Za-z0-9]{40,}|nvapi-[A-Za-z0-9_-]{40,}|fc-[a-f0-9]{32}|"
    r"ghp_[A-Za-z0-9]{30,}|sbp_[a-f0-9]{40}|cfut_[A-Za-z0-9]{30,}|"
    r"AIzaSy[A-Za-z0-9_-]{30,}|xai-[A-Za-z0-9]{40,}|r8_[A-Za-z0-9]{30,}|rnd_[A-Za-z0-9]{20,}|"
    r"sk-kimi-[A-Za-z0-9]{40,})\b",

    # 7. Common debug / introspection leakage (when LLM was asked to echo state)
    # Matches "<ENV_VAR> = <value>" or "<ENV_VAR>: <value>" where the value is
    # at least 20 key-like characters, optionally opened by a quote.
    r"\b(?:LIGHTNING_USER_ID|LIGHTNING_API_KEY|HF_TOKEN|KAGGLE_API_TOKEN|KAGGLE_KEY|"
    r"OPENROUTER_API_KEY|CEREBRAS_API_KEY|GROQ_API_KEY|ANTHROPIC_API_KEY)\s*[=:]\s*['\"]?[A-Za-z0-9_-]{20,}",

    # 8. Discord webhook URLs
    r"https://discord\.com/api/webhooks/\d+/[A-Za-z0-9_-]+",

    # 9. Internal commit messages (from daemons pushing to HF)
    # "^" is a line anchor here because POLLUTION_RE is compiled with MULTILINE.
    r"^(?:enriched|mirror|chunk):\s+",
    r"^train-ready pusher:",
    r"^clean mirror(?:\s+final)?:",

    # 10. JWT-shaped strings (NVIDIA Brev tokens, etc.)
    r"\beyJ[A-Za-z0-9_-]{50,}\.eyJ[A-Za-z0-9_-]{100,}\.[A-Za-z0-9_=-]{40,}",
]

# Single alternation over every pattern. Each entry is wrapped in a
# non-capturing group so a pattern's own top-level "|" cannot bleed into its
# neighbours. MULTILINE makes "^" match at every line start; IGNORECASE makes
# every literal (including key prefixes and env-var names) case-insensitive.
POLLUTION_RE = re.compile("|".join(f"(?:{p})" for p in POLLUTION_PATTERNS),
                          re.MULTILINE | re.IGNORECASE)
69
+
70
+
71
def is_polluted(text: str) -> tuple[bool, str | None]:
    """Report whether *text* contains any internal-pollution pattern.

    Returns a ``(polluted, evidence)`` pair. ``evidence`` is the substring
    that matched ``POLLUTION_RE``, cut to at most 120 characters so it can be
    logged when tuning the patterns; it is ``None`` when nothing matched.
    Empty values and non-string values are reported as clean.
    """
    if isinstance(text, str) and text:
        hit = POLLUTION_RE.search(text)
        if hit is not None:
            # Truncate so a huge match cannot flood the drop log.
            return True, hit.group(0)[:120]
    return False, None
83
+
84
+
85
def is_polluted_pair(prompt: str, response: str) -> tuple[bool, str | None]:
    """Scan the prompt, then the response, for pollution.

    Returns ``(True, "<field>: <matched text>")`` for the first polluted
    field (the prompt is checked first), otherwise ``(False, None)``.
    """
    for label, value in (("prompt", prompt), ("response", response)):
        dirty, snippet = is_polluted(value)
        if dirty:
            return True, f"{label}: {snippet}"
    return False, None
94
+
95
+
96
# Optional: PII regex set (apply alongside)
PII_PATTERNS = [
    # Email. The TLD class is letters only: the previous "[A-Z|a-z]" also
    # accepted a literal "|", so text such as "user@host.a|b" was flagged.
    r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
    # Phone (US/intl basic)
    # NOTE(review): the leading 1-3 digit country-code group is mandatory, so a
    # bare 10-digit number such as "555-123-4567" is NOT matched — confirm
    # whether that is intended before widening (risk: matching timestamps).
    r"\b\+?\d{1,3}[\s.-]?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b",
    # SSN
    r"\b\d{3}-\d{2}-\d{4}\b",
    # AWS keys
    r"\bAKIA[0-9A-Z]{16}\b",
    # Stripe keys
    r"\bsk_(?:test|live)_[A-Za-z0-9]{32,}\b",
]
# Wrap each entry in a non-capturing group (same construction as the pollution
# regex) so a future pattern with a top-level "|" cannot merge with neighbours.
PII_RE = re.compile("|".join(f"(?:{p})" for p in PII_PATTERNS), re.IGNORECASE)


def has_pii(text: str) -> bool:
    """Return True when *text* contains anything matching ``PII_RE``.

    ``None``, the empty string and non-string values are treated as
    PII-free instead of raising, so one malformed row cannot abort ingestion.
    """
    if not isinstance(text, str) or not text:
        return False
    return PII_RE.search(text) is not None
114
+
115
+
116
+ # Quality heuristics — drop if response is too short, identical to prompt, etc.
117
+ def is_low_quality(prompt: str, response: str) -> tuple[bool, str | None]:
118
+ if not prompt or not response:
119
+ return True, "empty"
120
+ if len(prompt) < 20:
121
+ return True, "prompt_too_short"
122
+ if len(response) < 30:
123
+ return True, "response_too_short"
124
+ if response.strip().lower() == prompt.strip().lower():
125
+ return True, "response_equals_prompt"
126
+ # Detect when response is just an apology / refusal
127
+ if re.match(r"^\s*i('?m| am)?\s+(sorry|afraid|unable|cannot|cant|can't)\b",
128
+ response.strip(), re.IGNORECASE):
129
+ return True, "refusal"
130
+ # Repeated character spam
131
+ if re.search(r"(.)\1{50,}", response):
132
+ return True, "char_spam"
133
+ return False, None
134
+
135
+
136
def filter_pair(prompt: str, response: str) -> dict:
    """Decide whether a prompt/response row may enter the training set.

    The checks run in a fixed order and the first failure wins:
    pollution, then PII, then the low-quality heuristics.

    Returns a dict with keys ``keep`` (bool), ``reason`` (``None``,
    ``"polluted"``, ``"pii"`` or ``"low_quality:<why>"``) and ``matched``
    (the pollution evidence string, otherwise ``None``).
    """
    def _verdict(keep, reason=None, matched=None):
        # Single place that fixes the verdict's key set and key order.
        return {"keep": keep, "reason": reason, "matched": matched}

    is_dirty, evidence = is_polluted_pair(prompt, response)
    if is_dirty:
        return _verdict(False, "polluted", evidence)

    # any() stops at the first field with PII, prompt first.
    if any(has_pii(field) for field in (prompt, response)):
        return _verdict(False, "pii")

    rejected, why = is_low_quality(prompt, response)
    if rejected:
        return _verdict(False, f"low_quality:{why}")

    return _verdict(True)
147
+
148
+
149
# CLI helper for testing: pipe a JSON object ({"prompt": ..., "response": ...})
# or raw response text on stdin; with an interactive terminal a built-in
# polluted sample is used instead. Prints the verdict as indented JSON.
if __name__ == "__main__":
    import json
    import sys

    if sys.stdin.isatty():
        sample = """{"prompt": "fix bug", "response": "# generated via cerebras:llama3.1-8b\\nReadFile path /home/hermes/.surrogate/state/orchestrate/77426592/1-README.md"}"""
    else:
        sample = sys.stdin.read()

    if sample.strip().startswith("{"):
        obj = json.loads(sample)
    else:
        # Not JSON: treat the whole input as the response text.
        obj = {"prompt": "test", "response": sample}

    v = filter_pair(obj.get("prompt", ""), obj.get("response", ""))
    print(json.dumps(v, indent=2))