Spaces:

axentx
/

surrogate-1

Runtime error

ashirato committed 12 days ago

Commit

1dfdc54

1 Parent(s): b532db8

feat(sanitize): training data leak filter — drop rows w/ FS paths, LLM-provider tags, secrets, PII

Critical finding from v1 LoRA eval (2026-04-29): model leaked
'/home/hermes/.surrogate/state/orchestrate/77426592/1-README.md' and
'# generated via cerebras:llama3.1-8b' in inference response, exposing internal
file paths and LLM provider attribution to end users.

bin/lib/sanitize.py: 10 categories of POLLUTION_PATTERNS + PII detection +
low-quality heuristics (refusals, char spam). Integrated into both
dataset-mirror (community SFT mirror) and dataset-enrich (per-row stream)
ingest paths so v2 dataset is clean from the start.

Tests: 7/7 cases pass — polluted, PII, refusal and token-leak rows are dropped;
a legitimate Dockerfile and a conceptual daemon-name mention are kept.

Files changed (3) hide show

bin/dataset-enrich.sh +10 -0
bin/dataset-mirror.sh +17 -0
bin/lib/sanitize.py +155 -0

bin/dataset-enrich.sh CHANGED Viewed

@@ -802,6 +802,16 @@ with open(out_path, "w") as out:
                 if not prompt or not response or len(prompt) < 20 or len(response) < 20:
                     continue
                 # Central dedup store — atomic, shared with every other writer
                 if not DedupStore.is_new(prompt, source=f"enrich-{slug}"):
                     dup += 1

                 if not prompt or not response or len(prompt) < 20 or len(response) < 20:
                     continue
+                # Sanitize: drop polluted (filesystem paths, LLM-provider tags, secrets, PII).
+                # Audit 2026-04-29: v1 LoRA leaked these in inference. Fix at ingest.
+                try:
+                    from sanitize import filter_pair
+                    _sv = filter_pair(prompt, response)
+                    if not _sv["keep"]:
+                        continue
+                except ImportError:
+                    pass  # sanitize lib not available — accept (LEAK RISK)
                 # Central dedup store — atomic, shared with every other writer
                 if not DedupStore.is_new(prompt, source=f"enrich-{slug}"):
                     dup += 1

bin/dataset-mirror.sh CHANGED Viewed

@@ -79,6 +79,18 @@ except Exception as e:
     print(f"⚠ DedupStore not importable: {e}; running without central dedup", flush=True)
     HAS_DEDUP = False
 # Top 30 community SFT mixes that are HUGE and immediately useful.
 # Each = 100K-10M pairs. License flag = OK to redistribute.
 SOURCES = [
@@ -256,6 +268,11 @@ for src_id, slug in SOURCES:
                 if not is_relevant(p, r):
                     irrelevant += 1
                     continue
                 if HAS_DEDUP and not DedupStore.is_new(p, source=f"mirror-{slug}"):
                     duped += 1
                     continue

     print(f"⚠ DedupStore not importable: {e}; running without central dedup", flush=True)
     HAS_DEDUP = False
+# Sanitizer — drops rows that would leak Surrogate-1 internals into model output.
+# Audit 2026-04-29: v1 LoRA leaked /home/hermes/.surrogate/state/... paths and
+# "# generated via cerebras:..." tags into inference. Filter at ingestion to prevent.
+try:
+    from sanitize import filter_pair as _sanitize_filter
+    HAS_SANITIZE = True
+except Exception as e:
+    print(f"⚠ sanitize not importable: {e}; running without sanitization (LEAK RISK)", flush=True)
+    HAS_SANITIZE = False
+    def _sanitize_filter(p, r):
+        return {"keep": True, "reason": None, "matched": None}
 # Top 30 community SFT mixes that are HUGE and immediately useful.
 # Each = 100K-10M pairs. License flag = OK to redistribute.
 SOURCES = [
                 if not is_relevant(p, r):
                     irrelevant += 1
                     continue
+                # Sanitize BEFORE dedup so we don't waste dedup capacity on rows we'll drop
+                _sv = _sanitize_filter(p, r)
+                if not _sv["keep"]:
+                    # Dropped silently: no per-slug counter or log line is kept for sanitized rows yet.
+                    continue
                 if HAS_DEDUP and not DedupStore.is_new(p, source=f"mirror-{slug}"):
                     duped += 1
                     continue

bin/lib/sanitize.py ADDED Viewed

	@@ -0,0 +1,155 @@

+"""Surrogate-1 training data sanitizer — drops rows that would leak internals.
+Discovered 2026-04-29: v1 LoRA leaked path "/home/hermes/.surrogate/state/orchestrate/77426592/1-README.md"
+and "# generated via cerebras:llama3.1-8b" tag in inference response. Root cause: dataset-mirror
++ dataset-enrich ingested rows where the response was an LLM-generated artifact still tagged
+with provider attribution + internal filesystem context.
+Apply this filter at row-level BEFORE the row is added to training-pairs.jsonl. Drop entire
+row if either prompt or response matches any high-risk pattern.
+"""
+import re
# Patterns that indicate the row contains Surrogate-1 internal pollution.
# Every entry is OR-ed into a single compiled regex (POLLUTION_RE). re.search
# reports the leftmost match in the text, so the order of this list only
# decides ties between patterns that match at the same position.

# Provider names shared by all three attribution-tag formats below. Keeping
# one list means a provider can no longer be covered by the "# generated via"
# form while slipping through the "<< generated by" or "[provider:model]" form.
_LLM_PROVIDERS = (
    r"cerebras|groq|openrouter|gemini|chutes|samba|kimi|nvidia|"
    r"hf-router|hf-llama|hf-qwen|hf-mistral"
)

POLLUTION_PATTERNS = [
    # 1. LLM provider attribution lines — often added by llm-burst-generator outputs
    r"^\s*#\s*generated\s+via\s+(" + _LLM_PROVIDERS + r")",
    r"<<\s*generated by\s+(" + _LLM_PROVIDERS + r")",
    r"\[(" + _LLM_PROVIDERS + r"):[a-z0-9.-]+\]",
    # 2. Internal filesystem paths — exposing host structure
    r"/home/hermes/(?:\.surrogate|\.hermes|\.codex|\.gemini|\.kaggle)/",
    r"/data/(?:state|logs|memory|skills|sessions|workspace|projects|ollama|training|reflexion|index|surrogate)/",
    r"~/\.surrogate/(?:state|logs|memory|bin|skills|sessions)/",
    r"/Users/Ashira/(?:\.surrogate|\.hermes|\.kaggle|\.oci|\.note|develope|axentx)/",
    # 3. Internal directory names (state-management dirs)
    r"\bstate/orchestrate/\d+/",
    r"\bagentic-discovery/",
    r"\braw-mirrors/[a-z0-9-]+/",
    r"\benriched/[a-z0-9-]+/",
    r"\bbatches/(?:public-merged|mirror-merged)/\d{4}-\d{2}-\d{2}/",
    # 4. Daemon names + commit messages from our pipeline
    #    (adjacent literals are concatenated into ONE pattern)
    r"\b(?:dataset-mirror|dataset-enrich|llm-burst-generator|bulk-ingest-parallel|"
    r"agentic-crawler|github-agentic-crawler|skill-synthesis-daemon|push-training-to-hf|"
    r"surrogate-orchestrate|self-heal-watchdog|hermes-status-server|hermes-discord-bot)"
    r"(?:\.sh|\.py)?\b",
    # 5. Specific axentx repo identifiers — model shouldn't reproduce these
    r"axentx/surrogate-1-(?:training-pairs|pairs-[A-D]|coder-[a-z0-9]+-lora-v\d+)",
    # 6. Token / secret-shaped strings (leaked credentials)
    r"\b(?:hf_[A-Za-z0-9]{30,}|sk-or-v\d-[A-Za-z0-9]{40,}|sk-ant-[A-Za-z0-9-]{30,}|"
    r"KGAT_[A-Za-z0-9]{30,}|csk-[A-Za-z0-9]{40,}|cpk_[A-Za-z0-9.]{40,}|"
    r"gsk_[A-Za-z0-9]{40,}|nvapi-[A-Za-z0-9_-]{40,}|fc-[a-f0-9]{32}|"
    r"ghp_[A-Za-z0-9]{30,}|sbp_[a-f0-9]{40}|cfut_[A-Za-z0-9]{30,}|"
    r"AIzaSy[A-Za-z0-9_-]{30,}|xai-[A-Za-z0-9]{40,}|r8_[A-Za-z0-9]{30,}|rnd_[A-Za-z0-9]{20,}|"
    r"sk-kimi-[A-Za-z0-9]{40,})\b",
    # 7. Common debug / introspection leakage (when LLM was asked to echo state)
    r"\b(?:LIGHTNING_USER_ID|LIGHTNING_API_KEY|HF_TOKEN|KAGGLE_API_TOKEN|KAGGLE_KEY|"
    r"OPENROUTER_API_KEY|CEREBRAS_API_KEY|GROQ_API_KEY|ANTHROPIC_API_KEY)\s*[=:]\s*['\"]?[A-Za-z0-9_-]{20,}",
    # 8. Discord webhook URLs
    r"https://discord\.com/api/webhooks/\d+/[A-Za-z0-9_-]+",
    # 9. Internal commit messages (from daemons pushing to HF)
    r"^(?:enriched|mirror|chunk):\s+",
    r"^train-ready pusher:",
    r"^clean mirror(?:\s+final)?:",
    # 10. JWT-shaped strings (NVIDIA Brev tokens, etc.): three base64url
    #     segments where header and payload both start with "eyJ" (base64 of
    #     '{"'). The previous minimum lengths (50/100/40) were longer than an
    #     ordinary HS256/RS256 token, whose header segment is 36 characters,
    #     so standard JWTs were never flagged.
    r"\beyJ[A-Za-z0-9_-]{10,}\.eyJ[A-Za-z0-9_-]{10,}\.[A-Za-z0-9_=-]{20,}",
]

# MULTILINE lets the "^"-anchored patterns match at the start of any line;
# IGNORECASE applies to every pattern, including the token shapes.
POLLUTION_RE = re.compile("|".join(f"(?:{p})" for p in POLLUTION_PATTERNS),
                          re.MULTILINE | re.IGNORECASE)
def is_polluted(text: str) -> tuple[bool, str | None]:
    """Check *text* against the combined pollution regex.

    Returns ``(True, snippet)`` when ``POLLUTION_RE`` matches, where
    ``snippet`` is the matched substring cut to at most 120 characters so the
    caller can log which kind of pollution caused the drop. Returns
    ``(False, None)`` when nothing matches, or when *text* is empty or not a
    string.
    """
    if text and isinstance(text, str):
        hit = POLLUTION_RE.search(text)
        if hit is not None:
            return True, hit.group(0)[:120]
    return False, None
def is_polluted_pair(prompt: str, response: str) -> tuple[bool, str | None]:
    """Run ``is_polluted`` on the prompt, then on the response.

    The first polluted field wins: the result is ``(True, "<field>: <snippet>")``
    with ``<field>`` being ``prompt`` or ``response``. A clean pair yields
    ``(False, None)``.
    """
    for label, value in (("prompt", prompt), ("response", response)):
        bad, snippet = is_polluted(value)
        if bad:
            return True, f"{label}: {snippet}"
    return False, None
# Optional: PII regex set (apply alongside the pollution patterns).
PII_PATTERNS = [
    # Email. The TLD class used to be "[A-Z|a-z]", which also accepted a
    # literal "|" character; it is letters only now.
    r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
    # Phone with a leading 1-3 digit country/trunk prefix (unchanged).
    r"\b\+?\d{1,3}[\s.-]?\(?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b",
    # Phone, national 10-digit form without a country prefix, e.g.
    # "415-555-2671" or "(415) 555-2671". The pattern above never matched
    # these because its prefix is mandatory. A separator (or parentheses) is
    # required after the area code so bare 10-digit integers such as epoch
    # timestamps are not flagged.
    r"(?<!\w)(?:\(\d{3}\)[\s.-]?|\d{3}[\s.-])\d{3}[\s.-]?\d{4}\b",
    # SSN
    r"\b\d{3}-\d{2}-\d{4}\b",
    # AWS keys
    r"\bAKIA[0-9A-Z]{16}\b",
    # Stripe keys
    r"\bsk_(?:test|live)_[A-Za-z0-9]{32,}\b",
]

# Each pattern is wrapped in its own group (same construction as the pollution
# regex) so a pattern containing a top-level "|" cannot bleed into its
# neighbours when they are OR-ed together.
PII_RE = re.compile("|".join(f"(?:{p})" for p in PII_PATTERNS), re.IGNORECASE)


def has_pii(text: str) -> bool:
    """Return True when *text* contains something matching ``PII_RE``.

    Empty values and non-string values (``None``, numbers, dicts, ...) are
    reported as PII-free instead of raising, so a malformed row cannot crash
    the ingest loop.
    """
    if not text or not isinstance(text, str):
        return False
    return PII_RE.search(text) is not None
+# Quality heuristics — drop if response is too short, identical to prompt, etc.
+def is_low_quality(prompt: str, response: str) -> tuple[bool, str | None]:
+    if not prompt or not response:
+        return True, "empty"
+    if len(prompt) < 20:
+        return True, "prompt_too_short"
+    if len(response) < 30:
+        return True, "response_too_short"
+    if response.strip().lower() == prompt.strip().lower():
+        return True, "response_equals_prompt"
+    # Detect when response is just an apology / refusal
+    if re.match(r"^\s*i('?m| am)?\s+(sorry|afraid|unable|cannot|cant|can't)\b",
+                response.strip(), re.IGNORECASE):
+        return True, "refusal"
+    # Repeated character spam
+    if re.search(r"(.)\1{50,}", response):
+        return True, "char_spam"
+    return False, None
def filter_pair(prompt: str, response: str) -> dict:
    """Decide whether one prompt/response row may enter the training set.

    Checks run in a fixed order — pollution, then PII, then quality — and the
    first failing check determines the verdict. The result always has the
    keys ``keep`` (bool), ``reason`` (str or None) and ``matched`` (str or
    None); ``matched`` is only filled in for pollution hits.
    """
    verdict = {"keep": False, "reason": None, "matched": None}

    dirty, snippet = is_polluted_pair(prompt, response)
    if dirty:
        verdict["reason"] = "polluted"
        verdict["matched"] = snippet
        return verdict

    if any(has_pii(field) for field in (prompt, response)):
        verdict["reason"] = "pii"
        return verdict

    rejected, why = is_low_quality(prompt, response)
    if rejected:
        verdict["reason"] = f"low_quality:{why}"
        return verdict

    verdict["keep"] = True
    return verdict
# CLI helper for testing: pipe a JSON row (or raw response text) on stdin, or
# run from a terminal to evaluate the built-in polluted sample.
if __name__ == "__main__":
    import sys, json

    if sys.stdin.isatty():
        # No piped input — fall back to a row reproducing the v1 leak.
        sample = """{"prompt": "fix bug", "response": "# generated via cerebras:llama3.1-8b\\nReadFile path /home/hermes/.surrogate/state/orchestrate/77426592/1-README.md"}"""
    else:
        sample = sys.stdin.read()

    if sample.strip().startswith("{"):
        obj = json.loads(sample)
    else:
        # Anything that is not a JSON object is treated as a bare response.
        obj = {"prompt": "test", "response": sample}

    v = filter_pair(obj.get("prompt", ""), obj.get("response", ""))
    print(json.dumps(v, indent=2))