Spaces:
Running
Running
Commit ·
09e99f8
1
Parent(s): 0291162
Implement secret filename patterns for exclusion
Browse files
Added secret filename patterns to exclude sensitive files from syncing.
- jupyter-devdata-sync.py +41 -2
jupyter-devdata-sync.py
CHANGED
|
@@ -81,9 +81,48 @@ def repo_id(api) -> str:
|
|
| 81 |
raise RuntimeError("Cannot resolve HF namespace for devdata sync")
|
| 82 |
return f"{ns}/{DATASET_NAME}"
|
| 83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
def should_skip(p: Path):
|
| 85 |
-
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
def snapshot(src: Path, dst: Path):
|
| 89 |
for p in src.rglob("*"):
|
|
|
|
| 81 |
raise RuntimeError("Cannot resolve HF namespace for devdata sync")
|
| 82 |
return f"{ns}/{DATASET_NAME}"
|
| 83 |
|
| 84 |
+
# Filename patterns that must never be synced to a public/private HF Dataset.
|
| 85 |
+
# These are matched against the *name* of each path component (not the full path),
|
| 86 |
+
# so ".env" matches /home/node/.env and /home/node/subdir/.env alike.
|
| 87 |
+
import fnmatch as _fnmatch
|
| 88 |
+
|
| 89 |
+
# Glob patterns (fnmatch syntax) for file/directory names that must not be synced.
# Every entry must be lower-case: _name_is_secret() lower-cases the candidate
# name before matching, so an upper-case pattern could never match.
SECRET_FILENAME_PATTERNS = {
    # dotenv files — almost always contain API keys
    ".env",
    ".env.*",  # .env.local, .env.production, etc.
    # Substring matches. Each of these already covers the "_x", "-x" and
    # plural spellings, so those variants are not listed separately.
    # NOTE(review): "*key*" is deliberately broad and also hits unrelated
    # names such as "keyboard" or "monkey" — confirm that trade-off is wanted.
    "*secret*",  # any file/dir whose name contains "secret"
    "*key*",  # private keys, API key files
    "*token*",  # token files
    # Key material identified by extension.
    "*.pem",  # TLS/SSH private keys
    "*.key",  # generic key files (kept explicit in case "*key*" is narrowed)
    "*.p12",  # PKCS#12 bundles
    "*.pfx",
    # OpenSSH default private-key file names — these carry no extension,
    # so none of the extension patterns above would catch them.
    "id_rsa",
    "id_dsa",
    "id_ecdsa",
    "id_ed25519",
    # Common credential file names.
    "credentials",
    "credentials.*",
    ".git-credentials",  # git credential store: plaintext tokens
    ".netrc",  # stores plaintext passwords
    ".htpasswd",
}
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def _name_is_secret(name: str) -> bool:
    """Tell whether a single path component looks like a secret.

    *name* is lower-cased and then compared, glob-style, against each entry
    of SECRET_FILENAME_PATTERNS. Returns True on the first pattern that
    matches and False when none do.
    """
    candidate = name.lower()
    for pattern in SECRET_FILENAME_PATTERNS:
        if _fnmatch.fnmatch(candidate, pattern):
            return True
    return False
|
| 117 |
+
|
| 118 |
+
|
| 119 |
def should_skip(p: Path, *, root: "Path | None" = None) -> bool:
    """Return True if *p* must be left out of the sync.

    A path is skipped when any of its components is listed in EXCLUDE, or
    when any component's name matches a secret-filename pattern.

    Args:
        p: The path to test.
        root: Optional sync root. When given and *p* lies beneath it, only
            the components below *root* are examined. Without it every
            component of *p* is examined, so an ancestor directory whose
            name happens to match a pattern (e.g. "/home/mickey" matching
            "*key*") would cause the whole tree to be skipped.

    Returns:
        True when the path should be skipped, False otherwise.
    """
    parts = p.parts
    if root is not None:
        try:
            parts = p.relative_to(root).parts
        except ValueError:
            # p is not located under root; keep checking the full path.
            pass

    # Skip directories/files in the hard-coded exclude set.
    if any(x in parts for x in EXCLUDE):
        return True

    # Skip any component whose name looks like a secret file/dir.
    return any(_name_is_secret(part) for part in parts)
|
| 126 |
|
| 127 |
def snapshot(src: Path, dst: Path):
|
| 128 |
for p in src.rglob("*"):
|