anurag008w committed on
Commit
09e99f8
·
1 Parent(s): 0291162

Implement secret filename patterns for exclusion

Browse files

Added secret filename patterns to exclude sensitive files from syncing.

Files changed (1) hide show
  1. jupyter-devdata-sync.py +41 -2
jupyter-devdata-sync.py CHANGED
@@ -81,9 +81,48 @@ def repo_id(api) -> str:
81
  raise RuntimeError("Cannot resolve HF namespace for devdata sync")
82
  return f"{ns}/{DATASET_NAME}"
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  def should_skip(p: Path):
85
- parts = set(p.parts)
86
- return any(x in parts for x in EXCLUDE)
 
 
 
 
87
 
88
  def snapshot(src: Path, dst: Path):
89
  for p in src.rglob("*"):
 
81
  raise RuntimeError("Cannot resolve HF namespace for devdata sync")
82
  return f"{ns}/{DATASET_NAME}"
83
 
84
+ # Filename patterns that must never be synced to a public/private HF Dataset.
85
+ # These are matched against the *name* of each path component (not the full path),
86
+ # so ".env" matches /home/node/.env and /home/node/subdir/.env alike.
87
+ import fnmatch as _fnmatch
88
+
89
+ SECRET_FILENAME_PATTERNS = {
90
+ ".env", # dotenv files — almost always contain API keys
91
+ ".env.*", # .env.local, .env.production, etc.
92
+ "*secret*", # any file/dir whose name contains "secret"
93
+ "*secrets*",
94
+ "*_secret*",
95
+ "*-secret*",
96
+ "*key*", # private keys, API key files
97
+ "*_key*",
98
+ "*-key*",
99
+ "*token*", # token files
100
+ "*_token*",
101
+ "*-token*",
102
+ "*.pem", # TLS/SSH private keys
103
+ "*.key", # generic key files
104
+ "*.p12", # PKCS#12 bundles
105
+ "*.pfx",
106
+ "credentials", # common credential file names
107
+ "credentials.*",
108
+ ".netrc", # stores plaintext passwords
109
+ ".htpasswd",
110
+ }
111
+
112
+
113
+ def _name_is_secret(name: str) -> bool:
114
+ """Return True if *name* matches any secret-exclusion pattern."""
115
+ name_lower = name.lower()
116
+ return any(_fnmatch.fnmatch(name_lower, pat) for pat in SECRET_FILENAME_PATTERNS)
117
+
118
+
def should_skip(p: Path):
    """Return True if *p* must be left out of the sync.

    A path is skipped when any of its components appears in ``EXCLUDE`` or
    when any component's name matches a secret-filename pattern.
    """
    # NOTE(review): every component of p is inspected, including leading
    # directories if p is absolute - confirm what the caller passes in.
    components = p.parts
    # Hard-coded exclude set first; the secret-name check only runs when
    # nothing in EXCLUDE matched.
    in_exclude_set = any(entry in components for entry in EXCLUDE)
    return in_exclude_set or any(map(_name_is_secret, components))
126
 
127
  def snapshot(src: Path, dst: Path):
128
  for p in src.rglob("*"):