Farhan Beg committed on
Commit
23ba9c5
·
1 Parent(s): b2e073e

perf(hermes-sync): change-driven backups (poll/debounce, 60s ceiling)

Browse files

* Replace fixed 600s sleep loop with a 2s stat-only watcher
* Upload 3s after the watched tree settles instead of waiting for the next tick
* Hard ceiling at SYNC_INTERVAL=60s so a constantly-busy session still snapshots
* New tunables: SYNC_POLL_INTERVAL, SYNC_DEBOUNCE_SECONDS
* Reduces the typical data-loss window on restart from ~10 minutes to a few seconds (poll + debounce); the worst case under constant writes is the 60s ceiling

Files changed (3) hide show
  1. .env.example +43 -38
  2. hermes-sync.py +418 -343
  3. start.sh +2 -2
.env.example CHANGED
@@ -1,38 +1,43 @@
1
- # ── Required ──────────────────────────────────────────────────────────
2
- GATEWAY_TOKEN=change-me-to-a-strong-random-string
3
- LLM_API_KEY=your-llm-provider-api-key
4
- LLM_MODEL=openrouter/anthropic/claude-sonnet-4
5
-
6
- # Examples for other providers:
7
- # LLM_MODEL=openai/gpt-4o
8
- # LLM_MODEL=anthropic/claude-sonnet-4-6
9
- # LLM_MODEL=google/gemini-2.5-flash
10
- # LLM_MODEL=deepseek/deepseek-chat
11
- # LLM_MODEL=huggingface/Qwen/Qwen3-235B-A22B-Thinking-2507
12
-
13
- # ── HF Dataset persistence (recommended) ──────────────────────────────
14
- # Write-scope token: https://huggingface.co/settings/tokens
15
- # HF_TOKEN=hf_xxx
16
- # BACKUP_DATASET_NAME=huggingmes-backup
17
- # SYNC_INTERVAL=600
18
-
19
- # ── Optional: Cloudflare proxy + keep-alive ───────────────────────────
20
- # CLOUDFLARE_WORKERS_TOKEN=cf_xxx
21
- # CLOUDFLARE_ACCOUNT_ID=
22
- # CLOUDFLARE_KEEPALIVE_CRON=*/10 * * * *
23
-
24
- # ── Optional: Telegram bridge ─────────────────────────────────────────
25
- # TELEGRAM_BOT_TOKEN=123456:ABC
26
- # TELEGRAM_ALLOWED_USERS=11111111,22222222
27
- # TELEGRAM_MODE=webhook
28
-
29
- # ── Optional: swap the landing page to the HuggingMes status page ─────
30
- # PRIMARY_UI=dashboard
31
-
32
- # ── Optional: custom OpenAI-compatible endpoint ───────────────────────
33
- # CUSTOM_BASE_URL=http://localhost:11434/v1
34
- # CUSTOM_MODEL_CONTEXT_LENGTH=131072
35
- # CUSTOM_MODEL_MAX_TOKENS=8192
36
-
37
- # ── Reproducibility: pin the Hermes Agent base image ──────────────────
38
- # HERMES_AGENT_VERSION=latest
 
 
 
 
 
 
1
+ # ── Required ──────────────────────────────────────────────────────────
2
+ GATEWAY_TOKEN=change-me-to-a-strong-random-string
3
+ LLM_API_KEY=your-llm-provider-api-key
4
+ LLM_MODEL=openrouter/anthropic/claude-sonnet-4
5
+
6
+ # Examples for other providers:
7
+ # LLM_MODEL=openai/gpt-4o
8
+ # LLM_MODEL=anthropic/claude-sonnet-4-6
9
+ # LLM_MODEL=google/gemini-2.5-flash
10
+ # LLM_MODEL=deepseek/deepseek-chat
11
+ # LLM_MODEL=huggingface/Qwen/Qwen3-235B-A22B-Thinking-2507
12
+
13
+ # ── HF Dataset persistence (recommended) ──────────────────────────────
14
+ # Write-scope token: https://huggingface.co/settings/tokens
15
+ # HF_TOKEN=hf_xxx
16
+ # BACKUP_DATASET_NAME=huggingmes-backup
17
+ # Backup is change-driven: every push happens after the watched dirs have been
18
+ # quiet for SYNC_DEBOUNCE_SECONDS, capped at SYNC_INTERVAL even under constant
19
+ # writes. Lower these only if you understand the trade-off; lower = more pushes.
20
+ # SYNC_POLL_INTERVAL=2
21
+ # SYNC_DEBOUNCE_SECONDS=3
22
+ # SYNC_INTERVAL=60
23
+
24
+ # ── Optional: Cloudflare proxy + keep-alive ───────────────────────────
25
+ # CLOUDFLARE_WORKERS_TOKEN=cf_xxx
26
+ # CLOUDFLARE_ACCOUNT_ID=
27
+ # CLOUDFLARE_KEEPALIVE_CRON=*/10 * * * *
28
+
29
+ # ── Optional: Telegram bridge ─────────────────────────────────────────
30
+ # TELEGRAM_BOT_TOKEN=123456:ABC
31
+ # TELEGRAM_ALLOWED_USERS=11111111,22222222
32
+ # TELEGRAM_MODE=webhook
33
+
34
+ # ── Optional: swap the landing page to the HuggingMes status page ─────
35
+ # PRIMARY_UI=dashboard
36
+
37
+ # ── Optional: custom OpenAI-compatible endpoint ───────────────────────
38
+ # CUSTOM_BASE_URL=http://localhost:11434/v1
39
+ # CUSTOM_MODEL_CONTEXT_LENGTH=131072
40
+ # CUSTOM_MODEL_MAX_TOKENS=8192
41
+
42
+ # ── Reproducibility: pin the Hermes Agent base image ──────────────────
43
+ # HERMES_AGENT_VERSION=latest
hermes-sync.py CHANGED
@@ -1,343 +1,418 @@
1
- #!/usr/bin/env python3
2
- """HuggingMes Hermes state backup via Hugging Face Datasets.
3
-
4
- Vendored verbatim from github.com/somratpro/HuggingMes.
5
- Backs up HERMES_HOME (which includes /opt/data/webui — the hermes-webui state dir)
6
- so sessions, profiles, skills, cron, memory, and workspace all survive restarts.
7
- """
8
-
9
- import hashlib
10
- import json
11
- import logging
12
- import os
13
- import shutil
14
- import signal
15
- import random
16
- import socket
17
- import sys
18
- import tempfile
19
- import threading
20
- import time
21
- from pathlib import Path
22
-
23
- os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
24
- os.environ.setdefault("HF_HUB_VERBOSITY", "error")
25
- os.environ.setdefault("HF_HUB_DOWNLOAD_TIMEOUT", "300")
26
- os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
27
-
28
- from huggingface_hub import HfApi, snapshot_download, upload_folder
29
- from huggingface_hub.errors import HfHubHTTPError, RepositoryNotFoundError
30
-
31
- logging.getLogger("huggingface_hub").setLevel(logging.ERROR)
32
-
33
- HERMES_HOME = Path(os.environ.get("HERMES_HOME", "/opt/data"))
34
- STATUS_FILE = Path("/tmp/huggingmes-sync-status.json")
35
- STATE_FILE = HERMES_HOME / ".huggingmes-sync-state.json"
36
- INTERVAL = int(os.environ.get("SYNC_INTERVAL", "600"))
37
- INITIAL_DELAY = int(os.environ.get("SYNC_START_DELAY", "10"))
38
- HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
39
- HF_USERNAME = os.environ.get("HF_USERNAME", "").strip()
40
- SPACE_AUTHOR_NAME = os.environ.get("SPACE_AUTHOR_NAME", "").strip()
41
- BACKUP_DATASET_NAME = os.environ.get("BACKUP_DATASET_NAME", "huggingmes-backup").strip()
42
- INCLUDE_ENV = os.environ.get("SYNC_INCLUDE_ENV", "").strip().lower() in {"1", "true", "yes"}
43
- MAX_FILE_SIZE_BYTES = int(os.environ.get("SYNC_MAX_FILE_BYTES", str(50 * 1024 * 1024)))
44
-
45
- EXCLUDED_DIRS = {
46
- ".cache",
47
- ".git",
48
- ".npm",
49
- ".venv",
50
- "__pycache__",
51
- "node_modules",
52
- "venv",
53
- "logs", # log files are useless after a restart
54
- }
55
- EXCLUDED_TOP_LEVEL = {"logs", STATE_FILE.name}
56
- EXCLUDED_SUFFIXES = (
57
- ".log", ".log.1", ".log.2",
58
- ".db-shm", ".db-wal", ".db-journal",
59
- ".pid", ".tmp",
60
- )
61
- if not INCLUDE_ENV:
62
- EXCLUDED_TOP_LEVEL.add(".env")
63
-
64
- HF_API = HfApi(token=HF_TOKEN) if HF_TOKEN else None
65
- STOP_EVENT = threading.Event()
66
- _REPO_ID_CACHE: str | None = None
67
-
68
-
69
- def write_status(status: str, message: str, fingerprint: str | None = None, marker: tuple[int, int, int] | None = None) -> None:
70
- timestamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
71
- payload = {"status": status, "message": message, "timestamp": timestamp}
72
-
73
- tmp_path = STATUS_FILE.with_suffix(".tmp")
74
- try:
75
- tmp_path.write_text(json.dumps(payload), encoding="utf-8")
76
- tmp_path.replace(STATUS_FILE)
77
- except OSError:
78
- pass
79
-
80
- if fingerprint or marker:
81
- state = {}
82
- if STATE_FILE.exists():
83
- try:
84
- state = json.loads(STATE_FILE.read_text(encoding="utf-8"))
85
- except Exception:
86
- pass
87
- if fingerprint:
88
- state["last_fingerprint"] = fingerprint
89
- if marker:
90
- state["last_marker"] = list(marker)
91
- state["last_sync"] = timestamp
92
- try:
93
- STATE_FILE.write_text(json.dumps(state), encoding="utf-8")
94
- except OSError:
95
- pass
96
-
97
-
98
- def resolve_backup_repo() -> str:
99
- global _REPO_ID_CACHE
100
- if _REPO_ID_CACHE:
101
- return _REPO_ID_CACHE
102
-
103
- namespace = HF_USERNAME or SPACE_AUTHOR_NAME
104
- if not namespace and HF_API is not None:
105
- whoami = HF_API.whoami()
106
- namespace = whoami.get("name") or whoami.get("user") or ""
107
-
108
- namespace = str(namespace).strip()
109
- if not namespace:
110
- raise RuntimeError("Could not determine HF username. Set HF_USERNAME or use an account HF_TOKEN.")
111
-
112
- _REPO_ID_CACHE = f"{namespace}/{BACKUP_DATASET_NAME}"
113
- return _REPO_ID_CACHE
114
-
115
-
116
- def ensure_repo_exists() -> str:
117
- repo_id = resolve_backup_repo()
118
- try:
119
- HF_API.repo_info(repo_id=repo_id, repo_type="dataset")
120
- except RepositoryNotFoundError:
121
- HF_API.create_repo(repo_id=repo_id, repo_type="dataset", private=True)
122
- return repo_id
123
-
124
-
125
- def should_exclude(rel_posix: str, path: Path) -> bool:
126
- parts = Path(rel_posix).parts
127
- if not parts:
128
- return False
129
- if parts[0] in EXCLUDED_TOP_LEVEL:
130
- return True
131
- if any(part in EXCLUDED_DIRS for part in parts):
132
- return True
133
- if path.is_file():
134
- name_lower = path.name.lower()
135
- if name_lower.endswith(EXCLUDED_SUFFIXES):
136
- return True
137
- try:
138
- return path.stat().st_size > MAX_FILE_SIZE_BYTES
139
- except OSError:
140
- return True
141
- return False
142
-
143
-
144
- def metadata_marker(root: Path) -> tuple[int, int, int]:
145
- if not root.exists():
146
- return (0, 0, 0)
147
- file_count = 0
148
- total_size = 0
149
- newest_mtime = 0
150
- for path in root.rglob("*"):
151
- if not path.is_file():
152
- continue
153
- rel = path.relative_to(root).as_posix()
154
- if should_exclude(rel, path):
155
- continue
156
- try:
157
- stat = path.stat()
158
- except OSError:
159
- continue
160
- file_count += 1
161
- total_size += int(stat.st_size)
162
- newest_mtime = max(newest_mtime, int(stat.st_mtime_ns))
163
- return (file_count, total_size, newest_mtime)
164
-
165
-
166
- def fingerprint_dir(root: Path) -> str:
167
- hasher = hashlib.sha256()
168
- if not root.exists():
169
- return hasher.hexdigest()
170
- for path in sorted(p for p in root.rglob("*") if p.is_file()):
171
- rel = path.relative_to(root).as_posix()
172
- if should_exclude(rel, path):
173
- continue
174
- hasher.update(rel.encode("utf-8"))
175
- with path.open("rb") as handle:
176
- for chunk in iter(lambda: handle.read(1024 * 1024), b""):
177
- hasher.update(chunk)
178
- return hasher.hexdigest()
179
-
180
-
181
- def create_snapshot_dir(source_root: Path) -> Path:
182
- staging_root = Path(tempfile.mkdtemp(prefix="huggingmes-sync-"))
183
- for path in sorted(source_root.rglob("*")):
184
- rel = path.relative_to(source_root)
185
- rel_posix = rel.as_posix()
186
- if should_exclude(rel_posix, path):
187
- continue
188
- target = staging_root / rel
189
- if path.is_dir():
190
- target.mkdir(parents=True, exist_ok=True)
191
- continue
192
- target.parent.mkdir(parents=True, exist_ok=True)
193
- try:
194
- shutil.copy2(path, target)
195
- except OSError:
196
- continue
197
- return staging_root
198
-
199
-
200
- def restore() -> bool:
201
- if not HF_TOKEN:
202
- write_status("disabled", "HF_TOKEN is not configured.")
203
- return False
204
-
205
- repo_id = resolve_backup_repo()
206
- write_status("restoring", f"Restoring Hermes state from {repo_id}")
207
- try:
208
- with tempfile.TemporaryDirectory() as tmpdir:
209
- snapshot_download(repo_id=repo_id, repo_type="dataset", token=HF_TOKEN, local_dir=tmpdir)
210
- tmp_path = Path(tmpdir)
211
- if not any(tmp_path.iterdir()):
212
- write_status("fresh", "Backup dataset is empty. Starting fresh.")
213
- return True
214
-
215
- HERMES_HOME.mkdir(parents=True, exist_ok=True)
216
- for child in tmp_path.iterdir():
217
- if should_exclude(child.name, child):
218
- continue
219
- target = HERMES_HOME / child.name
220
- if target.is_dir():
221
- shutil.rmtree(target, ignore_errors=True)
222
- elif target.exists():
223
- target.unlink()
224
- if child.is_dir():
225
- shutil.copytree(child, target)
226
- else:
227
- shutil.copy2(child, target)
228
-
229
- write_status("restored", f"Restored Hermes state from {repo_id}")
230
- return True
231
- except RepositoryNotFoundError:
232
- write_status("fresh", f"Backup dataset {repo_id} does not exist yet.")
233
- return True
234
- except HfHubHTTPError as exc:
235
- if exc.response is not None and exc.response.status_code == 404:
236
- write_status("fresh", f"Backup dataset {repo_id} does not exist yet.")
237
- return True
238
- write_status("error", f"Restore failed: {exc}")
239
- print(f"Restore failed: {exc}", file=sys.stderr)
240
- return False
241
- except Exception as exc:
242
- write_status("error", f"Restore failed: {exc}")
243
- print(f"Restore failed: {exc}", file=sys.stderr)
244
- return False
245
-
246
-
247
- def sync_once(last_fingerprint: str | None = None, last_marker: tuple[int, int, int] | None = None):
248
- if last_fingerprint is None and last_marker is None:
249
- if STATE_FILE.exists():
250
- try:
251
- state = json.loads(STATE_FILE.read_text(encoding="utf-8"))
252
- last_fingerprint = state.get("last_fingerprint")
253
- m = state.get("last_marker")
254
- if m and len(m) == 3:
255
- last_marker = tuple(m)
256
- except Exception:
257
- pass
258
-
259
- repo_id = ensure_repo_exists()
260
- current_marker = metadata_marker(HERMES_HOME)
261
- if last_marker is not None and current_marker == last_marker:
262
- write_status("synced", "No Hermes state changes detected (marker match).")
263
- return (last_fingerprint or "", current_marker)
264
-
265
- current_fingerprint = fingerprint_dir(HERMES_HOME)
266
- if last_fingerprint is not None and current_fingerprint == last_fingerprint:
267
- write_status("synced", "No Hermes state changes detected (fingerprint match).")
268
- return (last_fingerprint, current_marker)
269
-
270
- hostname = socket.gethostname()
271
- write_status("syncing", f"Uploading Hermes state to {repo_id} from {hostname}")
272
- snapshot_dir = create_snapshot_dir(HERMES_HOME)
273
- try:
274
- upload_folder(
275
- folder_path=str(snapshot_dir),
276
- repo_id=repo_id,
277
- repo_type="dataset",
278
- token=HF_TOKEN,
279
- commit_message=f"HuggingMes sync [{hostname}] {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
280
- ignore_patterns=[".git/*", ".git"],
281
- )
282
- finally:
283
- shutil.rmtree(snapshot_dir, ignore_errors=True)
284
-
285
- write_status("success", f"Uploaded Hermes state to {repo_id}", fingerprint=current_fingerprint, marker=current_marker)
286
- return (current_fingerprint, current_marker)
287
-
288
-
289
- def handle_signal(_sig, _frame) -> None:
290
- STOP_EVENT.set()
291
-
292
-
293
- def loop() -> int:
294
- signal.signal(signal.SIGTERM, handle_signal)
295
- signal.signal(signal.SIGINT, handle_signal)
296
- try:
297
- repo_id = resolve_backup_repo()
298
- write_status("configured", f"Backup loop active for {repo_id} with {INTERVAL}s interval.")
299
- except Exception as exc:
300
- write_status("error", str(exc))
301
- print(f"Hermes sync error: {exc}")
302
- return 1
303
-
304
- last_fingerprint = fingerprint_dir(HERMES_HOME)
305
- last_marker = metadata_marker(HERMES_HOME)
306
- time.sleep(INITIAL_DELAY)
307
- print(f"Hermes state sync started: every {INTERVAL}s -> {repo_id}")
308
-
309
- while not STOP_EVENT.is_set():
310
- try:
311
- last_fingerprint, last_marker = sync_once(last_fingerprint, last_marker)
312
- except Exception as exc:
313
- write_status("error", f"Sync failed: {exc}")
314
- print(f"Hermes sync failed: {exc}")
315
- jitter = random.uniform(0.9, 1.1)
316
- if STOP_EVENT.wait(INTERVAL * jitter):
317
- break
318
- return 0
319
-
320
-
321
- def main() -> int:
322
- HERMES_HOME.mkdir(parents=True, exist_ok=True)
323
- if len(sys.argv) < 2:
324
- return loop()
325
- command = sys.argv[1]
326
- if command == "restore":
327
- return 0 if restore() else 1
328
- if command == "sync-once":
329
- try:
330
- sync_once()
331
- return 0
332
- except Exception as exc:
333
- write_status("error", f"Shutdown sync failed: {exc}")
334
- print(f"Hermes sync: shutdown sync failed: {exc}")
335
- return 1
336
- if command == "loop":
337
- return loop()
338
- print(f"Unknown command: {command}", file=sys.stderr)
339
- return 1
340
-
341
-
342
- if __name__ == "__main__":
343
- raise SystemExit(main())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """HuggingMes Hermes state backup via Hugging Face Datasets.
3
+
4
+ Vendored verbatim from github.com/somratpro/HuggingMes.
5
+ Backs up HERMES_HOME (which includes /opt/data/webui — the hermes-webui state dir)
6
+ so sessions, profiles, skills, cron, memory, and workspace all survive restarts.
7
+ """
8
+
9
+ import hashlib
10
+ import json
11
+ import logging
12
+ import os
13
+ import shutil
14
+ import signal
15
+ import socket
16
+ import sys
17
+ import tempfile
18
+ import threading
19
+ import time
20
+ from pathlib import Path
21
+
22
+ os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
23
+ os.environ.setdefault("HF_HUB_VERBOSITY", "error")
24
+ os.environ.setdefault("HF_HUB_DOWNLOAD_TIMEOUT", "300")
25
+ os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
26
+
27
+ from huggingface_hub import HfApi, snapshot_download, upload_folder
28
+ from huggingface_hub.errors import HfHubHTTPError, RepositoryNotFoundError
29
+
30
+ logging.getLogger("huggingface_hub").setLevel(logging.ERROR)
31
+
32
+ HERMES_HOME = Path(os.environ.get("HERMES_HOME", "/opt/data"))
33
+ STATUS_FILE = Path("/tmp/huggingmes-sync-status.json")
34
+ STATE_FILE = HERMES_HOME / ".huggingmes-sync-state.json"
35
+ INTERVAL = int(os.environ.get("SYNC_INTERVAL", "60"))
36
+ INITIAL_DELAY = int(os.environ.get("SYNC_START_DELAY", "5"))
37
+ # Change-driven settings: the loop polls cheap stat metadata every POLL_INTERVAL
38
+ # seconds, and once a change is observed waits DEBOUNCE_SECONDS of quiet before
39
+ # uploading. INTERVAL acts only as a hard ceiling — even if writes never settle,
40
+ # a sync is forced after INTERVAL seconds. This keeps the worst-case data loss
41
+ # window well under a minute without uploading on every keystroke.
42
+ POLL_INTERVAL = float(os.environ.get("SYNC_POLL_INTERVAL", "2"))
43
+ DEBOUNCE_SECONDS = float(os.environ.get("SYNC_DEBOUNCE_SECONDS", "3"))
44
+ HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
45
+ HF_USERNAME = os.environ.get("HF_USERNAME", "").strip()
46
+ SPACE_AUTHOR_NAME = os.environ.get("SPACE_AUTHOR_NAME", "").strip()
47
+ BACKUP_DATASET_NAME = os.environ.get("BACKUP_DATASET_NAME", "huggingmes-backup").strip()
48
+ INCLUDE_ENV = os.environ.get("SYNC_INCLUDE_ENV", "").strip().lower() in {"1", "true", "yes"}
49
+ MAX_FILE_SIZE_BYTES = int(os.environ.get("SYNC_MAX_FILE_BYTES", str(50 * 1024 * 1024)))
50
+
51
+ EXCLUDED_DIRS = {
52
+ ".cache",
53
+ ".git",
54
+ ".npm",
55
+ ".venv",
56
+ "__pycache__",
57
+ "node_modules",
58
+ "venv",
59
+ "logs", # log files are useless after a restart
60
+ }
61
+ EXCLUDED_TOP_LEVEL = {"logs", STATE_FILE.name}
62
+ EXCLUDED_SUFFIXES = (
63
+ ".log", ".log.1", ".log.2",
64
+ ".db-shm", ".db-wal", ".db-journal",
65
+ ".pid", ".tmp",
66
+ )
67
+ if not INCLUDE_ENV:
68
+ EXCLUDED_TOP_LEVEL.add(".env")
69
+
70
+ HF_API = HfApi(token=HF_TOKEN) if HF_TOKEN else None
71
+ STOP_EVENT = threading.Event()
72
+ _REPO_ID_CACHE: str | None = None
73
+
74
+
75
+ def write_status(status: str, message: str, fingerprint: str | None = None, marker: tuple[int, int, int] | None = None) -> None:
76
+ timestamp = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
77
+ payload = {"status": status, "message": message, "timestamp": timestamp}
78
+
79
+ tmp_path = STATUS_FILE.with_suffix(".tmp")
80
+ try:
81
+ tmp_path.write_text(json.dumps(payload), encoding="utf-8")
82
+ tmp_path.replace(STATUS_FILE)
83
+ except OSError:
84
+ pass
85
+
86
+ if fingerprint or marker:
87
+ state = {}
88
+ if STATE_FILE.exists():
89
+ try:
90
+ state = json.loads(STATE_FILE.read_text(encoding="utf-8"))
91
+ except Exception:
92
+ pass
93
+ if fingerprint:
94
+ state["last_fingerprint"] = fingerprint
95
+ if marker:
96
+ state["last_marker"] = list(marker)
97
+ state["last_sync"] = timestamp
98
+ try:
99
+ STATE_FILE.write_text(json.dumps(state), encoding="utf-8")
100
+ except OSError:
101
+ pass
102
+
103
+
104
+ def resolve_backup_repo() -> str:
105
+ global _REPO_ID_CACHE
106
+ if _REPO_ID_CACHE:
107
+ return _REPO_ID_CACHE
108
+
109
+ namespace = HF_USERNAME or SPACE_AUTHOR_NAME
110
+ if not namespace and HF_API is not None:
111
+ whoami = HF_API.whoami()
112
+ namespace = whoami.get("name") or whoami.get("user") or ""
113
+
114
+ namespace = str(namespace).strip()
115
+ if not namespace:
116
+ raise RuntimeError("Could not determine HF username. Set HF_USERNAME or use an account HF_TOKEN.")
117
+
118
+ _REPO_ID_CACHE = f"{namespace}/{BACKUP_DATASET_NAME}"
119
+ return _REPO_ID_CACHE
120
+
121
+
122
+ def ensure_repo_exists() -> str:
123
+ repo_id = resolve_backup_repo()
124
+ try:
125
+ HF_API.repo_info(repo_id=repo_id, repo_type="dataset")
126
+ except RepositoryNotFoundError:
127
+ HF_API.create_repo(repo_id=repo_id, repo_type="dataset", private=True)
128
+ return repo_id
129
+
130
+
131
+ def should_exclude(rel_posix: str, path: Path) -> bool:
132
+ parts = Path(rel_posix).parts
133
+ if not parts:
134
+ return False
135
+ if parts[0] in EXCLUDED_TOP_LEVEL:
136
+ return True
137
+ if any(part in EXCLUDED_DIRS for part in parts):
138
+ return True
139
+ if path.is_file():
140
+ name_lower = path.name.lower()
141
+ if name_lower.endswith(EXCLUDED_SUFFIXES):
142
+ return True
143
+ try:
144
+ return path.stat().st_size > MAX_FILE_SIZE_BYTES
145
+ except OSError:
146
+ return True
147
+ return False
148
+
149
+
150
+ def metadata_marker(root: Path) -> tuple[int, int, int]:
151
+ if not root.exists():
152
+ return (0, 0, 0)
153
+ file_count = 0
154
+ total_size = 0
155
+ newest_mtime = 0
156
+ for path in root.rglob("*"):
157
+ if not path.is_file():
158
+ continue
159
+ rel = path.relative_to(root).as_posix()
160
+ if should_exclude(rel, path):
161
+ continue
162
+ try:
163
+ stat = path.stat()
164
+ except OSError:
165
+ continue
166
+ file_count += 1
167
+ total_size += int(stat.st_size)
168
+ newest_mtime = max(newest_mtime, int(stat.st_mtime_ns))
169
+ return (file_count, total_size, newest_mtime)
170
+
171
+
172
def fingerprint_dir(root: Path) -> str:
    """Return a SHA-256 hex digest over every backed-up file under *root*.

    Files are visited in sorted path order. For each file that
    ``should_exclude`` does not reject, the POSIX-style relative path and then
    the file's bytes are fed into the hash. A missing *root* yields the digest
    of empty input.

    A file that disappears or becomes unreadable between the directory listing
    and the read is skipped instead of aborting the whole fingerprint. This
    mirrors ``metadata_marker`` and ``create_snapshot_dir``, which also skip
    files they cannot stat or copy, so one racing writer cannot fail a sync.
    """
    hasher = hashlib.sha256()
    if not root.exists():
        return hasher.hexdigest()
    for path in sorted(p for p in root.rglob("*") if p.is_file()):
        rel = path.relative_to(root).as_posix()
        if should_exclude(rel, path):
            continue
        # Hash into a copy and only commit it once the file was read in full,
        # so a file that fails halfway leaves no partial bytes in the digest.
        trial = hasher.copy()
        trial.update(rel.encode("utf-8"))
        try:
            with path.open("rb") as handle:
                for chunk in iter(lambda: handle.read(1024 * 1024), b""):
                    trial.update(chunk)
        except OSError:
            # Vanished or unreadable mid-scan: treat it as not part of the tree.
            continue
        hasher = trial
    return hasher.hexdigest()
185
+
186
+
187
+ def create_snapshot_dir(source_root: Path) -> Path:
188
+ staging_root = Path(tempfile.mkdtemp(prefix="huggingmes-sync-"))
189
+ for path in sorted(source_root.rglob("*")):
190
+ rel = path.relative_to(source_root)
191
+ rel_posix = rel.as_posix()
192
+ if should_exclude(rel_posix, path):
193
+ continue
194
+ target = staging_root / rel
195
+ if path.is_dir():
196
+ target.mkdir(parents=True, exist_ok=True)
197
+ continue
198
+ target.parent.mkdir(parents=True, exist_ok=True)
199
+ try:
200
+ shutil.copy2(path, target)
201
+ except OSError:
202
+ continue
203
+ return staging_root
204
+
205
+
206
+ def restore() -> bool:
207
+ if not HF_TOKEN:
208
+ write_status("disabled", "HF_TOKEN is not configured.")
209
+ return False
210
+
211
+ repo_id = resolve_backup_repo()
212
+ write_status("restoring", f"Restoring Hermes state from {repo_id}")
213
+ try:
214
+ with tempfile.TemporaryDirectory() as tmpdir:
215
+ snapshot_download(repo_id=repo_id, repo_type="dataset", token=HF_TOKEN, local_dir=tmpdir)
216
+ tmp_path = Path(tmpdir)
217
+ if not any(tmp_path.iterdir()):
218
+ write_status("fresh", "Backup dataset is empty. Starting fresh.")
219
+ return True
220
+
221
+ HERMES_HOME.mkdir(parents=True, exist_ok=True)
222
+ for child in tmp_path.iterdir():
223
+ if should_exclude(child.name, child):
224
+ continue
225
+ target = HERMES_HOME / child.name
226
+ if target.is_dir():
227
+ shutil.rmtree(target, ignore_errors=True)
228
+ elif target.exists():
229
+ target.unlink()
230
+ if child.is_dir():
231
+ shutil.copytree(child, target)
232
+ else:
233
+ shutil.copy2(child, target)
234
+
235
+ write_status("restored", f"Restored Hermes state from {repo_id}")
236
+ return True
237
+ except RepositoryNotFoundError:
238
+ write_status("fresh", f"Backup dataset {repo_id} does not exist yet.")
239
+ return True
240
+ except HfHubHTTPError as exc:
241
+ if exc.response is not None and exc.response.status_code == 404:
242
+ write_status("fresh", f"Backup dataset {repo_id} does not exist yet.")
243
+ return True
244
+ write_status("error", f"Restore failed: {exc}")
245
+ print(f"Restore failed: {exc}", file=sys.stderr)
246
+ return False
247
+ except Exception as exc:
248
+ write_status("error", f"Restore failed: {exc}")
249
+ print(f"Restore failed: {exc}", file=sys.stderr)
250
+ return False
251
+
252
+
253
+ def sync_once(last_fingerprint: str | None = None, last_marker: tuple[int, int, int] | None = None):
254
+ if last_fingerprint is None and last_marker is None:
255
+ if STATE_FILE.exists():
256
+ try:
257
+ state = json.loads(STATE_FILE.read_text(encoding="utf-8"))
258
+ last_fingerprint = state.get("last_fingerprint")
259
+ m = state.get("last_marker")
260
+ if m and len(m) == 3:
261
+ last_marker = tuple(m)
262
+ except Exception:
263
+ pass
264
+
265
+ repo_id = ensure_repo_exists()
266
+ current_marker = metadata_marker(HERMES_HOME)
267
+ if last_marker is not None and current_marker == last_marker:
268
+ write_status("synced", "No Hermes state changes detected (marker match).")
269
+ return (last_fingerprint or "", current_marker)
270
+
271
+ current_fingerprint = fingerprint_dir(HERMES_HOME)
272
+ if last_fingerprint is not None and current_fingerprint == last_fingerprint:
273
+ write_status("synced", "No Hermes state changes detected (fingerprint match).")
274
+ return (last_fingerprint, current_marker)
275
+
276
+ hostname = socket.gethostname()
277
+ write_status("syncing", f"Uploading Hermes state to {repo_id} from {hostname}")
278
+ snapshot_dir = create_snapshot_dir(HERMES_HOME)
279
+ try:
280
+ upload_folder(
281
+ folder_path=str(snapshot_dir),
282
+ repo_id=repo_id,
283
+ repo_type="dataset",
284
+ token=HF_TOKEN,
285
+ commit_message=f"HuggingMes sync [{hostname}] {time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}",
286
+ ignore_patterns=[".git/*", ".git"],
287
+ )
288
+ finally:
289
+ shutil.rmtree(snapshot_dir, ignore_errors=True)
290
+
291
+ write_status("success", f"Uploaded Hermes state to {repo_id}", fingerprint=current_fingerprint, marker=current_marker)
292
+ return (current_fingerprint, current_marker)
293
+
294
+
295
+ def handle_signal(_sig, _frame) -> None:
296
+ STOP_EVENT.set()
297
+
298
+
299
def loop() -> int:
    """Run the change-driven backup watcher until a stop signal arrives.

    Every POLL_INTERVAL seconds the tree's cheap stat marker is recomputed.
    Once it moves, an upload is triggered after DEBOUNCE_SECONDS without a
    further change, or after INTERVAL seconds of continuous change, whichever
    comes first.

    A failed upload stays pending and is retried with a capped exponential
    backoff. Pending state is cleared only after ``sync_once`` succeeds, so a
    transient failure cannot leave a change un-uploaded until the next write.

    Returns:
        0 when stopped via STOP_EVENT (SIGTERM/SIGINT), 1 when the backup
        repository could not be resolved at startup.
    """
    signal.signal(signal.SIGTERM, handle_signal)
    signal.signal(signal.SIGINT, handle_signal)
    try:
        repo_id = resolve_backup_repo()
        write_status(
            "configured",
            f"Backup watcher active for {repo_id} "
            f"(poll={POLL_INTERVAL}s, debounce={DEBOUNCE_SECONDS}s, max={INTERVAL}s).",
        )
    except Exception as exc:
        write_status("error", str(exc))
        print(f"Hermes sync error: {exc}")
        return 1

    # Seed from any prior run so we don't re-upload an identical tree.
    last_fingerprint: str | None = None
    last_marker: tuple[int, int, int] | None = None
    if STATE_FILE.exists():
        try:
            state = json.loads(STATE_FILE.read_text(encoding="utf-8"))
            last_fingerprint = state.get("last_fingerprint")
            m = state.get("last_marker")
            if m and len(m) == 3:
                last_marker = tuple(m)
        except Exception:
            # Best effort: an unreadable or malformed state file just means we
            # start without a remembered baseline.
            pass
    if last_marker is None:
        last_marker = metadata_marker(HERMES_HOME)

    if STOP_EVENT.wait(INITIAL_DELAY):
        return 0
    print(
        f"Hermes state sync started: poll={POLL_INTERVAL}s "
        f"debounce={DEBOUNCE_SECONDS}s max={INTERVAL}s -> {repo_id}"
    )

    # Change-driven scheduler. Two clocks:
    #   * `pending_since`  — when we first noticed an unsynced change. Used
    #                        with INTERVAL to enforce a hard ceiling so a
    #                        continuously-busy session can't starve uploads.
    #   * `last_change_at` — when we most recently saw the marker move. The
    #                        debounce timer is measured against this so we
    #                        wait for writes to settle before uploading.
    pending_since: float | None = None
    last_change_at: float | None = None
    candidate_marker = last_marker

    # Delay after a failed upload; doubles per consecutive failure, capped at
    # INTERVAL, and resets after the next successful sync.
    base_backoff = min(5.0, POLL_INTERVAL * 2)
    max_backoff = max(float(INTERVAL), base_backoff)
    backoff = base_backoff

    while not STOP_EVENT.is_set():
        if STOP_EVENT.wait(POLL_INTERVAL):
            break

        try:
            current_marker = metadata_marker(HERMES_HOME)
        except Exception as exc:
            # Don't let a transient stat error kill the loop.
            write_status("error", f"marker scan failed: {exc}")
            continue

        now = time.time()

        if current_marker != candidate_marker:
            # Files moved since the last poll. Start (or extend) a debounce.
            if pending_since is None:
                pending_since = now
            last_change_at = now
            candidate_marker = current_marker
            continue

        if pending_since is None:
            # Tree is unchanged and there's nothing waiting. Nothing to do.
            continue

        quiet_for = now - (last_change_at or now)
        held_for = now - pending_since
        # Trigger when writes have settled (debounce) OR when the hard ceiling
        # is hit, so a never-idle tree still gets snapshotted at least once
        # per INTERVAL seconds.
        if quiet_for < DEBOUNCE_SECONDS and held_for < INTERVAL:
            continue

        try:
            last_fingerprint, last_marker = sync_once(last_fingerprint, last_marker)
        except Exception as exc:
            write_status("error", f"Sync failed: {exc}")
            print(f"Hermes sync failed: {exc}")
            # Keep pending_since / last_change_at set: candidate_marker already
            # equals the changed tree, so clearing them here would mean the
            # change is never uploaded unless another write happens.
            if STOP_EVENT.wait(backoff):
                break
            backoff = min(backoff * 2, max_backoff)
            continue

        # Success: this change is now backed up.
        backoff = base_backoff
        candidate_marker = last_marker
        pending_since = None
        last_change_at = None

    return 0
394
+
395
+
396
+ def main() -> int:
397
+ HERMES_HOME.mkdir(parents=True, exist_ok=True)
398
+ if len(sys.argv) < 2:
399
+ return loop()
400
+ command = sys.argv[1]
401
+ if command == "restore":
402
+ return 0 if restore() else 1
403
+ if command == "sync-once":
404
+ try:
405
+ sync_once()
406
+ return 0
407
+ except Exception as exc:
408
+ write_status("error", f"Shutdown sync failed: {exc}")
409
+ print(f"Hermes sync: shutdown sync failed: {exc}")
410
+ return 1
411
+ if command == "loop":
412
+ return loop()
413
+ print(f"Unknown command: {command}", file=sys.stderr)
414
+ return 1
415
+
416
+
417
+ if __name__ == "__main__":
418
+ raise SystemExit(main())
start.sh CHANGED
@@ -19,7 +19,7 @@ DASHBOARD_PORT="${DASHBOARD_PORT:-9119}"
19
  TELEGRAM_WEBHOOK_PORT="${TELEGRAM_WEBHOOK_PORT:-8765}"
20
  WEBUI_PORT="${HERMES_WEBUI_PORT:-8787}"
21
 
22
- SYNC_INTERVAL="${SYNC_INTERVAL:-600}"
23
  BACKUP_DATASET="${BACKUP_DATASET_NAME:-huggingmes-backup}"
24
  CF_PROXY_ENV_FILE="/tmp/huggingmes-cloudflare-proxy.env"
25
 
@@ -324,7 +324,7 @@ else
324
  echo "Telegram : not configured"
325
  fi
326
  if [ -n "${HF_TOKEN:-}" ]; then
327
- echo "Backup : ${BACKUP_DATASET} (every ${SYNC_INTERVAL:-600}s)"
328
  else
329
  echo "Backup : disabled"
330
  fi
 
19
  TELEGRAM_WEBHOOK_PORT="${TELEGRAM_WEBHOOK_PORT:-8765}"
20
  WEBUI_PORT="${HERMES_WEBUI_PORT:-8787}"
21
 
22
+ SYNC_INTERVAL="${SYNC_INTERVAL:-60}"
23
  BACKUP_DATASET="${BACKUP_DATASET_NAME:-huggingmes-backup}"
24
  CF_PROXY_ENV_FILE="/tmp/huggingmes-cloudflare-proxy.env"
25
 
 
324
  echo "Telegram : not configured"
325
  fi
326
  if [ -n "${HF_TOKEN:-}" ]; then
327
+ echo "Backup : ${BACKUP_DATASET} (poll ${SYNC_POLL_INTERVAL:-2}s, debounce ${SYNC_DEBOUNCE_SECONDS:-3}s, max ${SYNC_INTERVAL:-60}s)"
328
  else
329
  echo "Backup : disabled"
330
  fi