PrismML Deploy committed on
Commit
e28b544
·
1 Parent(s): 0a882d1

Fixes: push every 20min, trim to last 10 entries, pull on startup for restart recovery

Browse files
Files changed (1) hide show
  1. metrics_pusher.py +35 -2
metrics_pusher.py CHANGED
@@ -17,7 +17,8 @@ GPU_INTERVAL_SECS = int(os.environ.get("GPU_INTERVAL", "10"))
17
  SNAPSHOT_SECS = int(os.environ.get("METRICS_INTERVAL", "3"))
18
  NGINX_LOG = Path("/tmp/nginx-access.log")
19
  ANALYTICS_FILE = Path("/tmp/analytics.json")
20
- PUSH_SECS = int(os.environ.get("METRICS_PUSH_INTERVAL", "600"))
 
21
  METRICS_REPO = os.environ.get("METRICS_REPO", "")
22
  HF_TOKEN = os.environ.get("HF_TOKEN", "")
23
 
@@ -191,7 +192,13 @@ def compute_analytics():
191
  def hf_push(local_path):
192
  if not METRICS_REPO or not HF_TOKEN: return
193
  dest = f"metrics/{local_path.name}"
194
- content = base64.b64encode(local_path.read_bytes()).decode()
 
 
 
 
 
 
195
  payload = json.dumps({"summary": f"update {local_path.name}",
196
  "files": [{"path": dest, "encoding": "base64", "content": content}]}).encode()
197
  req = urllib.request.Request(
@@ -217,6 +224,31 @@ def gpu_loop():
217
  print(f"[gpu] error: {e}")
218
  time.sleep(GPU_INTERVAL_SECS)
219
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  def wait_for_backends():
221
  """Block until at least one llama-server is healthy (retries every 5s)."""
222
  print("[metrics] waiting for llama-server to be ready...")
@@ -235,6 +267,7 @@ def wait_for_backends():
235
  def metrics_loop():
236
  """Slow loop: scrape llama metrics, append JSONL, push to HF every SNAPSHOT_SECS seconds."""
237
  print(f"[metrics] snapshot={SNAPSHOT_SECS}s push={PUSH_SECS}s repo={METRICS_REPO or '(local only)'}")
 
238
  wait_for_backends()
239
  last_push, backends, first_push_done = 0.0, [], False
240
  while True:
 
17
  SNAPSHOT_SECS = int(os.environ.get("METRICS_INTERVAL", "3"))
18
  NGINX_LOG = Path("/tmp/nginx-access.log")
19
  ANALYTICS_FILE = Path("/tmp/analytics.json")
20
+ PUSH_SECS = int(os.environ.get("METRICS_PUSH_INTERVAL", "1200"))
21
+ MAX_JSONL_LINES = 10
22
  METRICS_REPO = os.environ.get("METRICS_REPO", "")
23
  HF_TOKEN = os.environ.get("HF_TOKEN", "")
24
 
 
192
  def hf_push(local_path):
193
  if not METRICS_REPO or not HF_TOKEN: return
194
  dest = f"metrics/{local_path.name}"
195
+ # Trim to last MAX_JSONL_LINES entries before pushing
196
+ try:
197
+ lines = local_path.read_text().strip().splitlines()
198
+ trimmed = "\n".join(lines[-MAX_JSONL_LINES:]) + "\n"
199
+ except Exception:
200
+ trimmed = local_path.read_bytes().decode()
201
+ content = base64.b64encode(trimmed.encode()).decode()
202
  payload = json.dumps({"summary": f"update {local_path.name}",
203
  "files": [{"path": dest, "encoding": "base64", "content": content}]}).encode()
204
  req = urllib.request.Request(
 
224
  print(f"[gpu] error: {e}")
225
  time.sleep(GPU_INTERVAL_SECS)
226
 
227
+ def hf_pull():
228
+ """Pull the latest metrics from HF Dataset on startup to survive restarts."""
229
+ if not METRICS_REPO or not HF_TOKEN:
230
+ return
231
+ print(f"[metrics] pulling saved data from {METRICS_REPO}...")
232
+ try:
233
+ # List files in the metrics/ folder
234
+ req = urllib.request.Request(
235
+ f"https://huggingface.co/api/datasets/{METRICS_REPO}/tree/main/metrics",
236
+ headers={"Authorization": f"Bearer {HF_TOKEN}"})
237
+ with urllib.request.urlopen(req, timeout=15) as r:
238
+ files = json.loads(r.read().decode())
239
+ for f in files:
240
+ name = f.get("path", "").split("/")[-1]
241
+ if not name.endswith(".jsonl"):
242
+ continue
243
+ local = LOG_DIR / name
244
+ url = f"https://huggingface.co/datasets/{METRICS_REPO}/resolve/main/metrics/{name}"
245
+ req = urllib.request.Request(url, headers={"Authorization": f"Bearer {HF_TOKEN}"})
246
+ with urllib.request.urlopen(req, timeout=15) as r:
247
+ local.write_bytes(r.read())
248
+ print(f"[metrics] restored {name} ({local.stat().st_size} bytes)")
249
+ except Exception as e:
250
+ print(f"[metrics] pull failed (starting fresh): {e}")
251
+
252
  def wait_for_backends():
253
  """Block until at least one llama-server is healthy (retries every 5s)."""
254
  print("[metrics] waiting for llama-server to be ready...")
 
267
  def metrics_loop():
268
  """Slow loop: scrape llama metrics, append JSONL, push to HF every SNAPSHOT_SECS seconds."""
269
  print(f"[metrics] snapshot={SNAPSHOT_SECS}s push={PUSH_SECS}s repo={METRICS_REPO or '(local only)'}")
270
+ hf_pull()
271
  wait_for_backends()
272
  last_push, backends, first_push_done = 0.0, [], False
273
  while True: