PrismML Deploy committed on
Commit
0a882d1
·
1 Parent(s): 0633a27

Fix: metrics push (summary not commit_message), crash resilience + watchdog

Browse files
Files changed (2) hide show
  1. entrypoint.sh +10 -2
  2. metrics_pusher.py +54 -31
entrypoint.sh CHANGED
@@ -99,8 +99,16 @@ echo '{"ts":null,"gpus":[]}' > /tmp/gpu-stats.json
99
  echo '# waiting for first metrics scrape...' > /tmp/llama-metrics.txt
100
  echo '{"updated_at":null,"summary_24h":{"requests":0,"unique_users":0},"summary_7d":{"requests":0,"unique_users":0},"summary_total":{"requests":0,"unique_users":0},"requests_by_hour":[],"requests_by_day":[],"top_users":[]}' > /tmp/analytics.json
101
 
102
- # ── Start metrics pusher ──────────────────────────────────────────────────────
103
- python3 /app/metrics_pusher.py &
 
 
 
 
 
 
 
 
104
 
105
  echo ""
106
  echo "=== Bonsai-demo ==="
 
99
  echo '# waiting for first metrics scrape...' > /tmp/llama-metrics.txt
100
  echo '{"updated_at":null,"summary_24h":{"requests":0,"unique_users":0},"summary_7d":{"requests":0,"unique_users":0},"summary_total":{"requests":0,"unique_users":0},"requests_by_hour":[],"requests_by_day":[],"top_users":[]}' > /tmp/analytics.json
101
 
102
# ── Start metrics pusher with watchdog ────────────────────────────────────────
# Supervise metrics_pusher.py: whenever it exits (crash or clean), wait 5s and
# relaunch it, forever. Runs in the background so the entrypoint continues.
start_metrics_pusher() {
  local pusher=/app/metrics_pusher.py
  while :; do
    echo "[watchdog] Starting metrics_pusher.py..."
    python3 "$pusher" || true
    echo "[watchdog] metrics_pusher.py exited — restarting in 5s..."
    sleep 5
  done
}
start_metrics_pusher &
112
 
113
  echo ""
114
  echo "=== Bonsai-demo ==="
metrics_pusher.py CHANGED
@@ -192,7 +192,7 @@ def hf_push(local_path):
192
  if not METRICS_REPO or not HF_TOKEN: return
193
  dest = f"metrics/{local_path.name}"
194
  content = base64.b64encode(local_path.read_bytes()).decode()
195
- payload = json.dumps({"commit_message": f"update {local_path.name}",
196
  "files": [{"path": dest, "encoding": "base64", "content": content}]}).encode()
197
  req = urllib.request.Request(
198
  f"https://huggingface.co/api/datasets/{METRICS_REPO}/commit/main",
@@ -208,10 +208,13 @@ def gpu_loop():
208
  """Fast loop: update gpu-stats.json every GPU_INTERVAL_SECS seconds."""
209
  print(f"[gpu] polling every {GPU_INTERVAL_SECS}s")
210
  while True:
211
- gpus = scrape_gpus()
212
- if gpus:
213
- ts = now_utc().isoformat()
214
- GPU_STATS_FILE.write_text(json.dumps({"ts": ts, "gpus": gpus}))
 
 
 
215
  time.sleep(GPU_INTERVAL_SECS)
216
 
217
  def wait_for_backends():
@@ -235,35 +238,55 @@ def metrics_loop():
235
  wait_for_backends()
236
  last_push, backends, first_push_done = 0.0, [], False
237
  while True:
238
- if not backends:
239
- backends = detect_backends()
240
- print(f"[metrics] backends: {backends}")
241
- ts = now_utc()
242
- data = scrape(backends)
243
- compute_analytics()
244
- if data is None:
245
- print(f"[metrics] scrape returned no data β€” will retry next tick")
246
- backends = [] # force re-detect next iteration
247
- else:
248
- gpus = json.loads(GPU_STATS_FILE.read_text()).get("gpus", []) if GPU_STATS_FILE.exists() else []
249
- gpu_s = {f"gpu{g['index']}_util": g["util_gpu"] for g in gpus}
250
- gpu_s.update({f"gpu{g['index']}_mem_used_mib": g["mem_used_mib"] for g in gpus})
251
- row = {"ts": ts.isoformat(), **data, **gpu_s}
252
- day = ts.strftime("%Y-%m-%d")
253
- path = LOG_DIR / f"metrics-{day}.jsonl"
254
- with open(path, "a") as f: f.write(json.dumps(row) + "\n")
255
- gpu_str = " ".join(f"GPU{g['index']} {g['util_gpu']:.0f}% {g['mem_used_mib']/1024:.1f}GB {g['temp_c']:.0f}Β°C" for g in gpus)
256
- print(f"[metrics] {ts.strftime('%H:%M:%S')} gen={data.get('predicted_tokens_seconds',0):.0f} tok/s active={data.get('requests_processing',0):.0f} {gpu_str}")
257
- if not first_push_done or time.time() - last_push >= PUSH_SECS:
258
- hf_push(path)
259
- last_push = time.time()
260
- first_push_done = True
 
 
 
 
 
 
 
261
  time.sleep(SNAPSHOT_SECS)
262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
  def main():
264
- t = threading.Thread(target=gpu_loop, daemon=True)
265
- t.start()
266
- metrics_loop()
267
 
268
  if __name__ == "__main__":
269
  main()
 
192
  if not METRICS_REPO or not HF_TOKEN: return
193
  dest = f"metrics/{local_path.name}"
194
  content = base64.b64encode(local_path.read_bytes()).decode()
195
+ payload = json.dumps({"summary": f"update {local_path.name}",
196
  "files": [{"path": dest, "encoding": "base64", "content": content}]}).encode()
197
  req = urllib.request.Request(
198
  f"https://huggingface.co/api/datasets/{METRICS_REPO}/commit/main",
 
208
  """Fast loop: update gpu-stats.json every GPU_INTERVAL_SECS seconds."""
209
  print(f"[gpu] polling every {GPU_INTERVAL_SECS}s")
210
  while True:
211
+ try:
212
+ gpus = scrape_gpus()
213
+ if gpus:
214
+ ts = now_utc().isoformat()
215
+ GPU_STATS_FILE.write_text(json.dumps({"ts": ts, "gpus": gpus}))
216
+ except Exception as e:
217
+ print(f"[gpu] error: {e}")
218
  time.sleep(GPU_INTERVAL_SECS)
219
 
220
  def wait_for_backends():
 
238
  wait_for_backends()
239
  last_push, backends, first_push_done = 0.0, [], False
240
  while True:
241
+ try:
242
+ if not backends:
243
+ backends = detect_backends()
244
+ print(f"[metrics] backends: {backends}")
245
+ ts = now_utc()
246
+ data = scrape(backends)
247
+ compute_analytics()
248
+ if data is None:
249
+ print(f"[metrics] scrape returned no data β€” will retry next tick")
250
+ backends = []
251
+ else:
252
+ try:
253
+ gpus = json.loads(GPU_STATS_FILE.read_text()).get("gpus", []) if GPU_STATS_FILE.exists() else []
254
+ except Exception:
255
+ gpus = []
256
+ gpu_s = {f"gpu{g['index']}_util": g["util_gpu"] for g in gpus}
257
+ gpu_s.update({f"gpu{g['index']}_mem_used_mib": g["mem_used_mib"] for g in gpus})
258
+ row = {"ts": ts.isoformat(), **data, **gpu_s}
259
+ day = ts.strftime("%Y-%m-%d")
260
+ path = LOG_DIR / f"metrics-{day}.jsonl"
261
+ with open(path, "a") as f: f.write(json.dumps(row) + "\n")
262
+ gpu_str = " ".join(f"GPU{g['index']} {g['util_gpu']:.0f}% {g['mem_used_mib']/1024:.1f}GB {g['temp_c']:.0f}Β°C" for g in gpus)
263
+ print(f"[metrics] {ts.strftime('%H:%M:%S')} gen={data.get('predicted_tokens_seconds',0):.0f} tok/s active={data.get('requests_processing',0):.0f} {gpu_str}")
264
+ if not first_push_done or time.time() - last_push >= PUSH_SECS:
265
+ hf_push(path)
266
+ last_push = time.time()
267
+ first_push_done = True
268
+ except Exception as e:
269
+ print(f"[metrics] loop error (will retry): {e}")
270
+ backends = []
271
  time.sleep(SNAPSHOT_SECS)
272
 
273
# Minimum runtime (seconds) after which a crash is treated as a fresh failure
# rather than a continuation of an earlier crash streak.
_STABLE_RUN_SECS = 60

def run_with_restart(name, func):
    """Run *func*, restarting it on crash with exponential backoff.

    Parameters:
        name: label used in crash log messages (e.g. "gpu", "metrics").
        func: zero-argument callable; expected to loop indefinitely.

    Returns only if *func* returns normally. On any Exception, logs the
    crash, sleeps `backoff` seconds, and retries; backoff doubles per
    consecutive quick crash, capped at 60s. If *func* ran for at least
    _STABLE_RUN_SECS before crashing, backoff resets to 1s — otherwise a
    loop that was stable for hours would inherit the old accumulated delay.
    """
    backoff = 1
    while True:
        started = time.monotonic()  # monotonic: immune to wall-clock jumps
        try:
            func()
        except Exception as e:
            if time.monotonic() - started >= _STABLE_RUN_SECS:
                backoff = 1  # long stable run — restart quickly
            print(f"[{name}] CRASHED: {e} — restarting in {backoff}s")
            time.sleep(backoff)
            backoff = min(backoff * 2, 60)
        else:
            break
285
+
286
def main():
    """Entry point: GPU sampler on a daemon thread, metrics loop in the
    foreground — both supervised by run_with_restart."""
    threading.Thread(
        target=run_with_restart, args=("gpu", gpu_loop), daemon=True
    ).start()
    run_with_restart("metrics", metrics_loop)
290
 
291
  if __name__ == "__main__":
292
  main()