Spaces:
Paused
Paused
v2: complete redesign — single entrypoint, tar.zst persistence
Replace 13 scripts with 3 files (+ Dockerfile):
- entrypoint.py: single Python PID 1 (init, sync, services, log streamer)
- nginx.conf: reverse proxy (7860 → ttyd, ssh, logs)
- ws_ssh_bridge.py: WebSocket-to-SSH bridge (unchanged)
Key changes:
- Persistence: tar.zst of /home + /root via upload_file (not folder sync)
- No more rsync permission corruption (tar -p preserves modes)
- No more .cache path rejections (single archive file)
- No more upload_large_folder hangs (upload_file is fast)
- Log streamer runs as thread (not separate process)
- Sync runs as thread (not separate daemon)
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- Dockerfile +20 -33
- entrypoint.py +456 -0
- ubuntu-server/nginx.conf → nginx.conf +2 -3
- scripts/curl_logs.sh +0 -33
- scripts/entrypoint.sh +0 -92
- scripts/entrypoint_wrapper.py +0 -53
- scripts/ssh_connect.sh +0 -38
- scripts/stream_logs.py +0 -105
- scripts/sync_hf.py +0 -233
- scripts/test_local.sh +0 -143
- ubuntu-server/git_sync_daemon.py +0 -369
- ubuntu-server/log_streamer.py +0 -74
- ubuntu-server/start-server.sh +0 -114
- ubuntu-server/upload_sync.py +0 -61
- ubuntu-server/ws-ssh-bridge.py → ws_ssh_bridge.py +1 -1
Dockerfile
CHANGED
|
@@ -1,32 +1,26 @@
|
|
| 1 |
-
#
|
| 2 |
# Single port 7860: nginx → ttyd (web terminal) + SSH-over-WebSocket
|
| 3 |
-
#
|
| 4 |
FROM ubuntu:24.04
|
| 5 |
|
| 6 |
ENV DEBIAN_FRONTEND=noninteractive
|
| 7 |
|
| 8 |
-
# System + Python +
|
| 9 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 10 |
-
ca-certificates curl wget
|
|
|
|
|
|
|
|
|
|
| 11 |
&& pip3 install --no-cache-dir --break-system-packages huggingface_hub websockets \
|
| 12 |
&& git lfs install \
|
| 13 |
&& rm -rf /var/lib/apt/lists/*
|
| 14 |
|
| 15 |
-
# Server: SSH + nginx + ttyd + tools
|
| 16 |
-
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 17 |
-
openssh-server openssh-client \
|
| 18 |
-
nginx \
|
| 19 |
-
ttyd \
|
| 20 |
-
procps htop vim nano less tmux \
|
| 21 |
-
build-essential rsync \
|
| 22 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 23 |
-
|
| 24 |
# Node.js 20 LTS (for Claude Code)
|
| 25 |
RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
|
| 26 |
&& apt-get install -y nodejs \
|
| 27 |
&& rm -rf /var/lib/apt/lists/*
|
| 28 |
|
| 29 |
-
# Claude Code
|
| 30 |
RUN npm install -g @anthropic-ai/claude-code 2>/dev/null \
|
| 31 |
|| (curl -fsSL https://claude.ai/install.sh | bash && \
|
| 32 |
cp /root/.local/bin/claude /usr/local/bin/claude 2>/dev/null || true)
|
|
@@ -34,30 +28,23 @@ RUN npm install -g @anthropic-ai/claude-code 2>/dev/null \
|
|
| 34 |
# Snapshot base package list (to detect user-added packages later)
|
| 35 |
RUN dpkg-query -W -f='${Package}\n' | sort > /etc/base-packages.list
|
| 36 |
|
| 37 |
-
# SSH
|
| 38 |
RUN ssh-keygen -A && mkdir -p /run/sshd
|
| 39 |
|
| 40 |
-
# User account
|
| 41 |
-
RUN useradd -m -u 1000 -s /bin/bash
|
| 42 |
-
|
| 43 |
-
|
|
|
|
| 44 |
RUN mkdir -p /data
|
| 45 |
|
| 46 |
-
#
|
| 47 |
-
|
| 48 |
-
COPY
|
| 49 |
-
COPY
|
| 50 |
-
COPY ubuntu-server/ws-ssh-bridge.py /scripts/ws-ssh-bridge.py
|
| 51 |
-
COPY ubuntu-server/git_sync_daemon.py /scripts/git_sync_daemon.py
|
| 52 |
-
COPY ubuntu-server/log_streamer.py /scripts/log_streamer.py
|
| 53 |
-
COPY ubuntu-server/start-server.sh /scripts/start-server.sh
|
| 54 |
-
RUN chmod +x /scripts/entrypoint.sh /scripts/start-server.sh
|
| 55 |
|
| 56 |
ENV PERSIST_PATH=/data
|
| 57 |
ENV PYTHONUNBUFFERED=1
|
| 58 |
-
ENV RUN_CMD="stdbuf -oL -eL /scripts/start-server.sh"
|
| 59 |
-
ENV SSH_PORT=2222
|
| 60 |
-
|
| 61 |
-
# Run as root (needed for: apt install persistence, bind mounts, sshd)
|
| 62 |
EXPOSE 7860
|
| 63 |
-
|
|
|
|
|
|
| 1 |
+
# HuggingRun v2 — Ubuntu Server on HuggingFace Spaces
|
| 2 |
# Single port 7860: nginx → ttyd (web terminal) + SSH-over-WebSocket
|
| 3 |
+
# Persistence: tar.zst archive of /home + /root → HF Dataset
|
| 4 |
FROM ubuntu:24.04
|
| 5 |
|
| 6 |
ENV DEBIAN_FRONTEND=noninteractive
|
| 7 |
|
| 8 |
+
# System + Python + tools (single layer)
|
| 9 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 10 |
+
ca-certificates curl wget git git-lfs \
|
| 11 |
+
python3 python3-pip python3-venv \
|
| 12 |
+
openssh-server nginx ttyd rsync zstd \
|
| 13 |
+
procps htop vim nano less tmux build-essential \
|
| 14 |
&& pip3 install --no-cache-dir --break-system-packages huggingface_hub websockets \
|
| 15 |
&& git lfs install \
|
| 16 |
&& rm -rf /var/lib/apt/lists/*
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
# Node.js 20 LTS (for Claude Code)
|
| 19 |
RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
|
| 20 |
&& apt-get install -y nodejs \
|
| 21 |
&& rm -rf /var/lib/apt/lists/*
|
| 22 |
|
| 23 |
+
# Claude Code
|
| 24 |
RUN npm install -g @anthropic-ai/claude-code 2>/dev/null \
|
| 25 |
|| (curl -fsSL https://claude.ai/install.sh | bash && \
|
| 26 |
cp /root/.local/bin/claude /usr/local/bin/claude 2>/dev/null || true)
|
|
|
|
| 28 |
# Snapshot base package list (to detect user-added packages later)
|
| 29 |
RUN dpkg-query -W -f='${Package}\n' | sort > /etc/base-packages.list
|
| 30 |
|
| 31 |
+
# SSH host keys
|
| 32 |
RUN ssh-keygen -A && mkdir -p /run/sshd
|
| 33 |
|
| 34 |
+
# User account
|
| 35 |
+
RUN useradd -m -u 1000 -s /bin/bash user 2>/dev/null || true \
|
| 36 |
+
&& echo "user:huggingrun" | chpasswd \
|
| 37 |
+
&& echo "root:huggingrun" | chpasswd
|
| 38 |
+
|
| 39 |
RUN mkdir -p /data
|
| 40 |
|
| 41 |
+
# v2: only 3 files
|
| 42 |
+
COPY entrypoint.py /entrypoint.py
|
| 43 |
+
COPY nginx.conf /etc/nginx/nginx.conf
|
| 44 |
+
COPY ws_ssh_bridge.py /ws_ssh_bridge.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
ENV PERSIST_PATH=/data
|
| 47 |
ENV PYTHONUNBUFFERED=1
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
EXPOSE 7860
|
| 49 |
+
|
| 50 |
+
ENTRYPOINT ["python3", "-u", "/entrypoint.py"]
|
entrypoint.py
ADDED
|
@@ -0,0 +1,456 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
HuggingRun v2 — Single entrypoint for Ubuntu Server on HuggingFace Spaces.
|
| 4 |
+
|
| 5 |
+
Replaces: entrypoint.sh, start-server.sh, git_sync_daemon.py, log_streamer.py
|
| 6 |
+
|
| 7 |
+
Lifecycle:
|
| 8 |
+
1. Resolve config (HF_TOKEN, dataset repo)
|
| 9 |
+
2. Download state.tar.zst from HF Dataset → extract /home, /root
|
| 10 |
+
3. Reinstall user-added packages
|
| 11 |
+
4. Start services (sshd, ttyd, ws-bridge, log streamer, sync thread)
|
| 12 |
+
5. exec nginx (becomes PID 1, opens port 7860)
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import http.server
|
| 16 |
+
import json
|
| 17 |
+
import os
|
| 18 |
+
import signal
|
| 19 |
+
import subprocess
|
| 20 |
+
import sys
|
| 21 |
+
import threading
|
| 22 |
+
import time
|
| 23 |
+
from datetime import datetime, timezone
|
| 24 |
+
|
| 25 |
+
# ── Config ────────────────────────────────────────────────────────────
|
| 26 |
+
PERSIST_PATH = os.environ.get("PERSIST_PATH", "/data")
|
| 27 |
+
HF_TOKEN = os.environ.get("HF_TOKEN", "")
|
| 28 |
+
HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "")
|
| 29 |
+
SYNC_INTERVAL = int(os.environ.get("SYNC_INTERVAL", "120"))
|
| 30 |
+
SSH_PORT = os.environ.get("SSH_PORT", "2222")
|
| 31 |
+
TTYD_PORT = os.environ.get("TTYD_PORT", "7681")
|
| 32 |
+
LOGFILE = "/var/log/huggingrun.log"
|
| 33 |
+
STATE_FILE = os.path.join(PERSIST_PATH, "state.tar.zst")
|
| 34 |
+
PKG_FILE = os.path.join(PERSIST_PATH, "user-packages.list")
|
| 35 |
+
BASE_PKG_FILE = "/etc/base-packages.list"
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def log(msg):
    """Write a timestamped message to stderr and append it to LOGFILE.

    File logging is best-effort: any failure (missing directory,
    permissions) is swallowed so logging can never crash a caller.
    """
    stamped = f"[{time.strftime('%H:%M:%S', time.gmtime())}] {msg}"
    print(stamped, file=sys.stderr, flush=True)
    try:
        with open(LOGFILE, "a") as logf:
            logf.write(stamped + "\n")
    except Exception:
        pass  # best-effort file logging
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def run(cmd):
    """Run *cmd* through the shell and return (returncode, stripped output).

    stdout and stderr are merged. The command, its duration, and — on
    failure — the first few output lines are written to the log.
    """
    log(f" $ {cmd}")
    started = time.time()
    result = subprocess.run(cmd, shell=True, text=True,
                            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    took = time.time() - started
    output = (result.stdout or "").strip()
    if result.returncode == 0:
        log(f" ok ({took:.1f}s)")
    else:
        log(f" exit={result.returncode} ({took:.1f}s)")
        # Echo a short failure excerpt for diagnosis.
        for ln in output.split("\n")[:5]:
            if ln.strip():
                log(f" {ln}")
    return result.returncode, output
| 63 |
+
|
| 64 |
+
|
| 65 |
+
# ── Config Resolution ─────────────────────────────────────────────────
|
| 66 |
+
def resolve_config():
    """Resolve HF_TOKEN / HF_DATASET_REPO from the environment and log the
    effective configuration banner.

    Fallback order for HF_DATASET_REPO when unset:
    1. "<SPACE_ID>-data" (Space-provided id)
    2. "<username>/HuggingRun-data" derived from the token via whoami()
    The result is exported back into os.environ for child processes.
    """
    global HF_TOKEN, HF_DATASET_REPO

    HF_TOKEN = os.environ.get("HF_TOKEN", "")
    space_id = os.environ.get("SPACE_ID", "")

    if not HF_DATASET_REPO:
        if space_id:
            # SPACE_ID is "owner/name", so this yields "owner/name-data".
            HF_DATASET_REPO = f"{space_id}-data"
        elif HF_TOKEN:
            try:
                # Imported lazily: huggingface_hub is only needed here.
                from huggingface_hub import HfApi
                name = HfApi(token=HF_TOKEN).whoami()["name"]
                HF_DATASET_REPO = f"{name}/HuggingRun-data"
            except Exception:
                # Network/auth failure: leave repo empty (persistence off).
                pass
    os.environ["HF_DATASET_REPO"] = HF_DATASET_REPO

    log("========================================")
    log("HuggingRun v2 starting")
    log(f" Date: {datetime.now(timezone.utc).isoformat()}")
    # Log only the token length, never the token itself.
    log(f" HF_TOKEN: {'set (' + str(len(HF_TOKEN)) + ' chars)' if HF_TOKEN else 'NOT SET'}")
    log(f" HF_DATASET_REPO: {HF_DATASET_REPO or 'NOT SET'}")
    log(f" PERSIST_PATH: {PERSIST_PATH}")
    log(f" SYNC_INTERVAL: {SYNC_INTERVAL}s")
    log("========================================")
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def ensure_dataset_repo():
    """Verify the persistence dataset repo exists, creating it if missing.

    Silently becomes a no-op (with a log line) when persistence is
    disabled; any Hub error is logged rather than raised.
    """
    if not (HF_TOKEN and HF_DATASET_REPO):
        log("persistence disabled (no token/repo)")
        return
    try:
        from huggingface_hub import HfApi
        api = HfApi(token=HF_TOKEN)
        try:
            api.repo_info(repo_id=HF_DATASET_REPO, repo_type="dataset")
            log(f"dataset exists: {HF_DATASET_REPO}")
        except Exception:
            # repo_info raises when the repo is absent — create it private.
            api.create_repo(repo_id=HF_DATASET_REPO, repo_type="dataset", private=True)
            log(f"created dataset: {HF_DATASET_REPO}")
    except Exception as exc:
        log(f"dataset check failed: {exc}")
| 109 |
+
|
| 110 |
+
|
| 111 |
+
# ── Restore ───────────────────────────────────────────────────────────
|
| 112 |
+
def restore_state():
    """Download state.tar.zst and user-packages.list from the dataset and
    extract the archive over / (restores /home and /root with modes, -p).

    Best-effort: a missing archive (first boot) is not an error.

    Fixes vs. original: the package list is now fetched even when the
    state archive is absent (the original returned early and never
    downloaded it, so user packages were lost after an image rebuild),
    and the tar path is shell-quoted.
    """
    os.makedirs(PERSIST_PATH, exist_ok=True)
    if not HF_TOKEN or not HF_DATASET_REPO:
        return

    def _fetch(filename):
        # Download one file into PERSIST_PATH; None when absent/unreachable.
        from huggingface_hub import hf_hub_download
        try:
            return hf_hub_download(
                repo_id=HF_DATASET_REPO,
                filename=filename,
                repo_type="dataset",
                local_dir=PERSIST_PATH,
                token=HF_TOKEN,
            )
        except Exception as exc:
            log(f" no {filename} to restore: {exc}")
            return None

    log("── RESTORE: downloading state")
    t0 = time.time()
    path = _fetch("state.tar.zst")

    # Fetch the package list regardless of the archive: after an image
    # rebuild the archive may be gone while packages should still return.
    _fetch("user-packages.list")

    if path is None:
        return
    size_mb = os.path.getsize(path) / 1024 / 1024
    log(f" downloaded {size_mb:.1f}MB ({time.time() - t0:.1f}s)")

    log("── RESTORE: extracting state.tar.zst")
    t0 = time.time()
    # -p preserves permissions; quote the path against spaces.
    rc, _ = run(f"tar --zstd -xpf '{STATE_FILE}' -C /")
    elapsed = time.time() - t0
    if rc == 0:
        log(f" extracted ({elapsed:.1f}s)")
    else:
        log(f" extraction failed ({elapsed:.1f}s)")
| 157 |
+
|
| 158 |
+
|
| 159 |
+
def restore_packages():
    """Reinstall apt packages the user added during a previous session.

    Diffs the saved dpkg package list against the image's baseline list
    and apt-installs the difference. No-op when either list is missing.
    """
    if not (os.path.exists(PKG_FILE) and os.path.exists(BASE_PKG_FILE)):
        log("── PACKAGES: no saved list, skipping")
        return
    try:
        with open(BASE_PKG_FILE) as base_f, open(PKG_FILE) as saved_f:
            base_pkgs = set(base_f.read().strip().split("\n"))
            saved_pkgs = set(saved_f.read().strip().split("\n"))
        extras = sorted(saved_pkgs - base_pkgs)
        if not extras:
            log("── PACKAGES: no extra packages to install")
            return
        log(f"── PACKAGES: reinstalling {len(extras)} packages")
        run(f"apt-get update -qq && apt-get install -y --no-install-recommends {' '.join(extras)}")
    except Exception as exc:
        log(f"── PACKAGES: error: {exc}")
| 176 |
+
|
| 177 |
+
|
| 178 |
+
def ensure_passwords():
    # Re-create the login account and (re)set passwords after a restore:
    # extracting /home does not guarantee matching passwd entries exist.
    # NOTE(review): credentials are hardcoded ("huggingrun") and root login
    # is enabled by sshd config elsewhere — presumably acceptable because
    # sshd binds to 127.0.0.1 behind the WebSocket bridge; confirm before
    # exposing SSH more widely.
    log("── PASSWORDS")
    run("id user >/dev/null 2>&1 || useradd -m -s /bin/bash user")
    run('echo "user:huggingrun" | chpasswd')
    run('echo "root:huggingrun" | chpasswd')
    # Refresh the linker cache in case restored dirs carry shared libs.
    run("ldconfig 2>/dev/null || true")
| 184 |
+
|
| 185 |
+
|
| 186 |
+
# ── Save + Upload ─────────────────────────────────────────────────────
|
| 187 |
+
def save_and_upload():
    """Archive /home + /root into state.tar.zst and upload it (plus the
    current dpkg package list) to the HF dataset repo.

    No-op when persistence is disabled. Each upload failure is logged
    and does not abort the other upload.
    """
    if not HF_TOKEN or not HF_DATASET_REPO:
        return
    from huggingface_hub import HfApi

    log("══ SYNC: save + upload ══")
    os.makedirs(PERSIST_PATH, exist_ok=True)

    # Save package list (full dpkg inventory; diffed against the baseline
    # at restore time by restore_packages()).
    try:
        rc, out = run("dpkg-query -W -f='${Package}\\n'")
        if rc == 0 and out:
            with open(PKG_FILE, "w") as f:
                f.write(out + "\n")
    except Exception:
        pass

    # Create tar.zst of /home and /root (only the ones that exist).
    t0 = time.time()
    dirs_to_persist = []
    for d in ["home", "root"]:
        if os.path.isdir(f"/{d}"):
            dirs_to_persist.append(d)
    if not dirs_to_persist:
        log(" nothing to persist")
        return

    # -p preserves ownership/modes so restore doesn't corrupt permissions.
    rc, out = run(f"tar --zstd -cpf {STATE_FILE} -C / {' '.join(dirs_to_persist)}")
    elapsed = time.time() - t0
    if rc != 0:
        log(f" tar failed ({elapsed:.1f}s)")
        return

    size_mb = os.path.getsize(STATE_FILE) / 1024 / 1024
    log(f" tar: {size_mb:.1f}MB ({elapsed:.1f}s)")

    # Upload both files.
    api = HfApi(token=HF_TOKEN)
    ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())

    t0 = time.time()
    try:
        api.upload_file(
            path_or_fileobj=STATE_FILE,
            path_in_repo="state.tar.zst",
            repo_id=HF_DATASET_REPO,
            repo_type="dataset",
            commit_message=f"sync {ts}",
        )
        elapsed = time.time() - t0
        log(f" uploaded state.tar.zst ({elapsed:.1f}s)")
    except Exception as e:
        log(f" upload state.tar.zst failed: {e}")

    # Upload package list (separate commit to avoid conflicts).
    if os.path.exists(PKG_FILE):
        try:
            api.upload_file(
                path_or_fileobj=PKG_FILE,
                path_in_repo="user-packages.list",
                repo_id=HF_DATASET_REPO,
                repo_type="dataset",
                commit_message=f"packages {ts}",
            )
            log(" uploaded user-packages.list")
        except Exception as e:
            log(f" upload packages failed: {e}")

    log("══ SYNC: done ══")
| 256 |
+
|
| 257 |
+
|
| 258 |
+
# ── Sync Thread ───────────────────────────────────────────────────────
|
| 259 |
+
def sync_loop():
    """Background thread body: archive + upload state every SYNC_INTERVAL
    seconds, forever. Errors are logged, never fatal."""
    log("sync thread: waiting 60s before first sync")
    time.sleep(60)  # let startup settle before the first archive
    cycle = 1
    while True:
        log(f"── sync cycle #{cycle}")
        try:
            save_and_upload()
        except Exception as exc:
            log(f" sync error: {exc}")
        time.sleep(SYNC_INTERVAL)
        cycle += 1
| 271 |
+
|
| 272 |
+
|
| 273 |
+
# ── Log Streamer (SSE) ───────────────────────────────────────────────
|
| 274 |
+
class LogSSEHandler(http.server.BaseHTTPRequestHandler):
    """Serve GET /stream as Server-Sent Events: replay LOGFILE, then tail it.

    Any other path returns 404. Improvement vs. original: the SSE event
    serialization was duplicated verbatim in the replay and tail phases;
    it is extracted into _emit().
    """

    def log_message(self, format, *args):
        # Silence BaseHTTPRequestHandler's default per-request stderr log.
        pass

    def _emit(self, line):
        """Send one log line as an SSE 'data:' event with a JSON payload."""
        event = json.dumps({
            "data": line + "\n",
            "timestamp": datetime.now(timezone.utc).isoformat()
        })
        self.wfile.write(f"data: {event}\n\n".encode())
        self.wfile.flush()

    def do_GET(self):
        if self.path != "/stream":
            self.send_response(404)
            self.end_headers()
            return
        self.send_response(200)
        self.send_header("Content-Type", "text/event-stream")
        self.send_header("Cache-Control", "no-cache")
        self.send_header("Connection", "keep-alive")
        self.send_header("Access-Control-Allow-Origin", "*")
        self.end_headers()
        try:
            # Phase 1: replay existing log content, skipping blank lines.
            if os.path.exists(LOGFILE):
                with open(LOGFILE) as f:
                    for line in f:
                        line = line.rstrip("\n")
                        if line:
                            self._emit(line)
            # Phase 2: tail the file forever; emit SSE keep-alive comments
            # while idle so proxies don't drop the connection.
            with open(LOGFILE) as f:
                f.seek(0, 2)  # jump to EOF
                while True:
                    line = f.readline()
                    if line:
                        line = line.rstrip("\n")
                        if line:
                            self._emit(line)
                    else:
                        self.wfile.write(b": keep-alive\n\n")
                        self.wfile.flush()
                        time.sleep(1)
        except (BrokenPipeError, ConnectionResetError):
            pass  # client disconnected; nothing to clean up
| 320 |
+
|
| 321 |
+
|
| 322 |
+
def start_log_streamer():
    """Start the SSE log endpoint on localhost:7863 in a daemon thread."""
    sse_server = http.server.HTTPServer(("127.0.0.1", 7863), LogSSEHandler)
    threading.Thread(target=sse_server.serve_forever, daemon=True).start()
    log("[ OK ] log streamer on 127.0.0.1:7863")
| 327 |
+
|
| 328 |
+
|
| 329 |
+
# ── Service Management ────────────────────────────────────────────────
|
| 330 |
+
def start_sshd():
    """Launch sshd bound to localhost (reached via the WS bridge); return
    the Popen handle. Logs OK/FAILED after a one-second liveness check."""
    log(f"starting sshd on 127.0.0.1:{SSH_PORT} ...")
    os.makedirs("/run/sshd", exist_ok=True)
    options = {
        "Port": SSH_PORT,
        "ListenAddress": "127.0.0.1",
        "PermitRootLogin": "yes",
        "PasswordAuthentication": "yes",
        "PermitEmptyPasswords": "no",
        "UsePAM": "yes",
    }
    argv = ["/usr/sbin/sshd", "-D", "-e"]
    for key, value in options.items():
        argv += ["-o", f"{key}={value}"]
    proc = subprocess.Popen(argv)
    time.sleep(1)  # give sshd a moment to fail fast on bad config
    if proc.poll() is None:
        log(f"[ OK ] sshd PID={proc.pid}")
    else:
        log("[FAILED] sshd")
    return proc
| 348 |
+
|
| 349 |
+
|
| 350 |
+
def start_ttyd():
    """Launch the ttyd web terminal on localhost; return the Popen handle."""
    log(f"starting ttyd on 127.0.0.1:{TTYD_PORT} ...")
    argv = ["ttyd", "--port", TTYD_PORT, "--writable",
            "--base-path", "/", "bash", "--login"]
    proc = subprocess.Popen(argv)
    time.sleep(1)  # liveness check window
    alive = proc.poll() is None
    log(f"[ OK ] ttyd PID={proc.pid}" if alive else "[FAILED] ttyd")
    return proc
| 362 |
+
|
| 363 |
+
|
| 364 |
+
def start_ws_bridge():
    """Launch the WebSocket→SSH bridge script; return the Popen handle."""
    log("starting ws-ssh-bridge on 127.0.0.1:7862 ...")
    proc = subprocess.Popen([sys.executable, "/ws_ssh_bridge.py"])
    time.sleep(1)  # liveness check window
    alive = proc.poll() is None
    log(f"[ OK ] ws-bridge PID={proc.pid}" if alive else "[FAILED] ws-bridge")
    return proc
| 373 |
+
|
| 374 |
+
|
| 375 |
+
def system_info():
    """Log kernel version, CPU count, total memory, and root-disk capacity."""
    log("── System Info ──")
    rc, out = run("uname -r")
    if rc == 0:
        log(f" Kernel: {out}")
    rc, out = run("nproc")
    if rc == 0:
        log(f" CPU: {out} cores")
    # Total memory: second column of the Mem: row of `free -h`.
    rc, out = run("free -h 2>/dev/null | grep Mem: | tr -s ' ' | cut -d' ' -f2")
    if rc == 0 and out:
        log(f" Memory: {out}")
    # Root fs size (col 2) and available (col 4), rendered "X total, Y free".
    rc, out = run("df -h / 2>/dev/null | tail -1 | tr -s ' ' | cut -d' ' -f2,4 | sed 's/ / total, /;s/$/ free/'")
    if rc == 0 and out:
        log(f" Disk: {out}")
| 389 |
+
|
| 390 |
+
|
| 391 |
+
# ── Heartbeat ─────────────────────────────────────────────────────────
|
| 392 |
+
def heartbeat_loop():
    """Background thread body: log a load/memory heartbeat every 60s, forever.

    Fix vs. original: `cut -d' ' -f3,2` emits fields in *input* order, so
    the memory read logged "total/used"; awk prints "used/total" as the
    used/total format clearly intended.
    """
    while True:
        time.sleep(60)
        try:
            rc, loadavg = run("cat /proc/loadavg 2>/dev/null | cut -d' ' -f1-3")
            rc2, mem = run("free -h 2>/dev/null | awk '/Mem:/ {print $3 \"/\" $2}'")
            log(f"heartbeat: load={loadavg} mem={mem}")
        except Exception:
            log("heartbeat: ok")
| 401 |
+
|
| 402 |
+
|
| 403 |
+
# ── Main ──────────────────────────────────────────────────────────────
|
| 404 |
+
def main():
    """Init, restore persisted state, start services, then run nginx.

    Fix vs. original: SIGTERM/SIGINT handlers were registered and then
    `os.execvp("nginx", ...)` replaced the process image — exec discards
    Python signal handlers, so the "final save" could never run. nginx is
    now a child process and this process waits on it, keeping the handlers
    alive to upload state on shutdown.
    """
    os.makedirs(PERSIST_PATH, exist_ok=True)
    open(LOGFILE, "a").close()  # ensure the log file exists for tailers

    resolve_config()
    system_info()
    ensure_dataset_repo()
    restore_state()
    restore_packages()
    ensure_passwords()

    # Write env for other processes (e.g. login shells sourcing it).
    with open("/etc/huggingrun.env", "w") as f:
        f.write(f'export HF_TOKEN="{HF_TOKEN}"\n')
        f.write(f'export HF_DATASET_REPO="{HF_DATASET_REPO}"\n')
        f.write(f'export PERSIST_PATH="{PERSIST_PATH}"\n')

    # Start services (each returns a Popen; failures are logged inside).
    start_sshd()
    start_ws_bridge()
    start_ttyd()
    start_log_streamer()

    # Background threads (daemon: they die with this process).
    threading.Thread(target=sync_loop, daemon=True).start()
    threading.Thread(target=heartbeat_loop, daemon=True).start()

    log("========================================")
    log("system ready")
    log(f" Terminal: https://<space>.hf.space/")
    log(f" Logs: https://<space>.hf.space/runlog")
    log(f" SSE: https://<space>.hf.space/runlog/stream")
    log("========================================")

    # nginx runs as a child so this process keeps handling signals and can
    # perform the final state upload on shutdown.
    log("starting nginx on 0.0.0.0:7860 ...")
    nginx = subprocess.Popen(["nginx", "-c", "/etc/nginx/nginx.conf",
                              "-g", "daemon off;"])

    def on_sigterm(sig, frame):
        log(f"signal {sig} — final save ...")
        try:
            save_and_upload()
        except Exception as e:
            log(f"final save error: {e}")
        nginx.terminate()
        sys.exit(0)

    signal.signal(signal.SIGTERM, on_sigterm)
    signal.signal(signal.SIGINT, on_sigterm)

    sys.exit(nginx.wait())
| 453 |
+
|
| 454 |
+
|
| 455 |
+
# Script entry point — the Docker ENTRYPOINT runs this file directly.
if __name__ == "__main__":
    main()
|
ubuntu-server/nginx.conf → nginx.conf
RENAMED
|
@@ -22,7 +22,7 @@ http {
|
|
| 22 |
server {
|
| 23 |
listen 7860;
|
| 24 |
|
| 25 |
-
# /ssh → WebSocket-to-SSH bridge
|
| 26 |
location /ssh {
|
| 27 |
proxy_pass http://127.0.0.1:7862;
|
| 28 |
proxy_http_version 1.1;
|
|
@@ -34,7 +34,6 @@ http {
|
|
| 34 |
}
|
| 35 |
|
| 36 |
# /runlog → full log file (static)
|
| 37 |
-
# Note: /logs and /api/* are intercepted by HF's proxy
|
| 38 |
location = /runlog {
|
| 39 |
default_type text/plain;
|
| 40 |
add_header Cache-Control "no-cache, no-store";
|
|
@@ -52,7 +51,7 @@ http {
|
|
| 52 |
proxy_read_timeout 86400;
|
| 53 |
}
|
| 54 |
|
| 55 |
-
# Everything else → ttyd web terminal
|
| 56 |
location / {
|
| 57 |
proxy_pass http://127.0.0.1:7681;
|
| 58 |
proxy_http_version 1.1;
|
|
|
|
| 22 |
server {
|
| 23 |
listen 7860;
|
| 24 |
|
| 25 |
+
# /ssh → WebSocket-to-SSH bridge
|
| 26 |
location /ssh {
|
| 27 |
proxy_pass http://127.0.0.1:7862;
|
| 28 |
proxy_http_version 1.1;
|
|
|
|
| 34 |
}
|
| 35 |
|
| 36 |
# /runlog → full log file (static)
|
|
|
|
| 37 |
location = /runlog {
|
| 38 |
default_type text/plain;
|
| 39 |
add_header Cache-Control "no-cache, no-store";
|
|
|
|
| 51 |
proxy_read_timeout 86400;
|
| 52 |
}
|
| 53 |
|
| 54 |
+
# Everything else → ttyd web terminal
|
| 55 |
location / {
|
| 56 |
proxy_pass http://127.0.0.1:7681;
|
| 57 |
proxy_http_version 1.1;
|
scripts/curl_logs.sh
DELETED
|
@@ -1,33 +0,0 @@
|
|
| 1 |
-
#!/bin/bash
|
| 2 |
-
# Stream HF Space runtime/build logs via JWT-based SSE endpoint
|
| 3 |
-
# Usage: ./scripts/curl_logs.sh [run|build] (default: run)
|
| 4 |
-
# Requires HF_TOKEN in .env or environment
|
| 5 |
-
|
| 6 |
-
set -e
|
| 7 |
-
cd "$(dirname "$0")/.."
|
| 8 |
-
if [ -f .env ]; then
|
| 9 |
-
HF_TOKEN="${HF_TOKEN:-$(awk -F= '/^HF_TOKEN=/ {print $2; exit}' .env | tr -d '\r\n" ')}"
|
| 10 |
-
export HF_TOKEN
|
| 11 |
-
fi
|
| 12 |
-
# Fallback to cached token
|
| 13 |
-
if [ -z "$HF_TOKEN" ] && [ -f ~/.cache/huggingface/token ]; then
|
| 14 |
-
HF_TOKEN="$(cat ~/.cache/huggingface/token)"
|
| 15 |
-
export HF_TOKEN
|
| 16 |
-
fi
|
| 17 |
-
if [ -z "$HF_TOKEN" ]; then
|
| 18 |
-
echo "HF_TOKEN not set (add to .env or export)" >&2
|
| 19 |
-
exit 1
|
| 20 |
-
fi
|
| 21 |
-
|
| 22 |
-
SPACE_ID="${SPACE_ID:-tao-shen/HuggingRun}"
|
| 23 |
-
LOG="${1:-run}"
|
| 24 |
-
|
| 25 |
-
echo "=== Streaming ${LOG} logs for ${SPACE_ID} ===" >&2
|
| 26 |
-
echo "=== Method 1: Direct API (huggingface.co) ===" >&2
|
| 27 |
-
echo "=== Press Ctrl+C to stop ===" >&2
|
| 28 |
-
echo "" >&2
|
| 29 |
-
|
| 30 |
-
# Method 1: Direct HF API (works for most cases)
|
| 31 |
-
curl -sN \
|
| 32 |
-
-H "Authorization: Bearer $HF_TOKEN" \
|
| 33 |
-
"https://huggingface.co/api/spaces/$SPACE_ID/logs/$LOG"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/entrypoint.sh
DELETED
|
@@ -1,92 +0,0 @@
|
|
| 1 |
-
#!/bin/bash
|
| 2 |
-
# ─────────────────────────────────────────────────────────────────────
|
| 3 |
-
# HuggingRun Entrypoint
|
| 4 |
-
# 1. Init: download dataset + restore filesystem (foreground, logs visible)
|
| 5 |
-
# 2. Start sync-loop daemon in background
|
| 6 |
-
# 3. exec user command (start-server.sh opens port 7860)
|
| 7 |
-
# ─────────────────────────────────────────────────────────────────────
|
| 8 |
-
set -e
|
| 9 |
-
|
| 10 |
-
LOGFILE="/var/log/huggingrun.log"
|
| 11 |
-
touch "$LOGFILE"
|
| 12 |
-
|
| 13 |
-
# Log to stderr (HF captures stderr for Docker SDK spaces) + logfile
|
| 14 |
-
log() {
|
| 15 |
-
echo "$@" >&2
|
| 16 |
-
echo "$@" >> "$LOGFILE"
|
| 17 |
-
}
|
| 18 |
-
|
| 19 |
-
log "========================================"
|
| 20 |
-
log "[entrypoint] HuggingRun starting ..."
|
| 21 |
-
log "[entrypoint] Date: $(date -u)"
|
| 22 |
-
log "[entrypoint] SPACE_ID=${SPACE_ID:-not set}"
|
| 23 |
-
log "[entrypoint] HF_TOKEN=${HF_TOKEN:+set (${#HF_TOKEN} chars)}${HF_TOKEN:-NOT SET}"
|
| 24 |
-
log "[entrypoint] HF_DATASET_REPO=${HF_DATASET_REPO:-not set}"
|
| 25 |
-
log "[entrypoint] PERSIST_PATH=${PERSIST_PATH:-/data}"
|
| 26 |
-
log "[entrypoint] RUN_CMD=${RUN_CMD:-default}"
|
| 27 |
-
log "========================================"
|
| 28 |
-
|
| 29 |
-
# Determine dataset repo
|
| 30 |
-
if [ -z "$HF_DATASET_REPO" ]; then
|
| 31 |
-
if [ -n "$SPACE_ID" ]; then
|
| 32 |
-
export HF_DATASET_REPO="${SPACE_ID}-data"
|
| 33 |
-
log "[entrypoint] Auto-set HF_DATASET_REPO=${HF_DATASET_REPO}"
|
| 34 |
-
elif [ -n "$HF_TOKEN" ]; then
|
| 35 |
-
log "[entrypoint] Resolving HF_DATASET_REPO from token ..."
|
| 36 |
-
export HF_DATASET_REPO=$(python3 -c "
|
| 37 |
-
from huggingface_hub import HfApi
|
| 38 |
-
import os
|
| 39 |
-
try:
|
| 40 |
-
name = HfApi(token=os.environ['HF_TOKEN']).whoami()['name']
|
| 41 |
-
print(name + '/HuggingRun-data')
|
| 42 |
-
except:
|
| 43 |
-
print('')
|
| 44 |
-
" 2>/dev/null)
|
| 45 |
-
log "[entrypoint] Resolved: ${HF_DATASET_REPO}"
|
| 46 |
-
fi
|
| 47 |
-
fi
|
| 48 |
-
|
| 49 |
-
# Ensure dataset repo exists
|
| 50 |
-
if [ -n "$HF_TOKEN" ] && [ -n "$HF_DATASET_REPO" ]; then
|
| 51 |
-
log "[entrypoint] Verifying dataset: ${HF_DATASET_REPO} ..."
|
| 52 |
-
python3 -c "
|
| 53 |
-
from huggingface_hub import HfApi
|
| 54 |
-
import os
|
| 55 |
-
api = HfApi(token=os.environ['HF_TOKEN'])
|
| 56 |
-
repo = os.environ['HF_DATASET_REPO']
|
| 57 |
-
try:
|
| 58 |
-
api.repo_info(repo_id=repo, repo_type='dataset')
|
| 59 |
-
print(f'[entrypoint] Dataset exists: {repo}', flush=True)
|
| 60 |
-
except:
|
| 61 |
-
api.create_repo(repo_id=repo, repo_type='dataset', private=True)
|
| 62 |
-
print(f'[entrypoint] Created dataset: {repo}', flush=True)
|
| 63 |
-
" 2>&1 || echo "[entrypoint] WARNING: Could not verify dataset" >&2
|
| 64 |
-
else
|
| 65 |
-
log "[entrypoint] WARNING: persistence disabled (no token/repo)"
|
| 66 |
-
fi
|
| 67 |
-
|
| 68 |
-
# Write env for other processes
|
| 69 |
-
cat > /etc/huggingrun.env << ENVEOF
|
| 70 |
-
export HF_TOKEN="${HF_TOKEN}"
|
| 71 |
-
export HF_DATASET_REPO="${HF_DATASET_REPO}"
|
| 72 |
-
export PERSIST_PATH="${PERSIST_PATH:-/data}"
|
| 73 |
-
ENVEOF
|
| 74 |
-
log "[entrypoint] Wrote /etc/huggingrun.env"
|
| 75 |
-
|
| 76 |
-
# Step 1: Init — download dataset + restore filesystem (foreground so logs are visible)
|
| 77 |
-
log "[entrypoint] ── Step 1: Init (download + restore) ──"
|
| 78 |
-
python3 -u /scripts/git_sync_daemon.py init
|
| 79 |
-
log "[entrypoint] ── Init done ──"
|
| 80 |
-
|
| 81 |
-
# Step 2: Start background sync-loop daemon
|
| 82 |
-
log "[entrypoint] ── Step 2: Starting sync-loop daemon ──"
|
| 83 |
-
python3 -u /scripts/git_sync_daemon.py sync-loop &
|
| 84 |
-
SYNC_PID=$!
|
| 85 |
-
log "[entrypoint] Sync daemon PID=${SYNC_PID}"
|
| 86 |
-
|
| 87 |
-
# Step 3: Run user command (opens port 7860)
|
| 88 |
-
CMD="${RUN_CMD:-python3 /app/demo_app.py}"
|
| 89 |
-
log "[entrypoint] ── Step 3: exec user command ──"
|
| 90 |
-
log "[entrypoint] CMD=${CMD}"
|
| 91 |
-
log "========================================"
|
| 92 |
-
exec $CMD
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/entrypoint_wrapper.py
DELETED
|
@@ -1,53 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""PID 1 wrapper for HuggingRun.
|
| 3 |
-
|
| 4 |
-
HF's runtime log SSE captures stdout from PID 1. Gradio (Python) apps work.
|
| 5 |
-
Bash entrypoints do not. This Python wrapper becomes PID 1 and ensures all
|
| 6 |
-
subprocess output is captured and forwarded to stdout.
|
| 7 |
-
"""
|
| 8 |
-
import os
|
| 9 |
-
import signal
|
| 10 |
-
import subprocess
|
| 11 |
-
import sys
|
| 12 |
-
import threading
|
| 13 |
-
|
| 14 |
-
def stream_pipe(pipe, prefix=""):
|
| 15 |
-
"""Read lines from a pipe and print them to stdout (flushed)."""
|
| 16 |
-
for line in iter(pipe.readline, b""):
|
| 17 |
-
text = line.decode("utf-8", errors="replace").rstrip("\n")
|
| 18 |
-
print(f"{prefix}{text}", flush=True)
|
| 19 |
-
pipe.close()
|
| 20 |
-
|
| 21 |
-
def main():
|
| 22 |
-
print("[wrapper] PID 1 Python wrapper starting", flush=True)
|
| 23 |
-
print(f"[wrapper] Python {sys.version}", flush=True)
|
| 24 |
-
print(f"[wrapper] PID={os.getpid()}", flush=True)
|
| 25 |
-
|
| 26 |
-
# Run the real entrypoint as a subprocess, capture its output
|
| 27 |
-
proc = subprocess.Popen(
|
| 28 |
-
["/bin/bash", "/scripts/entrypoint.sh"],
|
| 29 |
-
stdout=subprocess.PIPE,
|
| 30 |
-
stderr=subprocess.STDOUT, # merge stderr into stdout
|
| 31 |
-
bufsize=0, # unbuffered
|
| 32 |
-
)
|
| 33 |
-
|
| 34 |
-
# Forward signals
|
| 35 |
-
def forward_signal(sig, frame):
|
| 36 |
-
print(f"[wrapper] Forwarding signal {sig} to PID {proc.pid}", flush=True)
|
| 37 |
-
proc.send_signal(sig)
|
| 38 |
-
|
| 39 |
-
signal.signal(signal.SIGTERM, forward_signal)
|
| 40 |
-
signal.signal(signal.SIGINT, forward_signal)
|
| 41 |
-
|
| 42 |
-
# Stream output in a thread
|
| 43 |
-
t = threading.Thread(target=stream_pipe, args=(proc.stdout,), daemon=True)
|
| 44 |
-
t.start()
|
| 45 |
-
|
| 46 |
-
# Wait for process
|
| 47 |
-
proc.wait()
|
| 48 |
-
t.join(timeout=2)
|
| 49 |
-
print(f"[wrapper] entrypoint.sh exited with code {proc.returncode}", flush=True)
|
| 50 |
-
sys.exit(proc.returncode)
|
| 51 |
-
|
| 52 |
-
if __name__ == "__main__":
|
| 53 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/ssh_connect.sh
DELETED
|
@@ -1,38 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env bash
|
| 2 |
-
# ─────────────────────────────────────────────────────────────────────
|
| 3 |
-
# SSH into HuggingRun via WebSocket (single port 7860)
|
| 4 |
-
# No jumphost needed — SSH tunnels through the HF Space's web port.
|
| 5 |
-
#
|
| 6 |
-
# Prerequisites:
|
| 7 |
-
# brew install websocat # macOS
|
| 8 |
-
# # or download from https://github.com/nicehash/websocat/releases
|
| 9 |
-
#
|
| 10 |
-
# Usage:
|
| 11 |
-
# bash scripts/ssh_connect.sh # default: tao-shen-huggingrun.hf.space
|
| 12 |
-
# bash scripts/ssh_connect.sh my-space.hf.space # custom space URL
|
| 13 |
-
# SSH_USER=root bash scripts/ssh_connect.sh # login as root
|
| 14 |
-
# ─────────────────────────────────────────────────────────────────────
|
| 15 |
-
set -euo pipefail
|
| 16 |
-
|
| 17 |
-
SPACE_HOST="${1:-tao-shen-huggingrun.hf.space}"
|
| 18 |
-
SSH_USER="${SSH_USER:-user}"
|
| 19 |
-
WS_URL="wss://${SPACE_HOST}/ssh"
|
| 20 |
-
|
| 21 |
-
# Check websocat
|
| 22 |
-
if ! command -v websocat &>/dev/null; then
|
| 23 |
-
echo "websocat not found. Install it:"
|
| 24 |
-
echo " macOS: brew install websocat"
|
| 25 |
-
echo " Linux: curl -L -o ~/.local/bin/websocat https://github.com/nicehash/websocat/releases/latest/download/websocat.x86_64-unknown-linux-musl && chmod +x ~/.local/bin/websocat"
|
| 26 |
-
exit 1
|
| 27 |
-
fi
|
| 28 |
-
|
| 29 |
-
echo "Connecting to ${SSH_USER}@${SPACE_HOST} via WebSocket SSH ..."
|
| 30 |
-
echo " WebSocket: ${WS_URL}"
|
| 31 |
-
echo " Password: huggingrun"
|
| 32 |
-
echo ""
|
| 33 |
-
|
| 34 |
-
# SSH with WebSocket as ProxyCommand
|
| 35 |
-
ssh -o "ProxyCommand=websocat --binary ${WS_URL}" \
|
| 36 |
-
-o StrictHostKeyChecking=no \
|
| 37 |
-
-o UserKnownHostsFile=/dev/null \
|
| 38 |
-
"${SSH_USER}@huggingrun"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/stream_logs.py
DELETED
|
@@ -1,105 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""Stream HF Space runtime logs via JWT-authenticated SSE endpoint.
|
| 3 |
-
|
| 4 |
-
Usage:
|
| 5 |
-
python3 scripts/stream_logs.py [run|build]
|
| 6 |
-
|
| 7 |
-
This uses the same endpoint as the HF web UI's "Container logs" tab:
|
| 8 |
-
1. Fetch JWT from https://huggingface.co/api/spaces/{id}/jwt
|
| 9 |
-
2. Stream SSE from https://api.hf.space/v1/{id}/logs/{level}
|
| 10 |
-
"""
|
| 11 |
-
import json
|
| 12 |
-
import os
|
| 13 |
-
import sys
|
| 14 |
-
|
| 15 |
-
def get_token():
|
| 16 |
-
"""Get HF token from env, .env file, or cached token."""
|
| 17 |
-
token = os.environ.get("HF_TOKEN", "")
|
| 18 |
-
if not token:
|
| 19 |
-
env_file = os.path.join(os.path.dirname(__file__), "..", ".env")
|
| 20 |
-
if os.path.exists(env_file):
|
| 21 |
-
for line in open(env_file):
|
| 22 |
-
if line.startswith("HF_TOKEN="):
|
| 23 |
-
token = line.split("=", 1)[1].strip().strip('"').strip("'")
|
| 24 |
-
break
|
| 25 |
-
if not token:
|
| 26 |
-
cached = os.path.expanduser("~/.cache/huggingface/token")
|
| 27 |
-
if os.path.exists(cached):
|
| 28 |
-
token = open(cached).read().strip()
|
| 29 |
-
return token
|
| 30 |
-
|
| 31 |
-
def stream_logs(space_id, level="run"):
|
| 32 |
-
import requests
|
| 33 |
-
|
| 34 |
-
token = get_token()
|
| 35 |
-
if not token:
|
| 36 |
-
print("ERROR: HF_TOKEN not found", file=sys.stderr)
|
| 37 |
-
sys.exit(1)
|
| 38 |
-
|
| 39 |
-
print(f"=== Streaming {level} logs for {space_id} ===", file=sys.stderr)
|
| 40 |
-
|
| 41 |
-
# Step 1: Get JWT token
|
| 42 |
-
print("Fetching JWT token...", file=sys.stderr)
|
| 43 |
-
resp = requests.get(
|
| 44 |
-
f"https://huggingface.co/api/spaces/{space_id}/jwt",
|
| 45 |
-
headers={"Authorization": f"Bearer {token}"},
|
| 46 |
-
)
|
| 47 |
-
if resp.status_code != 200:
|
| 48 |
-
print(f"JWT fetch failed ({resp.status_code}): {resp.text}", file=sys.stderr)
|
| 49 |
-
print("Falling back to direct API...", file=sys.stderr)
|
| 50 |
-
# Fallback to direct API
|
| 51 |
-
resp = requests.get(
|
| 52 |
-
f"https://huggingface.co/api/spaces/{space_id}/logs/{level}",
|
| 53 |
-
headers={"Authorization": f"Bearer {token}"},
|
| 54 |
-
stream=True,
|
| 55 |
-
)
|
| 56 |
-
for line in resp.iter_lines():
|
| 57 |
-
if line:
|
| 58 |
-
line = line.decode("utf-8", errors="replace")
|
| 59 |
-
if line.startswith("data: "):
|
| 60 |
-
try:
|
| 61 |
-
event = json.loads(line[6:])
|
| 62 |
-
ts = event.get("timestamp", "")
|
| 63 |
-
data = event.get("data", "")
|
| 64 |
-
print(f"[{ts}] {data}", end="", flush=True)
|
| 65 |
-
except json.JSONDecodeError:
|
| 66 |
-
print(line, flush=True)
|
| 67 |
-
else:
|
| 68 |
-
print(line, flush=True)
|
| 69 |
-
return
|
| 70 |
-
|
| 71 |
-
jwt = resp.json()["token"]
|
| 72 |
-
print(f"JWT obtained (expires in token payload)", file=sys.stderr)
|
| 73 |
-
|
| 74 |
-
# Step 2: Stream logs via SSE
|
| 75 |
-
print(f"Connecting to api.hf.space SSE stream...", file=sys.stderr)
|
| 76 |
-
print("---", file=sys.stderr)
|
| 77 |
-
|
| 78 |
-
resp = requests.get(
|
| 79 |
-
f"https://api.hf.space/v1/{space_id}/logs/{level}",
|
| 80 |
-
headers={"Authorization": f"Bearer {jwt}"},
|
| 81 |
-
stream=True,
|
| 82 |
-
)
|
| 83 |
-
|
| 84 |
-
if resp.status_code != 200:
|
| 85 |
-
print(f"SSE stream failed ({resp.status_code}): {resp.text}", file=sys.stderr)
|
| 86 |
-
sys.exit(1)
|
| 87 |
-
|
| 88 |
-
for line in resp.iter_lines():
|
| 89 |
-
if line:
|
| 90 |
-
line = line.decode("utf-8", errors="replace")
|
| 91 |
-
if line.startswith("data: "):
|
| 92 |
-
try:
|
| 93 |
-
event = json.loads(line[6:])
|
| 94 |
-
ts = event.get("timestamp", "")
|
| 95 |
-
data = event.get("data", "")
|
| 96 |
-
print(f"[{ts}] {data}", end="", flush=True)
|
| 97 |
-
except json.JSONDecodeError:
|
| 98 |
-
print(line, flush=True)
|
| 99 |
-
else:
|
| 100 |
-
print(line, flush=True)
|
| 101 |
-
|
| 102 |
-
if __name__ == "__main__":
|
| 103 |
-
space_id = os.environ.get("SPACE_ID", "tao-shen/HuggingRun")
|
| 104 |
-
level = sys.argv[1] if len(sys.argv) > 1 else "run"
|
| 105 |
-
stream_logs(space_id, level)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/sync_hf.py
DELETED
|
@@ -1,233 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
HuggingRun — Generic persistence for any app on Hugging Face Spaces
|
| 4 |
-
====================================================================
|
| 5 |
-
|
| 6 |
-
- Startup: snapshot_download from HF Dataset → PERSIST_PATH
|
| 7 |
-
- Periodic: upload_folder PERSIST_PATH → dataset
|
| 8 |
-
- Shutdown: final upload
|
| 9 |
-
- Then: exec user's RUN_CMD (or default demo app)
|
| 10 |
-
"""
|
| 11 |
-
|
| 12 |
-
import os
|
| 13 |
-
import sys
|
| 14 |
-
import time
|
| 15 |
-
import threading
|
| 16 |
-
import subprocess
|
| 17 |
-
import signal
|
| 18 |
-
import shutil
|
| 19 |
-
import tempfile
|
| 20 |
-
import traceback
|
| 21 |
-
from pathlib import Path
|
| 22 |
-
from datetime import datetime
|
| 23 |
-
|
| 24 |
-
os.environ.setdefault("HF_HUB_DOWNLOAD_TIMEOUT", "300")
|
| 25 |
-
os.environ.setdefault("HF_HUB_UPLOAD_TIMEOUT", "600")
|
| 26 |
-
|
| 27 |
-
from huggingface_hub import HfApi, snapshot_download
|
| 28 |
-
|
| 29 |
-
# ── Configuration ───────────────────────────────────────────────────────────
|
| 30 |
-
# See docs/HF_LIMITATIONS.md for how HuggingRun addresses HF Spaces limits.
|
| 31 |
-
|
| 32 |
-
HF_TOKEN = os.environ.get("HF_TOKEN")
|
| 33 |
-
PERSIST_PATH = Path(os.environ.get("PERSIST_PATH", "/data"))
|
| 34 |
-
RUN_CMD = os.environ.get("RUN_CMD", "")
|
| 35 |
-
APP_PORT = os.environ.get("APP_PORT", os.environ.get("PORT", "7860")) # Single port exposed by HF
|
| 36 |
-
SYNC_INTERVAL = int(os.environ.get("SYNC_INTERVAL", "60"))
|
| 37 |
-
AUTO_CREATE_DATASET = os.environ.get("AUTO_CREATE_DATASET", "false").lower() in ("true", "1", "yes")
|
| 38 |
-
# Store directly at dataset root (no subfolder)
|
| 39 |
-
DATASET_SUBFOLDER = ""
|
| 40 |
-
|
| 41 |
-
SPACE_ID = os.environ.get("SPACE_ID", "")
|
| 42 |
-
HF_REPO_ID = os.environ.get("HF_DATASET_REPO", "")
|
| 43 |
-
if not HF_REPO_ID and SPACE_ID:
|
| 44 |
-
HF_REPO_ID = f"{SPACE_ID}-data"
|
| 45 |
-
if not HF_REPO_ID and HF_TOKEN:
|
| 46 |
-
try:
|
| 47 |
-
api = HfApi(token=HF_TOKEN)
|
| 48 |
-
uname = api.whoami()["name"]
|
| 49 |
-
HF_REPO_ID = f"{uname}/HuggingRun-data"
|
| 50 |
-
del api, uname
|
| 51 |
-
except Exception:
|
| 52 |
-
HF_REPO_ID = ""
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
class GenericSync:
|
| 56 |
-
"""Upload/download PERSIST_PATH to/from HF Dataset."""
|
| 57 |
-
|
| 58 |
-
def __init__(self):
|
| 59 |
-
self.enabled = False
|
| 60 |
-
self.dataset_exists = False
|
| 61 |
-
self.api = None
|
| 62 |
-
|
| 63 |
-
if not HF_TOKEN:
|
| 64 |
-
print("[HuggingRun] HF_TOKEN not set. Persistence disabled.")
|
| 65 |
-
return
|
| 66 |
-
if not HF_REPO_ID:
|
| 67 |
-
print("[HuggingRun] HF_DATASET_REPO/SPACE_ID not set. Persistence disabled.")
|
| 68 |
-
return
|
| 69 |
-
|
| 70 |
-
self.enabled = True
|
| 71 |
-
self.api = HfApi(token=HF_TOKEN)
|
| 72 |
-
self.dataset_exists = self._ensure_repo_exists()
|
| 73 |
-
|
| 74 |
-
def _ensure_repo_exists(self):
|
| 75 |
-
try:
|
| 76 |
-
self.api.repo_info(repo_id=HF_REPO_ID, repo_type="dataset")
|
| 77 |
-
print(f"[HuggingRun] Dataset found: {HF_REPO_ID}")
|
| 78 |
-
return True
|
| 79 |
-
except Exception:
|
| 80 |
-
if not AUTO_CREATE_DATASET:
|
| 81 |
-
print(f"[HuggingRun] Dataset not found: {HF_REPO_ID}. Set AUTO_CREATE_DATASET=true to create.")
|
| 82 |
-
return False
|
| 83 |
-
try:
|
| 84 |
-
self.api.create_repo(
|
| 85 |
-
repo_id=HF_REPO_ID,
|
| 86 |
-
repo_type="dataset",
|
| 87 |
-
private=True,
|
| 88 |
-
)
|
| 89 |
-
print(f"[HuggingRun] Created dataset: {HF_REPO_ID}")
|
| 90 |
-
return True
|
| 91 |
-
except Exception as e:
|
| 92 |
-
print(f"[HuggingRun] Failed to create dataset: {e}")
|
| 93 |
-
return False
|
| 94 |
-
|
| 95 |
-
def load_from_repo(self):
|
| 96 |
-
"""Download dataset → PERSIST_PATH."""
|
| 97 |
-
if not self.enabled or not self.dataset_exists:
|
| 98 |
-
PERSIST_PATH.mkdir(parents=True, exist_ok=True)
|
| 99 |
-
return
|
| 100 |
-
|
| 101 |
-
try:
|
| 102 |
-
files = self.api.list_repo_files(repo_id=HF_REPO_ID, repo_type="dataset")
|
| 103 |
-
# Filter out metadata files (.gitattributes, README.md, etc.)
|
| 104 |
-
data_files = [f for f in files if not f.startswith(".") and f != "README.md"]
|
| 105 |
-
if not data_files:
|
| 106 |
-
print(f"[HuggingRun] Dataset empty. Starting fresh.")
|
| 107 |
-
PERSIST_PATH.mkdir(parents=True, exist_ok=True)
|
| 108 |
-
return
|
| 109 |
-
|
| 110 |
-
print(f"[HuggingRun] Restoring {PERSIST_PATH} from {HF_REPO_ID} ({len(data_files)} files) ...")
|
| 111 |
-
PERSIST_PATH.mkdir(parents=True, exist_ok=True)
|
| 112 |
-
for attempt in range(2):
|
| 113 |
-
try:
|
| 114 |
-
snapshot_download(
|
| 115 |
-
repo_id=HF_REPO_ID,
|
| 116 |
-
repo_type="dataset",
|
| 117 |
-
local_dir=str(PERSIST_PATH),
|
| 118 |
-
token=HF_TOKEN,
|
| 119 |
-
ignore_patterns=[".git*", "README.md"],
|
| 120 |
-
)
|
| 121 |
-
break
|
| 122 |
-
except Exception as e:
|
| 123 |
-
if attempt == 0:
|
| 124 |
-
print(f"[HuggingRun] Restore attempt {attempt + 1} failed: {e}. Retrying...")
|
| 125 |
-
time.sleep(3)
|
| 126 |
-
else:
|
| 127 |
-
raise
|
| 128 |
-
print("[HuggingRun] Restore completed.")
|
| 129 |
-
except Exception as e:
|
| 130 |
-
print(f"[HuggingRun] Restore failed: {e}")
|
| 131 |
-
traceback.print_exc()
|
| 132 |
-
PERSIST_PATH.mkdir(parents=True, exist_ok=True)
|
| 133 |
-
|
| 134 |
-
def save_to_repo(self):
|
| 135 |
-
"""Upload PERSIST_PATH → dataset root."""
|
| 136 |
-
if not self.enabled or not self.dataset_exists:
|
| 137 |
-
return
|
| 138 |
-
if not PERSIST_PATH.exists():
|
| 139 |
-
return
|
| 140 |
-
|
| 141 |
-
try:
|
| 142 |
-
if not self._ensure_repo_exists():
|
| 143 |
-
return
|
| 144 |
-
self.api.upload_folder(
|
| 145 |
-
folder_path=str(PERSIST_PATH),
|
| 146 |
-
path_in_repo="", # directly at dataset root
|
| 147 |
-
repo_id=HF_REPO_ID,
|
| 148 |
-
repo_type="dataset",
|
| 149 |
-
token=HF_TOKEN,
|
| 150 |
-
commit_message=f"HuggingRun sync — {datetime.now().isoformat()}",
|
| 151 |
-
ignore_patterns=["__pycache__", "*.pyc", ".git", ".git*"],
|
| 152 |
-
)
|
| 153 |
-
print(f"[HuggingRun] Upload completed.")
|
| 154 |
-
except Exception as e:
|
| 155 |
-
print(f"[HuggingRun] Upload failed: {e}")
|
| 156 |
-
traceback.print_exc()
|
| 157 |
-
|
| 158 |
-
def background_sync_loop(self, stop_event):
|
| 159 |
-
while not stop_event.is_set():
|
| 160 |
-
if stop_event.wait(timeout=SYNC_INTERVAL):
|
| 161 |
-
break
|
| 162 |
-
self.save_to_repo()
|
| 163 |
-
|
| 164 |
-
def run_user_cmd(self):
|
| 165 |
-
"""Run RUN_CMD (or default demo). Returns Popen process or None."""
|
| 166 |
-
cmd = (RUN_CMD or "python3 /app/demo_app.py").strip()
|
| 167 |
-
if not cmd:
|
| 168 |
-
print("[HuggingRun] RUN_CMD empty and no default demo. Exiting.")
|
| 169 |
-
return None
|
| 170 |
-
print(f"[HuggingRun] Running: {cmd}")
|
| 171 |
-
try:
|
| 172 |
-
env = os.environ.copy()
|
| 173 |
-
env.setdefault("PORT", APP_PORT)
|
| 174 |
-
env.setdefault("APP_PORT", APP_PORT)
|
| 175 |
-
process = subprocess.Popen(
|
| 176 |
-
cmd,
|
| 177 |
-
shell=True,
|
| 178 |
-
env=env,
|
| 179 |
-
stdout=sys.stdout,
|
| 180 |
-
stderr=sys.stderr,
|
| 181 |
-
)
|
| 182 |
-
print(f"[HuggingRun] Process started PID={process.pid}")
|
| 183 |
-
return process
|
| 184 |
-
except Exception as e:
|
| 185 |
-
print(f"[HuggingRun] Failed to start: {e}")
|
| 186 |
-
traceback.print_exc()
|
| 187 |
-
return None
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
def main():
|
| 191 |
-
try:
|
| 192 |
-
sync = GenericSync()
|
| 193 |
-
sync.load_from_repo()
|
| 194 |
-
|
| 195 |
-
stop_event = threading.Event()
|
| 196 |
-
t = threading.Thread(target=sync.background_sync_loop, args=(stop_event,), daemon=True)
|
| 197 |
-
t.start()
|
| 198 |
-
|
| 199 |
-
process = sync.run_user_cmd()
|
| 200 |
-
if process is None:
|
| 201 |
-
stop_event.set()
|
| 202 |
-
t.join(timeout=5)
|
| 203 |
-
sys.exit(1)
|
| 204 |
-
|
| 205 |
-
def on_signal(sig, frame):
|
| 206 |
-
print(f"\n[HuggingRun] Signal {sig}. Shutting down...")
|
| 207 |
-
stop_event.set()
|
| 208 |
-
t.join(timeout=10)
|
| 209 |
-
if process.poll() is None:
|
| 210 |
-
process.terminate()
|
| 211 |
-
try:
|
| 212 |
-
process.wait(timeout=5)
|
| 213 |
-
except subprocess.TimeoutExpired:
|
| 214 |
-
process.kill()
|
| 215 |
-
sync.save_to_repo()
|
| 216 |
-
sys.exit(0)
|
| 217 |
-
|
| 218 |
-
signal.signal(signal.SIGINT, on_signal)
|
| 219 |
-
signal.signal(signal.SIGTERM, on_signal)
|
| 220 |
-
|
| 221 |
-
code = process.wait()
|
| 222 |
-
stop_event.set()
|
| 223 |
-
t.join(timeout=10)
|
| 224 |
-
sync.save_to_repo()
|
| 225 |
-
sys.exit(code)
|
| 226 |
-
except Exception as e:
|
| 227 |
-
print(f"[HuggingRun] FATAL: {e}")
|
| 228 |
-
traceback.print_exc()
|
| 229 |
-
sys.exit(1)
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
if __name__ == "__main__":
|
| 233 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
scripts/test_local.sh
DELETED
|
@@ -1,143 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env bash
|
| 2 |
-
# ─────────────────────────────────────────────────────────────────────
|
| 3 |
-
# HuggingRun: Local integration test for Ubuntu Server
|
| 4 |
-
# Build Docker → run container → wait → test ttyd + SSH + persistence stress → cleanup
|
| 5 |
-
# Exit 0 only when ALL tests pass. Iterative TDD style.
|
| 6 |
-
#
|
| 7 |
-
# Usage:
|
| 8 |
-
# bash scripts/test_local.sh # full run
|
| 9 |
-
# SKIP_BUILD=1 bash scripts/test_local.sh # reuse existing image
|
| 10 |
-
# ─────────────────────────────────────────────────────────────────────
|
| 11 |
-
set -euo pipefail
|
| 12 |
-
|
| 13 |
-
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
|
| 14 |
-
cd "$REPO_ROOT"
|
| 15 |
-
|
| 16 |
-
IMAGE_NAME="huggingrun-ubuntu-server-test"
|
| 17 |
-
CONTAINER_NAME="huggingrun-test-$$"
|
| 18 |
-
TTYD_PORT=7860
|
| 19 |
-
SSH_PORT=2222
|
| 20 |
-
HOST_TTYD_PORT="${HOST_TTYD_PORT:-17860}"
|
| 21 |
-
HOST_SSH_PORT="${HOST_SSH_PORT:-12222}"
|
| 22 |
-
MAX_WAIT=120 # seconds to wait for services to be ready
|
| 23 |
-
SSH_USER="user"
|
| 24 |
-
SSH_STRESS_N="${SSH_STRESS_N:-30}"
|
| 25 |
-
SSH_CONCURRENCY="${SSH_CONCURRENCY:-10}"
|
| 26 |
-
|
| 27 |
-
RED='\033[0;31m'
|
| 28 |
-
GREEN='\033[0;32m'
|
| 29 |
-
YELLOW='\033[1;33m'
|
| 30 |
-
NC='\033[0m'
|
| 31 |
-
|
| 32 |
-
cleanup() {
|
| 33 |
-
echo ""
|
| 34 |
-
echo -e "${YELLOW}[cleanup] Stopping and removing container ${CONTAINER_NAME}...${NC}"
|
| 35 |
-
docker stop "$CONTAINER_NAME" 2>/dev/null || true
|
| 36 |
-
docker rm -f "$CONTAINER_NAME" 2>/dev/null || true
|
| 37 |
-
}
|
| 38 |
-
trap cleanup EXIT
|
| 39 |
-
|
| 40 |
-
# ── Phase 0: Build ──────────────────────────────────────────────────
|
| 41 |
-
if [ "${SKIP_BUILD:-}" != "1" ]; then
|
| 42 |
-
echo -e "${YELLOW}[build] Building Docker image: ${IMAGE_NAME}${NC}"
|
| 43 |
-
docker build -t "$IMAGE_NAME" . 2>&1 | tail -20
|
| 44 |
-
echo -e "${GREEN}[build] Image built successfully${NC}"
|
| 45 |
-
else
|
| 46 |
-
echo -e "${YELLOW}[build] SKIP_BUILD=1, using existing image${NC}"
|
| 47 |
-
fi
|
| 48 |
-
|
| 49 |
-
# ── Phase 1: Run container ──────────────────────────────────────────
|
| 50 |
-
echo ""
|
| 51 |
-
echo -e "${YELLOW}[run] Starting container: ${CONTAINER_NAME}${NC}"
|
| 52 |
-
echo -e "${YELLOW}[run] ttyd: localhost:${HOST_TTYD_PORT} → :${TTYD_PORT}${NC}"
|
| 53 |
-
echo -e "${YELLOW}[run] SSH: localhost:${HOST_SSH_PORT} → :${SSH_PORT}${NC}"
|
| 54 |
-
|
| 55 |
-
docker run -d \
|
| 56 |
-
--name "$CONTAINER_NAME" \
|
| 57 |
-
-p "${HOST_TTYD_PORT}:${TTYD_PORT}" \
|
| 58 |
-
-p "${HOST_SSH_PORT}:${SSH_PORT}" \
|
| 59 |
-
-e SSH_LISTEN=0.0.0.0 \
|
| 60 |
-
-e SSH_PORT=${SSH_PORT} \
|
| 61 |
-
"$IMAGE_NAME"
|
| 62 |
-
|
| 63 |
-
echo -e "${GREEN}[run] Container started${NC}"
|
| 64 |
-
|
| 65 |
-
# ── Phase 2: Wait for ttyd web terminal ──────────────────────────────
|
| 66 |
-
echo ""
|
| 67 |
-
echo -e "${YELLOW}[wait] Waiting for ttyd on localhost:${HOST_TTYD_PORT} (max ${MAX_WAIT}s)...${NC}"
|
| 68 |
-
START=$(date +%s)
|
| 69 |
-
TTYD_READY=false
|
| 70 |
-
while [ $(($(date +%s) - START)) -lt "$MAX_WAIT" ]; do
|
| 71 |
-
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:${HOST_TTYD_PORT}/" 2>/dev/null || echo "000")
|
| 72 |
-
if [ "$HTTP_CODE" = "200" ]; then
|
| 73 |
-
TTYD_READY=true
|
| 74 |
-
break
|
| 75 |
-
fi
|
| 76 |
-
echo -e " ttyd not ready (HTTP ${HTTP_CODE}), waiting 3s..."
|
| 77 |
-
sleep 3
|
| 78 |
-
done
|
| 79 |
-
|
| 80 |
-
if [ "$TTYD_READY" = false ]; then
|
| 81 |
-
echo -e "${RED}[FAIL] ttyd did not become ready within ${MAX_WAIT}s${NC}"
|
| 82 |
-
echo ""
|
| 83 |
-
echo "=== Container logs (last 50 lines) ==="
|
| 84 |
-
docker logs --tail 50 "$CONTAINER_NAME" 2>&1
|
| 85 |
-
exit 1
|
| 86 |
-
fi
|
| 87 |
-
echo -e "${GREEN}[wait] ttyd is ready (HTTP 200)${NC}"
|
| 88 |
-
|
| 89 |
-
# ── Phase 3: Wait for SSH ───────────────────────────────────────────
|
| 90 |
-
echo ""
|
| 91 |
-
echo -e "${YELLOW}[wait] Waiting for SSH on localhost:${HOST_SSH_PORT} (max 60s)...${NC}"
|
| 92 |
-
START=$(date +%s)
|
| 93 |
-
SSH_READY=false
|
| 94 |
-
while [ $(($(date +%s) - START)) -lt 60 ]; do
|
| 95 |
-
if ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
|
| 96 |
-
-o ConnectTimeout=3 -o LogLevel=ERROR \
|
| 97 |
-
-p "$HOST_SSH_PORT" "${SSH_USER}@localhost" "echo ok" 2>/dev/null | grep -q "ok"; then
|
| 98 |
-
SSH_READY=true
|
| 99 |
-
break
|
| 100 |
-
fi
|
| 101 |
-
echo " SSH not ready, waiting 3s..."
|
| 102 |
-
sleep 3
|
| 103 |
-
done
|
| 104 |
-
|
| 105 |
-
if [ "$SSH_READY" = false ]; then
|
| 106 |
-
echo -e "${RED}[FAIL] SSH did not become ready within 60s${NC}"
|
| 107 |
-
echo ""
|
| 108 |
-
echo "=== Container logs (last 50 lines) ==="
|
| 109 |
-
docker logs --tail 50 "$CONTAINER_NAME" 2>&1
|
| 110 |
-
exit 1
|
| 111 |
-
fi
|
| 112 |
-
echo -e "${GREEN}[wait] SSH is ready${NC}"
|
| 113 |
-
|
| 114 |
-
# ── Phase 4: Run HTTP tests (ttyd) ──────────────────────────────────
|
| 115 |
-
echo ""
|
| 116 |
-
echo -e "${YELLOW}[test] Phase 4: HTTP basic + stress test on ttyd${NC}"
|
| 117 |
-
python3 scripts/monitor_and_test.py \
|
| 118 |
-
--test \
|
| 119 |
-
--url "http://localhost:${HOST_TTYD_PORT}" \
|
| 120 |
-
--expect "ttyd" --expect "terminal" \
|
| 121 |
-
--stress-n 50
|
| 122 |
-
echo -e "${GREEN}[test] HTTP tests PASSED${NC}"
|
| 123 |
-
|
| 124 |
-
# ── Phase 5: Run SSH + persistence stress tests ─────────────────────
|
| 125 |
-
echo ""
|
| 126 |
-
echo -e "${YELLOW}[test] Phase 5: SSH connect + command + stress + persistence${NC}"
|
| 127 |
-
python3 scripts/monitor_and_test.py \
|
| 128 |
-
--ssh-test \
|
| 129 |
-
--ssh-host localhost \
|
| 130 |
-
--ssh-port "$HOST_SSH_PORT" \
|
| 131 |
-
--ssh-user "$SSH_USER" \
|
| 132 |
-
--ssh-stress-n "$SSH_STRESS_N" \
|
| 133 |
-
--ssh-concurrency "$SSH_CONCURRENCY"
|
| 134 |
-
echo -e "${GREEN}[test] SSH + persistence tests PASSED${NC}"
|
| 135 |
-
|
| 136 |
-
# ── Summary ─────────────────────────────────────────────────────────
|
| 137 |
-
echo ""
|
| 138 |
-
echo "============================================================"
|
| 139 |
-
echo -e "${GREEN} ALL TESTS PASSED${NC}"
|
| 140 |
-
echo ""
|
| 141 |
-
echo " Web terminal: http://localhost:${HOST_TTYD_PORT}/"
|
| 142 |
-
echo " SSH access: ssh -p ${HOST_SSH_PORT} ${SSH_USER}@localhost"
|
| 143 |
-
echo "============================================================"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ubuntu-server/git_sync_daemon.py
DELETED
|
@@ -1,369 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
git_sync_daemon.py — Full disk persistence via HuggingFace Dataset
|
| 4 |
-
|
| 5 |
-
Usage:
|
| 6 |
-
python3 git_sync_daemon.py init # Download dataset → restore filesystem
|
| 7 |
-
python3 git_sync_daemon.py sync-loop # Background sync loop
|
| 8 |
-
python3 git_sync_daemon.py save # One-time save + upload
|
| 9 |
-
|
| 10 |
-
Dataset = disk. The dataset stores the entire filesystem under rootfs/.
|
| 11 |
-
On startup: download dataset → rsync rootfs/ → /
|
| 12 |
-
On sync: rsync / → rootfs/ → upload to dataset
|
| 13 |
-
"""
|
| 14 |
-
|
| 15 |
-
import os
|
| 16 |
-
import sys
|
| 17 |
-
import time
|
| 18 |
-
import subprocess
|
| 19 |
-
import signal
|
| 20 |
-
import shutil
|
| 21 |
-
import threading
|
| 22 |
-
|
| 23 |
-
# ── Config ─────────────────────────────────────────────────────────
|
| 24 |
-
PERSIST_PATH = os.environ.get("PERSIST_PATH", "/data")
|
| 25 |
-
HF_TOKEN = os.environ.get("HF_TOKEN", "")
|
| 26 |
-
HF_DATASET_REPO = os.environ.get("HF_DATASET_REPO", "")
|
| 27 |
-
ROOTFS = os.path.join(PERSIST_PATH, "rootfs")
|
| 28 |
-
|
| 29 |
-
SYNC_INTERVAL = int(os.environ.get("SYNC_INTERVAL", "120"))
|
| 30 |
-
|
| 31 |
-
# SAVE: only persist these directories (user data + config).
|
| 32 |
-
# System dirs (/bin, /lib, /usr/bin, etc.) come from Docker image;
|
| 33 |
-
# restoring them breaks permissions since HF git doesn't preserve mode bits.
|
| 34 |
-
PERSIST_DIRS = [
|
| 35 |
-
"/home", "/root", "/etc", "/opt", "/var",
|
| 36 |
-
"/usr/local",
|
| 37 |
-
]
|
| 38 |
-
|
| 39 |
-
# Paths to never touch (Docker-managed or virtual)
|
| 40 |
-
SAVE_EXCLUDES = [
|
| 41 |
-
"/etc/hostname", "/etc/hosts", "/etc/resolv.conf", "/etc/mtab",
|
| 42 |
-
"/etc/nginx", # comes from Docker image; don't overwrite with stale config
|
| 43 |
-
"/etc/ssh/sshd_config",
|
| 44 |
-
"*.sock", "__pycache__", "*.pyc",
|
| 45 |
-
".cache", # HF API rejects files under .cache/ paths
|
| 46 |
-
]
|
| 47 |
-
|
| 48 |
-
UPLOAD_IGNORE = [
|
| 49 |
-
"__pycache__", "*.pyc",
|
| 50 |
-
".git", ".git*",
|
| 51 |
-
"*.sock", "*.lock",
|
| 52 |
-
".huggingface",
|
| 53 |
-
".cache",
|
| 54 |
-
]
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
LOGFILE = "/var/log/huggingrun.log"
|
| 58 |
-
|
| 59 |
-
def log(msg):
|
| 60 |
-
ts = time.strftime("%H:%M:%S", time.gmtime())
|
| 61 |
-
line = f"[sync {ts}] {msg}"
|
| 62 |
-
print(line, file=sys.stderr, flush=True)
|
| 63 |
-
try:
|
| 64 |
-
with open(LOGFILE, "a") as f:
|
| 65 |
-
f.write(line + "\n")
|
| 66 |
-
except Exception:
|
| 67 |
-
pass
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
def run(cmd, cwd=None):
|
| 71 |
-
"""Run a shell command, return (returncode, stdout). Logs the command."""
|
| 72 |
-
log(f" $ {cmd}")
|
| 73 |
-
t0 = time.time()
|
| 74 |
-
r = subprocess.run(
|
| 75 |
-
cmd, shell=True, cwd=cwd,
|
| 76 |
-
stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True,
|
| 77 |
-
)
|
| 78 |
-
elapsed = time.time() - t0
|
| 79 |
-
if r.returncode != 0:
|
| 80 |
-
log(f" ✗ exit={r.returncode} ({elapsed:.1f}s)")
|
| 81 |
-
if r.stdout.strip():
|
| 82 |
-
for line in r.stdout.strip().split('\n')[:10]:
|
| 83 |
-
log(f" {line}")
|
| 84 |
-
else:
|
| 85 |
-
log(f" ✓ ({elapsed:.1f}s)")
|
| 86 |
-
return r.returncode, r.stdout.strip()
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
def disk_usage(path):
|
| 90 |
-
"""Get disk usage of a path in human-readable form."""
|
| 91 |
-
try:
|
| 92 |
-
total = 0
|
| 93 |
-
count = 0
|
| 94 |
-
for dirpath, dirnames, filenames in os.walk(path):
|
| 95 |
-
count += len(filenames)
|
| 96 |
-
for f in filenames:
|
| 97 |
-
fp = os.path.join(dirpath, f)
|
| 98 |
-
try:
|
| 99 |
-
total += os.path.getsize(fp)
|
| 100 |
-
except OSError:
|
| 101 |
-
pass
|
| 102 |
-
if total > 1024 * 1024 * 1024:
|
| 103 |
-
return f"{total / 1024 / 1024 / 1024:.1f}GB, {count} files"
|
| 104 |
-
return f"{total / 1024 / 1024:.0f}MB, {count} files"
|
| 105 |
-
except Exception:
|
| 106 |
-
return "unknown"
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
def download_dataset():
|
| 110 |
-
"""Download the full dataset to PERSIST_PATH using huggingface_hub."""
|
| 111 |
-
from huggingface_hub import snapshot_download
|
| 112 |
-
|
| 113 |
-
log(f"── DOWNLOAD: {HF_DATASET_REPO} → {PERSIST_PATH}")
|
| 114 |
-
log(f" Token: {HF_TOKEN[:10]}...{HF_TOKEN[-4:]}" if len(HF_TOKEN) > 14 else " Token: (short)")
|
| 115 |
-
t0 = time.time()
|
| 116 |
-
try:
|
| 117 |
-
path = snapshot_download(
|
| 118 |
-
repo_id=HF_DATASET_REPO,
|
| 119 |
-
repo_type="dataset",
|
| 120 |
-
local_dir=PERSIST_PATH,
|
| 121 |
-
token=HF_TOKEN,
|
| 122 |
-
)
|
| 123 |
-
elapsed = time.time() - t0
|
| 124 |
-
log(f" Download OK → {path} ({elapsed:.1f}s)")
|
| 125 |
-
# Show what was downloaded
|
| 126 |
-
if os.path.isdir(PERSIST_PATH):
|
| 127 |
-
items = os.listdir(PERSIST_PATH)
|
| 128 |
-
log(f" /data contents: {items}")
|
| 129 |
-
if os.path.isdir(ROOTFS):
|
| 130 |
-
items = os.listdir(ROOTFS)
|
| 131 |
-
log(f" /data/rootfs dirs: {len(items)} → {items[:10]}")
|
| 132 |
-
log(f" /data/rootfs size: {disk_usage(ROOTFS)}")
|
| 133 |
-
else:
|
| 134 |
-
log(f" /data/rootfs: does not exist")
|
| 135 |
-
except Exception as e:
|
| 136 |
-
elapsed = time.time() - t0
|
| 137 |
-
log(f" ✗ Download failed ({elapsed:.1f}s): {e}")
|
| 138 |
-
os.makedirs(PERSIST_PATH, exist_ok=True)
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
def restore_system():
|
| 142 |
-
"""Restore only user directories from /data/rootfs/ → /."""
|
| 143 |
-
log("── RESTORE: targeted directories only")
|
| 144 |
-
if not os.path.isdir(ROOTFS):
|
| 145 |
-
log(f" ✗ {ROOTFS} does not exist, starting fresh")
|
| 146 |
-
os.makedirs(ROOTFS, exist_ok=True)
|
| 147 |
-
return
|
| 148 |
-
|
| 149 |
-
excludes = " ".join(f"--exclude='{e}'" for e in SAVE_EXCLUDES)
|
| 150 |
-
restored = 0
|
| 151 |
-
# Dirs with executables get F755; others get F644
|
| 152 |
-
exec_dirs = {"/usr/local", "/opt"}
|
| 153 |
-
for d in PERSIST_DIRS:
|
| 154 |
-
src = os.path.join(ROOTFS, d.lstrip("/"))
|
| 155 |
-
if not os.path.isdir(src):
|
| 156 |
-
continue
|
| 157 |
-
os.makedirs(d, exist_ok=True)
|
| 158 |
-
fmode = "F755" if d in exec_dirs else "F644"
|
| 159 |
-
cmd = (f"rsync -rlD --no-specials --no-devices "
|
| 160 |
-
f"--no-perms --chmod=D755,{fmode} "
|
| 161 |
-
f"{excludes} {src}/ {d}/")
|
| 162 |
-
t0 = time.time()
|
| 163 |
-
rc, out = run(cmd)
|
| 164 |
-
elapsed = time.time() - t0
|
| 165 |
-
status = "OK" if rc == 0 else f"rc={rc}"
|
| 166 |
-
log(f" {d}: {status} ({elapsed:.1f}s)")
|
| 167 |
-
restored += 1
|
| 168 |
-
|
| 169 |
-
if restored == 0:
|
| 170 |
-
log(" No directories to restore (fresh start)")
|
| 171 |
-
else:
|
| 172 |
-
log(f" Restored {restored} directories")
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
def restore_packages():
|
| 176 |
-
"""Reinstall user-added packages from saved package list."""
|
| 177 |
-
pkg_list = os.path.join(PERSIST_PATH, "user-packages.list")
|
| 178 |
-
base_list = "/etc/base-packages.list"
|
| 179 |
-
if not os.path.exists(pkg_list):
|
| 180 |
-
log("── PACKAGES: no saved package list, skipping")
|
| 181 |
-
return
|
| 182 |
-
if not os.path.exists(base_list):
|
| 183 |
-
log("── PACKAGES: no base package list, skipping")
|
| 184 |
-
return
|
| 185 |
-
log("── PACKAGES: checking for user-installed packages")
|
| 186 |
-
try:
|
| 187 |
-
with open(base_list) as f:
|
| 188 |
-
base = set(f.read().strip().split("\n"))
|
| 189 |
-
with open(pkg_list) as f:
|
| 190 |
-
saved = set(f.read().strip().split("\n"))
|
| 191 |
-
to_install = sorted(saved - base)
|
| 192 |
-
if not to_install:
|
| 193 |
-
log(" No additional packages to install")
|
| 194 |
-
return
|
| 195 |
-
log(f" {len(to_install)} packages to reinstall: {to_install[:10]}...")
|
| 196 |
-
run(f"apt-get update -qq && apt-get install -y --no-install-recommends {' '.join(to_install)}")
|
| 197 |
-
except Exception as e:
|
| 198 |
-
log(f" ✗ Package restore error: {e}")
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
def save_package_list():
|
| 202 |
-
"""Save list of currently installed packages for restore."""
|
| 203 |
-
pkg_list = os.path.join(PERSIST_PATH, "user-packages.list")
|
| 204 |
-
try:
|
| 205 |
-
rc, out = run("dpkg-query -W -f='${Package}\\n'")
|
| 206 |
-
if rc == 0 and out:
|
| 207 |
-
with open(pkg_list, "w") as f:
|
| 208 |
-
f.write(out + "\n")
|
| 209 |
-
except Exception as e:
|
| 210 |
-
log(f" ✗ Package list save error: {e}")
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
def ensure_passwords():
|
| 214 |
-
"""Re-ensure user/root passwords after restore."""
|
| 215 |
-
log("── PASSWORDS")
|
| 216 |
-
run("id user >/dev/null 2>&1 || useradd -m -s /bin/bash user")
|
| 217 |
-
run('usermod -p "$(openssl passwd -6 huggingrun)" user')
|
| 218 |
-
run('usermod -p "$(openssl passwd -6 huggingrun)" root')
|
| 219 |
-
run("ldconfig")
|
| 220 |
-
log(" Passwords set for user + root")
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
def save_and_upload():
|
| 224 |
-
"""Rsync filesystem → rootfs, upload to dataset."""
|
| 225 |
-
from huggingface_hub import HfApi
|
| 226 |
-
|
| 227 |
-
log("══════════════════════════════════════════════")
|
| 228 |
-
log("── SAVE: targeted directories → /data/rootfs/")
|
| 229 |
-
os.makedirs(ROOTFS, exist_ok=True)
|
| 230 |
-
save_package_list()
|
| 231 |
-
|
| 232 |
-
excludes = " ".join(f"--exclude='{e}'" for e in SAVE_EXCLUDES)
|
| 233 |
-
t0_all = time.time()
|
| 234 |
-
for d in PERSIST_DIRS:
|
| 235 |
-
if not os.path.isdir(d):
|
| 236 |
-
continue
|
| 237 |
-
dst = os.path.join(ROOTFS, d.lstrip("/"))
|
| 238 |
-
os.makedirs(dst, exist_ok=True)
|
| 239 |
-
cmd = (f"rsync -rlD --no-specials --no-devices --delete "
|
| 240 |
-
f"{excludes} {d}/ {dst}/")
|
| 241 |
-
rc, out = run(cmd)
|
| 242 |
-
elapsed = time.time() - t0_all
|
| 243 |
-
log(f" rsync done ({elapsed:.1f}s)")
|
| 244 |
-
log(f" rootfs size: {disk_usage(ROOTFS)}")
|
| 245 |
-
|
| 246 |
-
# Clean up dirs that HF API rejects
|
| 247 |
-
for reject_dir in [".cache"]:
|
| 248 |
-
for dirpath, dirnames, filenames in os.walk(ROOTFS):
|
| 249 |
-
for d in list(dirnames):
|
| 250 |
-
if d == reject_dir:
|
| 251 |
-
full = os.path.join(dirpath, d)
|
| 252 |
-
log(f" Removing {full} (rejected by HF API)")
|
| 253 |
-
shutil.rmtree(full, ignore_errors=True)
|
| 254 |
-
dirnames.remove(d)
|
| 255 |
-
|
| 256 |
-
log("── UPLOAD: /data → dataset")
|
| 257 |
-
api = HfApi(token=HF_TOKEN)
|
| 258 |
-
ts = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())
|
| 259 |
-
log(f" repo={HF_DATASET_REPO}, method=upload_folder")
|
| 260 |
-
log(f" ignore={UPLOAD_IGNORE}")
|
| 261 |
-
|
| 262 |
-
t0 = time.time()
|
| 263 |
-
try:
|
| 264 |
-
api.upload_folder(
|
| 265 |
-
folder_path=PERSIST_PATH,
|
| 266 |
-
repo_id=HF_DATASET_REPO,
|
| 267 |
-
repo_type="dataset",
|
| 268 |
-
path_in_repo="",
|
| 269 |
-
commit_message=f"sync {ts}",
|
| 270 |
-
ignore_patterns=UPLOAD_IGNORE,
|
| 271 |
-
)
|
| 272 |
-
elapsed = time.time() - t0
|
| 273 |
-
log(f" ✓ Upload completed ({elapsed:.1f}s)")
|
| 274 |
-
except Exception as e:
|
| 275 |
-
elapsed = time.time() - t0
|
| 276 |
-
log(f" ✗ Upload failed ({elapsed:.1f}s): {e}")
|
| 277 |
-
|
| 278 |
-
log("══════════════════════════════════════════════")
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
def cmd_init():
|
| 282 |
-
"""Run once at startup: download dataset + restore filesystem."""
|
| 283 |
-
log("╔══════════════════════════════════════════════╗")
|
| 284 |
-
log("║ INIT — Full Disk Persistence ║")
|
| 285 |
-
log("╚══════════════════════════════════════════════╝")
|
| 286 |
-
log(f" HF_TOKEN: {'set (' + HF_TOKEN[:10] + '...)' if HF_TOKEN else 'NOT SET'}")
|
| 287 |
-
log(f" HF_DATASET_REPO: {HF_DATASET_REPO or 'NOT SET'}")
|
| 288 |
-
log(f" PERSIST_PATH: {PERSIST_PATH}")
|
| 289 |
-
log(f" ROOTFS: {ROOTFS}")
|
| 290 |
-
log(f" SYNC_INTERVAL: {SYNC_INTERVAL}s")
|
| 291 |
-
log(f" PERSIST_DIRS: {PERSIST_DIRS}")
|
| 292 |
-
|
| 293 |
-
if not HF_TOKEN or not HF_DATASET_REPO:
|
| 294 |
-
log(" ✗ No HF_TOKEN or HF_DATASET_REPO — persistence DISABLED")
|
| 295 |
-
os.makedirs(PERSIST_PATH, exist_ok=True)
|
| 296 |
-
return
|
| 297 |
-
|
| 298 |
-
t0 = time.time()
|
| 299 |
-
download_dataset()
|
| 300 |
-
restore_system()
|
| 301 |
-
restore_packages()
|
| 302 |
-
ensure_passwords()
|
| 303 |
-
elapsed = time.time() - t0
|
| 304 |
-
log(f" Init complete ({elapsed:.1f}s)")
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
def cmd_sync_loop():
|
| 308 |
-
"""Background daemon: periodic save + upload."""
|
| 309 |
-
log("┌──────────────────────────────────────────────┐")
|
| 310 |
-
log("│ SYNC-LOOP daemon started │")
|
| 311 |
-
log("└──────────────────────────────────────────────┘")
|
| 312 |
-
log(f" HF_TOKEN: {'set' if HF_TOKEN else 'NOT SET'}")
|
| 313 |
-
log(f" HF_DATASET_REPO: {HF_DATASET_REPO or 'NOT SET'}")
|
| 314 |
-
log(f" SYNC_INTERVAL: {SYNC_INTERVAL}s")
|
| 315 |
-
|
| 316 |
-
if not HF_TOKEN or not HF_DATASET_REPO:
|
| 317 |
-
log(" ✗ No credentials — sync-loop DISABLED, sleeping forever")
|
| 318 |
-
signal.pause()
|
| 319 |
-
return
|
| 320 |
-
|
| 321 |
-
def on_signal(sig, frame):
|
| 322 |
-
log(f" Signal {sig} received, final save ...")
|
| 323 |
-
try:
|
| 324 |
-
save_and_upload()
|
| 325 |
-
except Exception as e:
|
| 326 |
-
log(f" Final save error: {e}")
|
| 327 |
-
sys.exit(0)
|
| 328 |
-
|
| 329 |
-
signal.signal(signal.SIGTERM, on_signal)
|
| 330 |
-
signal.signal(signal.SIGINT, on_signal)
|
| 331 |
-
|
| 332 |
-
log(f" Waiting {60}s before first sync (let services start) ...")
|
| 333 |
-
time.sleep(60)
|
| 334 |
-
|
| 335 |
-
cycle = 0
|
| 336 |
-
while True:
|
| 337 |
-
cycle += 1
|
| 338 |
-
log(f"── Sync cycle #{cycle}")
|
| 339 |
-
try:
|
| 340 |
-
save_and_upload()
|
| 341 |
-
except Exception as e:
|
| 342 |
-
log(f" ✗ Sync cycle #{cycle} error: {e}")
|
| 343 |
-
import traceback
|
| 344 |
-
traceback.print_exc(file=sys.stderr)
|
| 345 |
-
log(f" Sleeping {SYNC_INTERVAL}s until next cycle ...")
|
| 346 |
-
time.sleep(SYNC_INTERVAL)
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
def cmd_save():
|
| 350 |
-
"""One-time save + upload."""
|
| 351 |
-
log("── ONE-TIME SAVE")
|
| 352 |
-
if not HF_TOKEN or not HF_DATASET_REPO:
|
| 353 |
-
log(" ✗ No credentials, cannot save")
|
| 354 |
-
return
|
| 355 |
-
save_and_upload()
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
if __name__ == "__main__":
|
| 359 |
-
cmd = sys.argv[1] if len(sys.argv) > 1 else "init"
|
| 360 |
-
log(f"Command: {cmd} (pid={os.getpid()})")
|
| 361 |
-
if cmd == "init":
|
| 362 |
-
cmd_init()
|
| 363 |
-
elif cmd == "sync-loop":
|
| 364 |
-
cmd_sync_loop()
|
| 365 |
-
elif cmd == "save":
|
| 366 |
-
cmd_save()
|
| 367 |
-
else:
|
| 368 |
-
print(f"Usage: {sys.argv[0]} [init|sync-loop|save]")
|
| 369 |
-
sys.exit(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ubuntu-server/log_streamer.py
DELETED
|
@@ -1,74 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
SSE log streamer — serves /var/log/huggingrun.log as Server-Sent Events.
|
| 4 |
-
Mimics HF's runtime log API format so you can use the same curl -N pattern.
|
| 5 |
-
|
| 6 |
-
Usage:
|
| 7 |
-
curl -N https://tao-shen-huggingrun.hf.space/logs/stream
|
| 8 |
-
"""
|
| 9 |
-
import http.server
|
| 10 |
-
import time
|
| 11 |
-
import os
|
| 12 |
-
import json
|
| 13 |
-
from datetime import datetime, timezone
|
| 14 |
-
|
| 15 |
-
LOGFILE = "/var/log/huggingrun.log"
|
| 16 |
-
PORT = 7863
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
class SSEHandler(http.server.BaseHTTPRequestHandler):
|
| 20 |
-
def log_message(self, format, *args):
|
| 21 |
-
pass # suppress access logs
|
| 22 |
-
|
| 23 |
-
def do_GET(self):
|
| 24 |
-
if self.path == "/stream":
|
| 25 |
-
self.send_response(200)
|
| 26 |
-
self.send_header("Content-Type", "text/event-stream")
|
| 27 |
-
self.send_header("Cache-Control", "no-cache")
|
| 28 |
-
self.send_header("Connection", "keep-alive")
|
| 29 |
-
self.send_header("Access-Control-Allow-Origin", "*")
|
| 30 |
-
self.end_headers()
|
| 31 |
-
|
| 32 |
-
try:
|
| 33 |
-
# Send existing log content first (history)
|
| 34 |
-
if os.path.exists(LOGFILE):
|
| 35 |
-
with open(LOGFILE, "r") as f:
|
| 36 |
-
for line in f:
|
| 37 |
-
line = line.rstrip("\n")
|
| 38 |
-
if line:
|
| 39 |
-
event = json.dumps({
|
| 40 |
-
"data": line + "\n",
|
| 41 |
-
"timestamp": datetime.now(timezone.utc).isoformat()
|
| 42 |
-
})
|
| 43 |
-
self.wfile.write(f"data: {event}\n\n".encode())
|
| 44 |
-
self.wfile.flush()
|
| 45 |
-
|
| 46 |
-
# Then tail for new lines
|
| 47 |
-
with open(LOGFILE, "r") as f:
|
| 48 |
-
f.seek(0, 2) # end of file
|
| 49 |
-
while True:
|
| 50 |
-
line = f.readline()
|
| 51 |
-
if line:
|
| 52 |
-
line = line.rstrip("\n")
|
| 53 |
-
if line:
|
| 54 |
-
event = json.dumps({
|
| 55 |
-
"data": line + "\n",
|
| 56 |
-
"timestamp": datetime.now(timezone.utc).isoformat()
|
| 57 |
-
})
|
| 58 |
-
self.wfile.write(f"data: {event}\n\n".encode())
|
| 59 |
-
self.wfile.flush()
|
| 60 |
-
else:
|
| 61 |
-
# Send keep-alive comment every 15s
|
| 62 |
-
self.wfile.write(b": keep-alive\n\n")
|
| 63 |
-
self.wfile.flush()
|
| 64 |
-
time.sleep(1)
|
| 65 |
-
except (BrokenPipeError, ConnectionResetError):
|
| 66 |
-
pass
|
| 67 |
-
else:
|
| 68 |
-
self.send_response(404)
|
| 69 |
-
self.end_headers()
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
if __name__ == "__main__":
|
| 73 |
-
server = http.server.HTTPServer(("127.0.0.1", PORT), SSEHandler)
|
| 74 |
-
server.serve_forever()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ubuntu-server/start-server.sh
DELETED
|
@@ -1,114 +0,0 @@
|
|
| 1 |
-
#!/bin/bash
|
| 2 |
-
# ─────────────────────────────────────────────────────────────────────
|
| 3 |
-
# HuggingRun Ubuntu Server: ttyd + SSH-over-WebSocket + nginx
|
| 4 |
-
# Port 7860 (nginx): web terminal + SSH
|
| 5 |
-
# ─────────────────────────────────────────────────────────────────────
|
| 6 |
-
|
| 7 |
-
LOGFILE="/var/log/huggingrun.log"
|
| 8 |
-
> "$LOGFILE" # truncate on start
|
| 9 |
-
|
| 10 |
-
export SSH_PORT="${SSH_PORT:-2222}"
|
| 11 |
-
export TTYD_PORT="${TTYD_PORT:-7681}"
|
| 12 |
-
|
| 13 |
-
# Log to stderr (HF captures stderr) + logfile (for /runlog endpoint)
|
| 14 |
-
log() {
|
| 15 |
-
echo "$*" >&2
|
| 16 |
-
echo "$*" >> "$LOGFILE"
|
| 17 |
-
}
|
| 18 |
-
|
| 19 |
-
# ── Boot info ─────────────────────────────────────────────────────
|
| 20 |
-
log "========================================"
|
| 21 |
-
log "[ubuntu] HuggingRun Ubuntu Server"
|
| 22 |
-
log "[ubuntu] $(date -u)"
|
| 23 |
-
log "[ubuntu] Kernel: $(uname -r)"
|
| 24 |
-
log "[ubuntu] Arch: $(uname -m)"
|
| 25 |
-
log "[ubuntu] Hostname: $(hostname)"
|
| 26 |
-
log "[ubuntu] CPU: $(nproc) cores"
|
| 27 |
-
log "[ubuntu] Memory: $(free -h 2>/dev/null | grep Mem: | tr -s ' ' | cut -d' ' -f2 || echo 'unknown')"
|
| 28 |
-
log "[ubuntu] Disk: $(df -h / 2>/dev/null | tail -1 | tr -s ' ' | cut -d' ' -f2,4 | sed 's/ / total, /;s/$/ free/' || echo 'unknown')"
|
| 29 |
-
log "[ubuntu] User: $(whoami) (uid=$(id -u))"
|
| 30 |
-
log "========================================"
|
| 31 |
-
|
| 32 |
-
# ── Network info ──────────────────────────────────────────────────
|
| 33 |
-
log "[ubuntu] Network interfaces:"
|
| 34 |
-
ip -4 addr show 2>/dev/null | grep -E 'inet |^[0-9]' | while IFS= read -r line; do log " $line"; done
|
| 35 |
-
|
| 36 |
-
# ── sshd ──────────────────────────────────────────────────────────
|
| 37 |
-
mkdir -p /run/sshd
|
| 38 |
-
log "[ubuntu] Starting sshd on 127.0.0.1:${SSH_PORT} ..."
|
| 39 |
-
/usr/sbin/sshd -o "Port=$SSH_PORT" \
|
| 40 |
-
-o "ListenAddress=127.0.0.1" \
|
| 41 |
-
-o "PermitRootLogin=yes" \
|
| 42 |
-
-o "PasswordAuthentication=yes" \
|
| 43 |
-
-o "PermitEmptyPasswords=no" \
|
| 44 |
-
-o "UsePAM=yes" \
|
| 45 |
-
-D -e &
|
| 46 |
-
SSHD_PID=$!
|
| 47 |
-
sleep 1
|
| 48 |
-
if kill -0 $SSHD_PID 2>/dev/null; then
|
| 49 |
-
log "[ubuntu] [ OK ] sshd started (PID=${SSHD_PID})"
|
| 50 |
-
else
|
| 51 |
-
log "[ubuntu] [FAILED] sshd failed to start"
|
| 52 |
-
fi
|
| 53 |
-
|
| 54 |
-
# ── SSE log streamer ──────────────────────────────────────────────
|
| 55 |
-
log "[ubuntu] Starting SSE log streamer on 127.0.0.1:7863 ..."
|
| 56 |
-
python3 /scripts/log_streamer.py &
|
| 57 |
-
STREAMER_PID=$!
|
| 58 |
-
sleep 1
|
| 59 |
-
if kill -0 $STREAMER_PID 2>/dev/null; then
|
| 60 |
-
log "[ubuntu] [ OK ] SSE log streamer started (PID=${STREAMER_PID})"
|
| 61 |
-
else
|
| 62 |
-
log "[ubuntu] [FAILED] SSE log streamer failed to start"
|
| 63 |
-
fi
|
| 64 |
-
|
| 65 |
-
# ── WebSocket-to-SSH bridge ───────────────────────────────────────
|
| 66 |
-
log "[ubuntu] Starting WS-SSH bridge on 127.0.0.1:7862 ..."
|
| 67 |
-
python3 /scripts/ws-ssh-bridge.py &
|
| 68 |
-
BRIDGE_PID=$!
|
| 69 |
-
sleep 1
|
| 70 |
-
if kill -0 $BRIDGE_PID 2>/dev/null; then
|
| 71 |
-
log "[ubuntu] [ OK ] WS-SSH bridge started (PID=${BRIDGE_PID})"
|
| 72 |
-
else
|
| 73 |
-
log "[ubuntu] [FAILED] WS-SSH bridge failed to start"
|
| 74 |
-
fi
|
| 75 |
-
|
| 76 |
-
# ── ttyd (web terminal) ──────────────────────────────────────────
|
| 77 |
-
log "[ubuntu] Starting ttyd on 127.0.0.1:${TTYD_PORT} ..."
|
| 78 |
-
ttyd --port "$TTYD_PORT" --writable --base-path / bash --login &
|
| 79 |
-
TTYD_PID=$!
|
| 80 |
-
sleep 1
|
| 81 |
-
if kill -0 $TTYD_PID 2>/dev/null; then
|
| 82 |
-
log "[ubuntu] [ OK ] ttyd started (PID=${TTYD_PID})"
|
| 83 |
-
else
|
| 84 |
-
log "[ubuntu] [FAILED] ttyd failed to start"
|
| 85 |
-
fi
|
| 86 |
-
|
| 87 |
-
# ── Process summary ───────────────────────────────────────────────
|
| 88 |
-
log "========================================"
|
| 89 |
-
log "[ubuntu] Services:"
|
| 90 |
-
log "[ubuntu] sshd PID=${SSHD_PID} 127.0.0.1:${SSH_PORT}"
|
| 91 |
-
log "[ubuntu] log-streamer PID=${STREAMER_PID} 127.0.0.1:7863"
|
| 92 |
-
log "[ubuntu] ws-ssh-bridge PID=${BRIDGE_PID} 127.0.0.1:7862"
|
| 93 |
-
log "[ubuntu] ttyd PID=${TTYD_PID} 127.0.0.1:${TTYD_PORT}"
|
| 94 |
-
log "========================================"
|
| 95 |
-
log "[ubuntu] Base packages: $(wc -l < /etc/base-packages.list 2>/dev/null || echo '?')"
|
| 96 |
-
log "[ubuntu] Current packages: $(dpkg-query -W -f='\n' 2>/dev/null | wc -l)"
|
| 97 |
-
|
| 98 |
-
log "[ubuntu] All processes:"
|
| 99 |
-
ps aux --no-headers 2>/dev/null | while read user pid rest; do log "[ubuntu] ${user} PID=${pid}"; done
|
| 100 |
-
|
| 101 |
-
log "[ubuntu] ══ System ready ══"
|
| 102 |
-
log "[ubuntu] View logs: curl https://<space>.hf.space/runlog"
|
| 103 |
-
log "[ubuntu] Stream SSE: curl -N https://<space>.hf.space/runlog/stream"
|
| 104 |
-
|
| 105 |
-
# ── Heartbeat ─────────────────────────────────────────────────────
|
| 106 |
-
(while true; do
|
| 107 |
-
sleep 60
|
| 108 |
-
log "[ubuntu] heartbeat: $(date -u) | load=$(cat /proc/loadavg 2>/dev/null | cut -d' ' -f1-3) | mem=$(free -h 2>/dev/null | grep Mem: | tr -s ' ' | cut -d' ' -f3,2 | sed 's/ /\//' || echo '?')"
|
| 109 |
-
done) &
|
| 110 |
-
|
| 111 |
-
# ── Start nginx LAST → opens port 7860 → HF transitions to RUNNING ──
|
| 112 |
-
log "[ubuntu] nginx config: /scripts/nginx.conf"
|
| 113 |
-
log "[ubuntu] Starting nginx on 0.0.0.0:7860 (HF will mark RUNNING) ..."
|
| 114 |
-
exec nginx -c /scripts/nginx.conf -g 'daemon off;'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ubuntu-server/upload_sync.py
DELETED
|
@@ -1,61 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""Upload /data to HF dataset. Called by start-server.sh save_system()."""
|
| 3 |
-
import os
|
| 4 |
-
import sys
|
| 5 |
-
from datetime import datetime
|
| 6 |
-
|
| 7 |
-
def main():
|
| 8 |
-
token = os.environ.get("HF_TOKEN", "")
|
| 9 |
-
repo = os.environ.get("HF_DATASET_REPO", "")
|
| 10 |
-
persist_path = os.environ.get("PERSIST_PATH", "/data")
|
| 11 |
-
|
| 12 |
-
if not token or not repo:
|
| 13 |
-
print("[persist] No HF_TOKEN or repo, skip upload", file=sys.stderr)
|
| 14 |
-
return
|
| 15 |
-
|
| 16 |
-
from huggingface_hub import HfApi
|
| 17 |
-
|
| 18 |
-
api = HfApi(token=token)
|
| 19 |
-
ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
| 20 |
-
|
| 21 |
-
print(f"[persist] Uploading {persist_path} to {repo} ...", file=sys.stderr)
|
| 22 |
-
|
| 23 |
-
try:
|
| 24 |
-
# Use upload_large_folder for reliability with large data
|
| 25 |
-
api.upload_large_folder(
|
| 26 |
-
folder_path=persist_path,
|
| 27 |
-
repo_id=repo,
|
| 28 |
-
repo_type="dataset",
|
| 29 |
-
commit_message=f"sync {ts}",
|
| 30 |
-
ignore_patterns=[
|
| 31 |
-
"__pycache__", "*.pyc",
|
| 32 |
-
".git", ".git*",
|
| 33 |
-
"*.lock", "*.pid", "*.sock",
|
| 34 |
-
".huggingface",
|
| 35 |
-
],
|
| 36 |
-
)
|
| 37 |
-
print("[persist] Upload completed", file=sys.stderr)
|
| 38 |
-
except Exception as e:
|
| 39 |
-
print(f"[persist] upload_large_folder failed: {e}", file=sys.stderr)
|
| 40 |
-
# Fallback to regular upload_folder
|
| 41 |
-
try:
|
| 42 |
-
api.upload_folder(
|
| 43 |
-
folder_path=persist_path,
|
| 44 |
-
repo_id=repo,
|
| 45 |
-
repo_type="dataset",
|
| 46 |
-
path_in_repo="",
|
| 47 |
-
commit_message=f"sync {ts}",
|
| 48 |
-
ignore_patterns=[
|
| 49 |
-
"__pycache__", "*.pyc",
|
| 50 |
-
".git", ".git*",
|
| 51 |
-
"*.lock", "*.pid", "*.sock",
|
| 52 |
-
".huggingface",
|
| 53 |
-
],
|
| 54 |
-
)
|
| 55 |
-
print("[persist] Upload completed (fallback)", file=sys.stderr)
|
| 56 |
-
except Exception as e2:
|
| 57 |
-
print(f"[persist] Upload failed: {e2}", file=sys.stderr)
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
if __name__ == "__main__":
|
| 61 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ubuntu-server/ws-ssh-bridge.py → ws_ssh_bridge.py
RENAMED
|
@@ -73,7 +73,7 @@ async def bridge(websocket):
|
|
| 73 |
|
| 74 |
|
| 75 |
async def main():
|
| 76 |
-
print(f"[ws-ssh-bridge] Listening on 127.0.0.1:{WS_PORT}
|
| 77 |
file=sys.stderr)
|
| 78 |
async with serve(bridge, "127.0.0.1", WS_PORT,
|
| 79 |
ping_interval=30, ping_timeout=120,
|
|
|
|
| 73 |
|
| 74 |
|
| 75 |
async def main():
|
| 76 |
+
print(f"[ws-ssh-bridge] Listening on 127.0.0.1:{WS_PORT} -> sshd {SSH_HOST}:{SSH_PORT}",
|
| 77 |
file=sys.stderr)
|
| 78 |
async with serve(bridge, "127.0.0.1", WS_PORT,
|
| 79 |
ping_interval=30, ping_timeout=120,
|