#!/bin/bash
# Hermes Bot — HuggingFace Space Startup
# BUGFIX: removed GitHub web-UI paste residue that preceded the shebang
# ("hermes-bot / start.sh", author line, commit subject "fix: webui auto-update
# silent failure + bump to v0.5.9", hash e10e850). Those lines executed as
# commands (four "command not found" errors at boot) and kept '#!/bin/bash'
# off line 1, so the shebang was never honored.
# NOTE: No 'set -e' — gateway restarts should not kill the entire script
echo "=== Hermes Bot — HuggingFace Space Startup ==="
# Ensure system timezone matches config (logging timestamps use system TZ)
export TZ="${TZ:-Asia/Shanghai}"
# Ensure persistent storage directories exist
mkdir -p /data/hermes/{sessions,memories,uploads,logs,palace,skills,weixin}
# Create symlinks from hermes home to persistent storage
HERMES_HOME="/root/.hermes"
for dir in sessions memories uploads logs palace skills; do
  target="$HERMES_HOME/$dir"
  if [ ! -L "$target" ] && [ ! -d "$target" ]; then
    ln -sf "/data/hermes/$dir" "$target"
    echo "Created symlink: $dir -> /data/hermes/$dir"
  elif [ -L "$target" ]; then
    echo "Symlink exists: $dir"
  fi
done
# Persist WeChat/Weixin session data across container rebuilds
# Weixin adapter stores auth tokens, context tokens, and sync cursors in ~/.hermes/weixin/
# Without this, WeChat binding breaks on every container rebuild
WEIXIN_DIR="$HERMES_HOME/weixin"
if [[ -d "$WEIXIN_DIR" && ! -L "$WEIXIN_DIR" ]]; then
  # Migrate existing session data to persistent storage
  # (-n: never clobber files already migrated on a previous boot)
  cp -rn "$WEIXIN_DIR"/* /data/hermes/weixin/ 2>/dev/null
  rm -rf "$WEIXIN_DIR"
fi
if [[ ! -L "$WEIXIN_DIR" ]]; then
  ln -sf "/data/hermes/weixin" "$WEIXIN_DIR"
  echo "Symlink: weixin -> /data/hermes/weixin"
fi
# ── WeChat credential persistence ──
# Priority: HF Space Secrets > persisted account JSON file > .env file
# Once set via HF Space Secrets, WeChat survives ALL container rebuilds.
ACCOUNTS_DIR="/data/hermes/weixin/accounts"
mkdir -p "$ACCOUNTS_DIR"
if [ -z "$WEIXIN_ACCOUNT_ID" ] || [ -z "$WEIXIN_TOKEN" ]; then
  # Fallback: restore from persisted account JSON file
  if [ -z "$WEIXIN_ACCOUNT_ID" ] && [ -d "$ACCOUNTS_DIR" ]; then
    # Most recently modified non-auxiliary account file (GNU find -printf).
    LATEST=$(find "$ACCOUNTS_DIR" -name "*.json" ! -name "*.context-tokens.json" ! -name "*.sync.json" -type f -printf '%T@ %p\n' 2>/dev/null | sort -rn | head -1 | awk '{print $2}')
    if [ -n "$LATEST" ]; then
      DISCOVERED_ID=$(basename "$LATEST" .json)
      export WEIXIN_ACCOUNT_ID="$DISCOVERED_ID"
      echo "Auto-discovered WEIXIN_ACCOUNT_ID=$DISCOVERED_ID"
    fi
  fi
  if [ -z "$WEIXIN_TOKEN" ] && [ -n "$WEIXIN_ACCOUNT_ID" ]; then
    ACCOUNT_FILE="$ACCOUNTS_DIR/${WEIXIN_ACCOUNT_ID}.json"
    if [ -f "$ACCOUNT_FILE" ]; then
      # Pass the path via the environment so quotes/odd characters in it can
      # never break the embedded Python (the old code interpolated the path
      # straight into the Python source).
      DISCOVERED_TOKEN=$(ACCOUNT_FILE="$ACCOUNT_FILE" python3 -c "import json, os; d=json.load(open(os.environ['ACCOUNT_FILE'])); print(d.get('token',''))" 2>/dev/null)
      if [ -n "$DISCOVERED_TOKEN" ]; then
        export WEIXIN_TOKEN="$DISCOVERED_TOKEN"
        echo "Restored WEIXIN_TOKEN from persisted account file"
      fi
    fi
  fi
fi
if [ -n "$WEIXIN_ACCOUNT_ID" ] && [ -n "$WEIXIN_TOKEN" ]; then
  # BUGFIX: _mask_val is defined further down in this script, so calling it
  # here failed with "command not found" and logged "account=" with an empty
  # value. Mask inline instead (first 6 + **** + last 4; **** when short).
  if [ "${#WEIXIN_ACCOUNT_ID}" -lt 12 ]; then
    MASKED_ACCOUNT="****"
  else
    MASKED_ACCOUNT="${WEIXIN_ACCOUNT_ID:0:6}****${WEIXIN_ACCOUNT_ID: -4}"
  fi
  echo "WeChat credentials ready (account=$MASKED_ACCOUNT)"
  # Persist credentials to account JSON so gateway's load_weixin_account() also finds them
  ACCOUNT_FILE="$ACCOUNTS_DIR/${WEIXIN_ACCOUNT_ID}.json"
  if [ ! -f "$ACCOUNT_FILE" ] || ! ACCOUNT_FILE="$ACCOUNT_FILE" python3 -c "import json, os; d=json.load(open(os.environ['ACCOUNT_FILE'])); exit(0 if d.get('token') else 1)" 2>/dev/null; then
    # Token and path travel through the environment — never interpolated into
    # Python source, so special characters in the token are safe.
    ACCOUNT_FILE="$ACCOUNT_FILE" WEIXIN_TOKEN="$WEIXIN_TOKEN" python3 -c "
import json, os, time
payload = {'token': os.environ['WEIXIN_TOKEN'], 'base_url': 'https://ilinkai.weixin.qq.com', 'saved_at': time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())}
with open(os.environ['ACCOUNT_FILE'], 'w') as f: json.dump(payload, f)
" 2>/dev/null && chmod 600 "$ACCOUNT_FILE"
    echo "WeChat credentials persisted to account file"
  fi
else
  echo "WARNING: WeChat not configured (no token/account). Run 'hermes gateway setup' to scan QR."
fi
# -- Persist .env across container rebuilds --
# Priority: Space Secrets (env vars) > persistent storage
# SECURITY: .env is NO LONGER in git repo -- use HF Space Secrets
# ENV_FILE is the path the agent reads (~/.hermes/.env); ENV_DATA is the
# backing copy on the persistent /data volume. Later in this script ENV_FILE
# is symlinked onto ENV_DATA so the contents survive container rebuilds.
ENV_FILE="$HERMES_HOME/.env"
ENV_DATA="/data/hermes/.env"
# Helper: mask a secret value for safe logging (show first 6 + **** + last 4)
_mask_val() {
  local secret="$1"
  # Empty or too short to mask meaningfully: hide the whole value.
  if [[ -z "$secret" || ${#secret} -lt 12 ]]; then
    echo "****"
    return
  fi
  echo "${secret:0:6}****${secret: -4}"
}
# Generate .env from Space Secrets (environment variables injected by HF)
# SECURITY: secrets are written to file ONLY — never echoed to stdout/build logs
if [[ ! -f "$ENV_DATA" && -n "$OPENROUTER_API_KEY" ]]; then
  echo "Generating .env from Space Secrets..."
  {
    echo "OPENROUTER_API_KEY=$OPENROUTER_API_KEY"
    if [ -n "$OPENAI_API_KEY" ]; then echo "OPENAI_API_KEY=$OPENAI_API_KEY"; fi
    if [ -n "$OPENAI_BASE_URL" ]; then echo "OPENAI_BASE_URL=$OPENAI_BASE_URL"; fi
    if [ -n "$FEISHU_APP_ID" ]; then echo "FEISHU_APP_ID=$FEISHU_APP_ID"; fi
    if [ -n "$FEISHU_APP_SECRET" ]; then echo "FEISHU_APP_SECRET=$FEISHU_APP_SECRET"; fi
    echo "GATEWAY_ALLOW_ALL_USERS=true"
    echo "HERMES_ACCEPT_HOOKS=1"
    if [ -n "$MEMPALACE_PALACE_PATH" ]; then echo "MEMPALACE_PALACE_PATH=$MEMPALACE_PALACE_PATH"; fi
    if [ -n "$FIRECRAWL_API_KEY" ]; then echo "FIRECRAWL_API_KEY=$FIRECRAWL_API_KEY"; fi
    if [ -n "$WEIXIN_ACCOUNT_ID" ]; then echo "WEIXIN_ACCOUNT_ID=$WEIXIN_ACCOUNT_ID"; fi
    if [ -n "$WEIXIN_TOKEN" ]; then echo "WEIXIN_TOKEN=$WEIXIN_TOKEN"; fi
  } > "$ENV_DATA"
  chmod 600 "$ENV_DATA"
  # Log only masked values (app_id is a public identifier, shown as-is)
  echo "Created .env from Space Secrets (keys masked below)"
  echo "  OPENROUTER_API_KEY=$(_mask_val "$OPENROUTER_API_KEY")"
  if [ -n "$OPENAI_API_KEY" ]; then echo "  OPENAI_API_KEY=$(_mask_val "$OPENAI_API_KEY")"; fi
  if [ -n "$FEISHU_APP_ID" ]; then echo "  FEISHU_APP_ID=$FEISHU_APP_ID"; fi
  if [ -n "$FEISHU_APP_SECRET" ]; then echo "  FEISHU_APP_SECRET=$(_mask_val "$FEISHU_APP_SECRET")"; fi
  if [ -n "$FIRECRAWL_API_KEY" ]; then echo "  FIRECRAWL_API_KEY=$(_mask_val "$FIRECRAWL_API_KEY")"; fi
  if [ -n "$WEIXIN_TOKEN" ]; then echo "  WEIXIN_TOKEN=$(_mask_val "$WEIXIN_TOKEN")"; fi
fi
# Fallback: if no secrets and no persistent data
if [[ ! -f "$ENV_DATA" && -f "/app/.env.example" ]]; then
  cp "/app/.env.example" "$ENV_DATA"
  echo "WARNING: No .env found. Set API keys via HF Space Secrets!"
fi
# Always symlink
if [[ -L "$ENV_FILE" ]]; then
  echo "Symlink exists: .env"
else
  rm -f "$ENV_FILE"
  ln -sf "$ENV_DATA" "$ENV_FILE"
  echo "Symlink: .env -> $ENV_DATA"
fi
# Ensure WEIXIN_TOKEN/ACCOUNT_ID are in .env even if file was created earlier without them
if [[ -f "$ENV_DATA" && -n "$WEIXIN_TOKEN" ]] && ! grep -q '^WEIXIN_TOKEN=' "$ENV_DATA" 2>/dev/null; then
  echo "WEIXIN_TOKEN=$WEIXIN_TOKEN" >> "$ENV_DATA"
fi
if [[ -f "$ENV_DATA" && -n "$WEIXIN_ACCOUNT_ID" ]] && ! grep -q '^WEIXIN_ACCOUNT_ID=' "$ENV_DATA" 2>/dev/null; then
  echo "WEIXIN_ACCOUNT_ID=$WEIXIN_ACCOUNT_ID" >> "$ENV_DATA"
fi
# ── Persist config.yaml across container rebuilds ──
# WebUI settings page and WeChat save flow update ~/.hermes/config.yaml at runtime
CFG_FILE="$HERMES_HOME/config.yaml"
CFG_DATA="/data/hermes/config.yaml"
if [[ ! -f "$CFG_DATA" ]]; then
  if [[ -f "$CFG_FILE" && ! -L "$CFG_FILE" ]]; then
    # First time: migrate build-time config to persistent storage
    cp "$CFG_FILE" "$CFG_DATA"
    echo "Migrated config.yaml to persistent storage"
  elif [[ -L "$CFG_FILE" && -f "/app/config.yaml" ]]; then
    # Symlink exists but target missing — recreate from repo copy
    cp "/app/config.yaml" "$CFG_DATA"
    echo "Restored config.yaml from repo fallback"
  fi
fi
if [[ -L "$CFG_FILE" ]]; then
  echo "Symlink exists: config.yaml"
else
  rm -f "$CFG_FILE"
  ln -sf "$CFG_DATA" "$CFG_FILE"
  echo "Symlink: config.yaml -> $CFG_DATA"
fi
echo "Persistent storage ready."
# ── Persist WebUI credentials across rebuilds ──
WEBUI_HOME="/root/.hermes-web-ui"
WEBUI_DATA="/data/hermes/webui"
mkdir -p "$WEBUI_DATA"
if [[ -d "$WEBUI_HOME" && ! -L "$WEBUI_HOME" ]]; then
  # Migrate existing credentials to persistent storage
  # (skip if a persisted copy already exists — never clobber it)
  if [[ -f "$WEBUI_HOME/.credentials" && ! -f "$WEBUI_DATA/.credentials" ]]; then
    cp "$WEBUI_HOME/.credentials" "$WEBUI_DATA/.credentials"
    echo "Migrated WebUI credentials to persistent storage"
  fi
  rm -rf "$WEBUI_HOME"
fi
if [[ ! -L "$WEBUI_HOME" ]]; then
  ln -sf "$WEBUI_DATA" "$WEBUI_HOME"
  echo "Symlink: hermes-web-ui -> $WEBUI_DATA"
fi
# ── Persist agency-agents across container rebuilds ──
# 211 expert role prompts for instant role switching
AGENCY_SRC="/app/agency-agents"
AGENCY_DST="/data/hermes/agency-agents"
AGENCY_LINK="$HERMES_HOME/agency-agents"
# Seed the persistent copy from the build-time repo on first run
if [ -d "$AGENCY_SRC" ] && [ ! -d "$AGENCY_DST" ]; then
  cp -r "$AGENCY_SRC" "$AGENCY_DST"
  echo "Copied agency-agents to persistent storage"
fi
# Merge custom agents (Hermes extensions) into agency-agents directory
# (-n: never overwrite an upstream role with a custom agent of the same name)
if [ -d "/app/custom-agents" ] && [ -d "$AGENCY_DST" ]; then
  cp -rn /app/custom-agents/* "$AGENCY_DST/" 2>/dev/null
  echo "Merged custom agents into agency-agents"
fi
if [ ! -L "$AGENCY_LINK" ]; then
  # BUGFIX: only remove the existing path once we know the persistent copy
  # exists — the old code ran 'rm -rf' unconditionally, which could destroy a
  # real agency-agents directory and then create nothing in its place when
  # $AGENCY_DST was missing.
  if [ -d "$AGENCY_DST" ]; then
    rm -rf "$AGENCY_LINK"
    ln -sf "$AGENCY_DST" "$AGENCY_LINK"
    echo "Symlink: agency-agents -> $AGENCY_DST"
  fi
fi
# Generate agent index JSON for fast role lookup
# Walks the persistent agency-agents tree, pulls name/description from each
# role's YAML front-matter, and writes .agent-index.json next to the roles.
if [ -d "$AGENCY_DST" ] && command -v python3 &>/dev/null; then
python3 << 'AGENT_INDEX'
import os, json
# BUGFIX: PyYAML is third-party; if it is missing this whole indexing step
# used to die on 'import yaml'. Make it optional and fall back to
# filename-derived names. (Also dropped the unused 'import re'.)
try:
    import yaml
except ImportError:
    yaml = None
agents = []
base = '/data/hermes/agency-agents'
# Repo housekeeping files that are .md but not agent role prompts.
skip = {'README.md','README.zh-TW.md','CATALOG.md','AGENT-LIST.md','CONTRIBUTING.md','LICENSE','UPSTREAM.md','.gitattributes'}
for root, dirs, files in os.walk(base):
    for f in files:
        if not (f.endswith('.md') and f not in skip):
            continue
        path = os.path.join(root, f)
        rel = os.path.relpath(path, base)
        try:
            with open(path, encoding='utf-8') as fh:
                content = fh.read()
            name = desc = dept = ''
            # Prefer name/description from the '---' YAML front-matter block.
            if yaml is not None and content.startswith('---'):
                parts = content.split('---', 2)
                if len(parts) >= 3:
                    meta = yaml.safe_load(parts[1]) or {}
                    name = meta.get('name', '')
                    desc = meta.get('description', '')
            if not name:
                name = f.replace('.md', '').replace('-', ' ').title()
            # Department = top-level folder the role lives in.
            dept = rel.split('/')[0] if '/' in rel else 'root'
            agents.append({'id': f.replace('.md',''), 'name': name, 'desc': desc[:80], 'dept': dept, 'path': rel})
        except Exception:
            # Best-effort: one unreadable/malformed role must not abort indexing.
            pass
agents.sort(key=lambda x: (x['dept'], x['name']))
idx_path = os.path.join(base, '.agent-index.json')
with open(idx_path, 'w', encoding='utf-8') as out:
    json.dump(agents, out, ensure_ascii=False, indent=2)
print(f"Agency agents indexed: {len(agents)} roles ready")
AGENT_INDEX
fi
# ── Kill any residual gateway processes from previous crash ──
echo "Cleaning up residual gateway processes..."
# Kill any lingering hermes gateway processes (prevents Feishu lock conflict).
# Ask politely with SIGTERM first so the gateway can release its Feishu lock
# cleanly; the old code jumped straight to SIGKILL.
for pid in $(pgrep -f "hermes_cli.main.*gateway" 2>/dev/null); do
  echo " Killing residual gateway PID=$pid"
  kill "$pid" 2>/dev/null
done
sleep 2 # Wait for processes and ports to be fully released
# Escalate to SIGKILL for anything that ignored SIGTERM
for pid in $(pgrep -f "hermes_cli.main.*gateway" 2>/dev/null); do
  kill -9 "$pid" 2>/dev/null
done
# ── Clean up stale PID/lock files from previous crash ──
echo "Cleaning up stale state..."
rm -f "$HERMES_HOME/gateway.pid" 2>/dev/null
rm -f "$HERMES_HOME/.gateway_runtime_lock" 2>/dev/null
rm -f "$HERMES_HOME/.gateway_takeover" 2>/dev/null
rm -f /tmp/hermes-gateway.pid 2>/dev/null
# Clean Feishu lock files
rm -f "$HERMES_HOME"/feishu*.lock 2>/dev/null
rm -f "$HERMES_HOME"/*.feishu_lock 2>/dev/null
echo "Stale state cleaned."
# ─── Lifecycle Heartbeat initialization ──────────────────
HERMES_DATA_DIR="/data/hermes"
# Make sure the persistent scripts dir exists, then sync the lifecycle script
mkdir -p "$HERMES_DATA_DIR/scripts"
LIFECYCLE_SRC="/app/scripts/hermes-lifecycle.sh"
if [[ -f "$LIFECYCLE_SRC" ]]; then
  cp -f "$LIFECYCLE_SRC" "$HERMES_DATA_DIR/scripts/hermes-lifecycle.sh"
  chmod +x "$HERMES_DATA_DIR/scripts/hermes-lifecycle.sh"
  echo "Lifecycle script synced to persistent storage."
fi
# Initialize identity.md (first run only; container restarts never overwrite it)
if [[ -f "$HERMES_DATA_DIR/identity.md" ]]; then
  echo "identity.md exists (preserved)."
else
  cat > "$HERMES_DATA_DIR/identity.md" <<'IDENTITY'
# Hermes 身份记忆
# 这个文件定义了 Hermes 对自己的认知
# /reset 后此文件不会被清除(在持久化目录中)
## 基础信息
- 名字: Hermes
- 通道: 飞书(WebSocket) / 微信
- 主人: 用户344064
## 性格特征
- 中文为主,简洁有力
- 结果先行,解释后补
- 偶尔幽默但不影响效率
- 有工具、有记忆、有判断力
## 主人偏好
- 不喜欢废话,喜欢直给
- 欣赏有深度的技术分析
- 喜欢直来直去的沟通
## 运维记忆
<!-- 自动追加,不要手动编辑此节 -->
IDENTITY
  echo "identity.md initialized."
fi
# Initialize insights.md (first run only)
if [[ -f "$HERMES_DATA_DIR/insights.md" ]]; then
  echo "insights.md exists (preserved)."
else
  cat > "$HERMES_DATA_DIR/insights.md" <<'INSIGHTS'
# Hermes 洞察日志 (insights.md)
# 自动记录异常、观察、值得汇报的事
# 类别: 通道异常 / 系统异常 / 用户洞察 / 技术发现 / 待办提醒 / 运维记忆
INSIGHTS
  echo "insights.md initialized."
fi
# Initialize heartbeat-state.json (first run only)
if [[ -f "$HERMES_DATA_DIR/heartbeat-state.json" ]]; then
  echo "heartbeat-state.json exists (preserved)."
else
  echo '{"lastCheck":null,"lastConfigCheck":null,"totalRuns":0,"totalErrors":0,"consecutiveErrors":0,"lastError":null}' > "$HERMES_DATA_DIR/heartbeat-state.json"
  echo "heartbeat-state.json initialized."
fi
# Keep the cron directory on persistent storage
mkdir -p "$HERMES_DATA_DIR/cron"
echo "Lifecycle heartbeat ready."
# Initialize MemPalace if not already
PALACE_PATH="${MEMPALACE_PALACE_PATH:-/data/hermes/palace}"
if [[ -f "$PALACE_PATH/.palace_initialized" ]]; then
  echo "MemPalace already initialized."
else
  echo "Initializing MemPalace at $PALACE_PATH..."
  # Best-effort: an existing palace makes 'init' fail harmlessly.
  mempalace init "$PALACE_PATH" 2>/dev/null || echo "MemPalace init skipped (may already exist)"
  touch "$PALACE_PATH/.palace_initialized"
  echo "MemPalace initialized."
fi
# ─── Auto-register Lifecycle Cron Job ─────────────────
# Single merged cron job: lifecycle-heartbeat
# Covers: health check / config integrity / log analysis / insight logging / cleanup / state update
# Runs every 2 hours
# NOTE: $CRON_JOBS is interpolated directly into the embedded Python source
# below. That is safe here only because the path is a fixed constant with no
# quotes or backslashes — do not reuse this pattern for user-supplied paths.
CRON_DIR="$HERMES_DATA_DIR/cron"
CRON_JOBS="$CRON_DIR/jobs.json"
if [ -f "$CRON_JOBS" ]; then
# Cron config already exists — check whether lifecycle-heartbeat is registered
if ! python3 -c "
import json
d=json.load(open('$CRON_JOBS'))
jobs=[j for j in d.get('jobs',[]) if j.get('name')=='lifecycle-heartbeat']
print('found' if jobs else 'missing')
" 2>/dev/null | grep -q "found"; then
echo "Cron exists but lifecycle-heartbeat missing, injecting..."
# Append the job to the existing jobs.json, leaving all other jobs untouched.
# Timestamps use UTC+8; next run is 2h after the top of the current hour.
python3 -c "
import json, uuid
from datetime import datetime, timezone, timedelta
f='$CRON_JOBS'
d=json.load(open(f))
now=datetime.now(timezone(timedelta(hours=8)))
next_run=now.replace(minute=0,second=0,microsecond=0)+timedelta(hours=2)
d['jobs'].append({
'id': uuid.uuid4().hex[:12],
'name': 'lifecycle-heartbeat',
'prompt': 'Execute lifecycle heartbeat: health check, config integrity, log analysis, insights, cleanup.',
'skills': [], 'skill': None, 'model': None, 'provider': None, 'base_url': None,
'script': 'hermes-lifecycle.sh',
'context_from': None,
'schedule': {'kind': 'cron', 'expr': '0 0/2 * * *', 'display': '0 0/2 * * *'},
'schedule_display': '0 0/2 * * *',
'repeat': {'times': None, 'completed': 0},
'enabled': True, 'state': 'scheduled',
'paused_at': None, 'paused_reason': None,
'created_at': now.isoformat(),
'next_run_at': next_run.isoformat(),
'last_run_at': None, 'last_status': None, 'last_error': None,
'last_delivery_error': None,
'deliver': ['local'],
'origin': 'start.sh-auto-inject',
'enabled_toolsets': None,
'workdir': '/data/hermes'
})
d['updated_at']=now.isoformat()
json.dump(d,open(f,'w'),indent=2)
print('lifecycle-heartbeat cron injected')
" 2>/dev/null && echo "OK" || echo "WARN: Failed to inject cron job"
else
echo "lifecycle-heartbeat cron already configured."
fi
else
# First run: create jobs.json from scratch with the single heartbeat job
mkdir -p "$CRON_DIR"
python3 -c "
import json, uuid
from datetime import datetime, timezone, timedelta
now=datetime.now(timezone(timedelta(hours=8)))
next_run=now.replace(minute=0,second=0,microsecond=0)+timedelta(hours=2)
d={
'jobs': [{
'id': uuid.uuid4().hex[:12],
'name': 'lifecycle-heartbeat',
'prompt': 'Execute lifecycle heartbeat: health check, config integrity, log analysis, insights, cleanup.',
'skills': [], 'skill': None, 'model': None, 'provider': None, 'base_url': None,
'script': 'hermes-lifecycle.sh',
'context_from': None,
'schedule': {'kind': 'cron', 'expr': '0 0/2 * * *', 'display': '0 0/2 * * *'},
'schedule_display': '0 0/2 * * *',
'repeat': {'times': None, 'completed': 0},
'enabled': True, 'state': 'scheduled',
'paused_at': None, 'paused_reason': None,
'created_at': now.isoformat(),
'next_run_at': next_run.isoformat(),
'last_run_at': None, 'last_status': None, 'last_error': None,
'last_delivery_error': None,
'deliver': ['local'],
'origin': 'start.sh-auto-inject',
'enabled_toolsets': None,
'workdir': '/data/hermes'
}],
'updated_at': now.isoformat()
}
json.dump(d,open('$CRON_JOBS','w'),indent=2)
print('lifecycle-heartbeat cron created')
" 2>/dev/null && echo "Cron job auto-created." || echo "WARN: Failed to create cron job"
fi
# ── Gateway startup is handled by entry.py watchdog ──
# Do NOT start gateway here — entry.py's _gateway_watchdog thread manages
# the full lifecycle (start, monitor, zombie-detect, restart with --replace).
# Starting gateway from both start.sh AND entry.py causes PID conflicts
# and "Another gateway already using this Feishu app_id" errors.
echo "[$(date)] Gateway will be started by entry.py watchdog"
echo "[$(date)] Waiting for gateway to be ready on :8642..."
# Poll the health endpoint for up to ~2 minutes (60 tries x 2s)
gw_try=0
while [ "$gw_try" -lt 60 ]; do
  if curl -s http://127.0.0.1:8642/health > /dev/null 2>&1; then
    echo "[$(date)] Gateway is ready on :8642"
    break
  fi
  sleep 2
  gw_try=$((gw_try + 1))
done
# ── Auto-update hermes-agent if newer release exists ──
# hermes-agent is pip install -e (editable), so git pull + pip upgrade = instant.
# Safety: update runs in background; if pip fails, old code stays intact.
# Set AGENT_AUTO_UPDATE=false to disable.
update_hermes_agent_background() {
  [ "${AGENT_AUTO_UPDATE}" = "false" ] && return
  AGENT_REPO="NousResearch/hermes-agent"
  AGENT_DIR="/app/hermes-agent"
  VERSION_FILE="/data/hermes/agent.version"
  API_URL="https://api.github.com/repos/${AGENT_REPO}/releases/latest"
  EXTRAS="feishu,mcp,cron,pty"
  # Run "$@" with stderr merged, keep only the last N output lines, and
  # return the COMMAND's exit status via PIPESTATUS.
  # BUGFIX: the old `if ! cmd 2>&1 | tail -3` pattern tested tail's status
  # (always 0 — this script has no pipefail), so git/pip failures were
  # silently treated as success.
  _run_trimmed() {
    local keep="$1"; shift
    "$@" 2>&1 | tail -n "$keep"
    return "${PIPESTATUS[0]}"
  }
  # ── Phase 0: Unshallow the clone if needed ──
  # Dockerfile uses `git clone --depth 1` which prevents checking out
  # any tag/commit outside the shallow boundary.
  # Without this, `git rev-parse <tag>` ALWAYS fails after a rebuild.
  if [ -f "$AGENT_DIR/.git/shallow" ]; then
    echo "[$(date)] Agent auto-update: unshallowing clone (Dockerfile --depth 1)..."
    if _run_trimmed 3 git -C "$AGENT_DIR" fetch --unshallow origin; then
      echo "[$(date)] Agent auto-update: clone unshallowed successfully"
    else
      echo "[$(date)] Agent auto-update: unshallow failed, tag checkout may not work"
    fi
  fi
  # ── Phase 1: Detect actual code version vs recorded version ──
  # After a HF Space rebuild, /app/hermes-agent is re-cloned at the
  # Dockerfile pinned version, but /data/hermes/agent.version (persistent)
  # still says the newer version from the previous auto-update.
  # This mismatch causes the updater to think it's already up to date.
  ACTUAL_TAG=$(git -C "$AGENT_DIR" describe --tags --exact-match 2>/dev/null || echo "")
  BUILD_VERSION="$(head -1 /app/hermes-agent.version 2>/dev/null)"
  # Current version from persistent storage (survives rebuilds)
  CURRENT_VERSION="$(head -1 "$VERSION_FILE" 2>/dev/null)"
  if [ -z "$CURRENT_VERSION" ]; then
    CURRENT_VERSION="${BUILD_VERSION:-v2026.4.30}"
    echo "$CURRENT_VERSION" > "$VERSION_FILE"
  fi
  # Detect rebuild mismatch: actual git tag ≠ recorded version
  NEED_FORCE=false
  if [ -n "$ACTUAL_TAG" ] && [ "$ACTUAL_TAG" != "$CURRENT_VERSION" ]; then
    echo "[$(date)] Agent auto-update: REBUILD DETECTED (actual=$ACTUAL_TAG, recorded=$CURRENT_VERSION)"
    echo "[$(date)] Agent auto-update: code was reset to Dockerfile version by container rebuild"
    NEED_FORCE=true
    # Reset comparison baseline to actual (old) code version
    CURRENT_VERSION="$ACTUAL_TAG"
  fi
  # Version comparison helper: strip leading 'v', compare date-style like
  # 2026.4.30 component-wise. Returns 0 (true) when $2 is strictly newer.
  compare_date_versions() {
    local a="${1#v}" b="${2#v}"
    IFS='.' read -ra A <<< "$a"
    IFS='.' read -ra B <<< "$b"
    for i in 0 1 2; do
      local ai=${A[$i]:-0} bi=${B[$i]:-0}
      if [ "$bi" -gt "$ai" ] 2>/dev/null; then return 0; fi
      if [ "$bi" -lt "$ai" ] 2>/dev/null; then return 1; fi
    done
    return 1 # equal or older
  }
  echo "[$(date)] Agent auto-update: checking (current: $CURRENT_VERSION, actual: $ACTUAL_TAG, latest: querying...)"
  # ── Phase 2: Query GitHub API for latest release ──
  LATEST_JSON=$(curl -sf --connect-timeout 10 --max-time 20 "$API_URL" 2>/dev/null)
  if [ -z "$LATEST_JSON" ]; then
    echo "[$(date)] Agent auto-update: failed to reach GitHub API, skipping"
    return
  fi
  LATEST_TAG=$(echo "$LATEST_JSON" | python3 -c "import json,sys; print(json.load(sys.stdin).get('tag_name',''))" 2>/dev/null)
  if [ -z "$LATEST_TAG" ]; then
    echo "[$(date)] Agent auto-update: could not parse latest tag, skipping"
    return
  fi
  echo "[$(date)] Agent auto-update: latest release is $LATEST_TAG"
  # ── Phase 3: Decide if update is needed ──
  if compare_date_versions "$CURRENT_VERSION" "$LATEST_TAG"; then
    # BUGFIX: this message used to print "$CURRENT_VERSION$LATEST_TAG" with
    # the arrow missing (e.g. "upgrading v2026.4.30v2026.5.2 ...").
    echo "[$(date)] Agent auto-update: upgrading $CURRENT_VERSION -> $LATEST_TAG ..."
  elif [ "$NEED_FORCE" = "true" ]; then
    # Rebuild detected: latest = recorded version, but code is still old.
    # Re-apply the update to restore correct version.
    echo "[$(date)] Agent auto-update: re-applying $LATEST_TAG after rebuild (code was reset to $ACTUAL_TAG)"
  else
    echo "[$(date)] Agent auto-update: $CURRENT_VERSION is up to date"
    return
  fi
  # ── Phase 4: git fetch + checkout new tag (non-destructive) ──
  # This function runs as a background job, so 'cd' cannot leak into the
  # main script's working directory.
  cd "$AGENT_DIR" || { echo "[$(date)] Agent auto-update: cannot cd to $AGENT_DIR, aborting"; return; }
  if ! _run_trimmed 3 git fetch --tags origin; then
    echo "[$(date)] Agent auto-update: git fetch failed, aborting"
    return
  fi
  # Verify tag exists (after unshallow, this should succeed)
  if ! git rev-parse "$LATEST_TAG" >/dev/null 2>&1; then
    echo "[$(date)] Agent auto-update: tag $LATEST_TAG not found locally, fetching explicitly..."
    if ! git fetch origin "refs/tags/$LATEST_TAG:refs/tags/$LATEST_TAG" 2>&1; then
      echo "[$(date)] Agent auto-update: explicit tag fetch failed, aborting"
      return
    fi
    if ! git rev-parse "$LATEST_TAG" >/dev/null 2>&1; then
      echo "[$(date)] Agent auto-update: tag $LATEST_TAG still not found, aborting"
      return
    fi
  fi
  # ── Phase 5: checkout new version ──
  if ! _run_trimmed 3 git checkout "$LATEST_TAG"; then
    echo "[$(date)] Agent auto-update: git checkout failed, aborting"
    # Try to recover to previous version (only when we actually know it —
    # $ACTUAL_TAG is empty when 'git describe' failed earlier)
    [ -n "$ACTUAL_TAG" ] && git checkout "$ACTUAL_TAG" 2>/dev/null
    return
  fi
  # ── Phase 6: update pip dependencies (editable install) ──
  echo "[$(date)] Agent auto-update: updating pip dependencies..."
  if ! _run_trimmed 10 pip install --quiet -e "/app/hermes-agent[${EXTRAS}]"; then
    echo "[$(date)] Agent auto-update: pip install failed, rolling back"
    [ -n "$ACTUAL_TAG" ] && git checkout "$ACTUAL_TAG" 2>/dev/null
    pip install --quiet -e "/app/hermes-agent[${EXTRAS}]" 2>/dev/null
    return
  fi
  # ── Phase 7: reinstall our patches on top of new version ──
  echo "[$(date)] Agent auto-update: re-applying Hermes Bot patches..."
  for patch in patch_file_delivery patch_auto_media patch_resolve_media_paths \
               patch_weixin_cross_loop patch_web_search_fallback \
               patch_strip_thinking_tags patch_sandbox_isolation; do
    [ -f "/app/scripts/${patch}.py" ] && python3 "/app/scripts/${patch}.py" 2>/dev/null
  done
  # Copy patch files if they exist
  for patch_file in prompt_builder.py send_message_tool.py; do
    if [ -f "/app/patches/hermes-agent/agent/$patch_file" ] && [ -f "$AGENT_DIR/agent/$patch_file" ]; then
      cp "/app/patches/hermes-agent/agent/$patch_file" "$AGENT_DIR/agent/$patch_file" 2>/dev/null
    fi
  done
  # Save new version (tag on line 1, update timestamp on line 2)
  echo "$LATEST_TAG" > "$VERSION_FILE"
  echo "$(date '+%Y-%m-%d %H:%M:%S')" >> "$VERSION_FILE"
  echo "[$(date)] Agent auto-update: upgraded to $LATEST_TAG ✓ (restart needed for full effect)"
  # ── Phase 8: schedule gateway restart for clean reload ──
  # Send SIGUSR1 to entry.py to trigger gateway restart cycle
  ENTRY_PID=$(pgrep -f "python3 /app/entry.py" 2>/dev/null | head -1)
  if [ -n "$ENTRY_PID" ]; then
    kill -USR1 "$ENTRY_PID" 2>/dev/null && \
      echo "[$(date)] Agent auto-update: sent reload signal to entry.py (PID: $ENTRY_PID)" || \
      echo "[$(date)] Agent auto-update: gateway will use new code on next conversation"
  fi
}
# ── Auto-update hermes-web-ui if newer release exists ──
# Runs asynchronously so it doesn't block startup.
# All output goes to /data/hermes/logs/auto-update.log (not stdout, which gets eaten by exec).
# Set WEBUI_AUTO_UPDATE=false to disable.
update_webui_background() {
  [ "${WEBUI_AUTO_UPDATE}" = "false" ] && return
  WEBUI_REPO="EKKOLearnAI/hermes-web-ui"
  VERSION_FILE="/data/hermes/webui.version"
  BUILD_VERSION_FILE="/app/webui.version"
  BUILD_TMP="/tmp/webui-update"
  WEBUI_INSTALL="/app/webui-server"
  WEBUI_CLIENT="/app/webui-client"
  API_URL="https://api.github.com/repos/${WEBUI_REPO}/releases/latest"
  UPDATE_LOG="/data/hermes/logs/auto-update.log"
  # Tee all output to log file for diagnostics
  _log() { echo "[$(date)] $*"; }
  _log_and_tee() { _log "$*" | tee -a "$UPDATE_LOG"; }
  # Run a build step, tee its output to the log, keep the last N lines on
  # stdout, and return the STEP's exit status via PIPESTATUS.
  # BUGFIX (the "silent failure"): the old `if ! cmd | tee | tail` pattern
  # tested tail's status (always 0), so clone/npm failures were silently
  # treated as success and a broken build could be hot-swapped over a
  # working WebUI.
  _step() {
    local keep="$1"; shift
    "$@" 2>&1 | tee -a "$UPDATE_LOG" | tail -n "$keep"
    return "${PIPESTATUS[0]}"
  }
  _log_and_tee "=== WebUI auto-update starting ==="
  # ── Detect rebuild: Dockerfile writes /app/webui.version, persistent is /data/hermes/ ──
  BUILD_VERSION="$(head -1 "$BUILD_VERSION_FILE" 2>/dev/null)"
  RECORDED_VERSION="$(head -1 "$VERSION_FILE" 2>/dev/null)"
  NEED_FORCE=false
  if [ -n "$BUILD_VERSION" ] && [ -n "$RECORDED_VERSION" ] && [ "$BUILD_VERSION" != "$RECORDED_VERSION" ]; then
    _log_and_tee "REBUILD DETECTED (Dockerfile=$BUILD_VERSION, recorded=$RECORDED_VERSION)"
    NEED_FORCE=true
    CURRENT_VERSION="$BUILD_VERSION"
  elif [ -n "$RECORDED_VERSION" ]; then
    CURRENT_VERSION="$RECORDED_VERSION"
  elif [ -n "$BUILD_VERSION" ]; then
    CURRENT_VERSION="$BUILD_VERSION"
  else
    # No version info anywhere: assume the oldest shipped release
    CURRENT_VERSION="v0.5.5"
    echo "$CURRENT_VERSION" > "$VERSION_FILE"
  fi
  _log_and_tee "Checking: current=$CURRENT_VERSION, Dockerfile=$BUILD_VERSION, latest=?"
  # Query GitHub API for latest release tag
  LATEST_JSON=$(curl -sf --connect-timeout 10 --max-time 20 "$API_URL" 2>/dev/null)
  if [ -z "$LATEST_JSON" ]; then
    _log_and_tee "ERROR: failed to reach GitHub API, skipping"
    return
  fi
  LATEST_TAG=$(echo "$LATEST_JSON" | python3 -c "import json,sys; print(json.load(sys.stdin).get('tag_name',''))" 2>/dev/null)
  if [ -z "$LATEST_TAG" ]; then
    _log_and_tee "ERROR: could not parse latest tag, skipping"
    return
  fi
  _log_and_tee "Latest release: $LATEST_TAG"
  # Compare versions (semver-ish x.y.z, leading 'v' stripped, component-wise)
  CURRENT_NUM="${CURRENT_VERSION#v}"
  LATEST_NUM="${LATEST_TAG#v}"
  if [ "$CURRENT_NUM" = "$LATEST_NUM" ] && [ "$NEED_FORCE" = "false" ]; then
    _log_and_tee "Already on latest ($CURRENT_VERSION)"
    return
  fi
  update_needed=false
  IFS='.' read -ra C <<< "$CURRENT_NUM"
  IFS='.' read -ra L <<< "$LATEST_NUM"
  for i in 0 1 2; do
    c=${C[$i]:-0}; l=${L[$i]:-0}
    if [ "$l" -gt "$c" ] 2>/dev/null; then update_needed=true; break; fi
    if [ "$l" -lt "$c" ] 2>/dev/null; then break; fi
  done
  if [ "$update_needed" = "false" ] && [ "$NEED_FORCE" = "false" ]; then
    _log_and_tee "Current $CURRENT_VERSION is up to date"
    return
  fi
  if [ "$NEED_FORCE" = "true" ] && [ "$update_needed" = "false" ]; then
    _log_and_tee "Re-applying $LATEST_TAG after rebuild (code reset to $BUILD_VERSION)"
  else
    _log_and_tee "Upgrading $CURRENT_VERSION -> $LATEST_TAG"
  fi
  # ── Build with retry (2 attempts) ──
  BUILD_OK=false
  for attempt in 1 2; do
    _log_and_tee "Build attempt $attempt/2..."
    # Clone
    rm -rf "$BUILD_TMP"
    if ! _step 3 git clone --depth 1 --branch "$LATEST_TAG" "https://github.com/${WEBUI_REPO}.git" "$BUILD_TMP"; then
      _log_and_tee "ERROR: git clone failed"
      rm -rf "$BUILD_TMP"
      [ "$attempt" -lt 2 ] && sleep 10 && continue
      return
    fi
    cd "$BUILD_TMP" || return
    # Install (with timeout)
    _log_and_tee "Running npm install..."
    if ! _step 5 timeout 120 npm install --ignore-scripts; then
      _log_and_tee "ERROR: npm install failed/timed out"
      cd / && rm -rf "$BUILD_TMP"
      [ "$attempt" -lt 2 ] && sleep 10 && continue
      return
    fi
    # Rebuild native modules (required by node-pty, matching upstream Dockerfile)
    _log_and_tee "Running npm rebuild node-pty..."
    _step 5 npm rebuild node-pty # best-effort: failure is logged, not fatal
    # Build (with memory limit, matching upstream Dockerfile)
    _log_and_tee "Running npm run build (NODE_OPTIONS=--max-old-space-size=4096)..."
    if ! _step 15 timeout 180 env NODE_OPTIONS=--max-old-space-size=4096 npm run build; then
      _log_and_tee "ERROR: npm build failed/timed out"
      cd / && rm -rf "$BUILD_TMP"
      [ "$attempt" -lt 2 ] && sleep 10 && continue
      return
    fi
    # Verify build output
    if [ ! -d "$BUILD_TMP/dist/server" ] || [ ! -d "$BUILD_TMP/dist/client" ]; then
      _log_and_tee "ERROR: build output missing (no dist/server or dist/client)"
      ls -la "$BUILD_TMP/dist/" 2>&1 | tee -a "$UPDATE_LOG"
      cd / && rm -rf "$BUILD_TMP"
      [ "$attempt" -lt 2 ] && sleep 10 && continue
      return
    fi
    _log_and_tee "Build succeeded!"
    BUILD_OK=true
    break # exit retry loop
  done
  # Belt and braces: never hot-swap the running WebUI unless a build
  # definitely succeeded.
  if [ "$BUILD_OK" != "true" ]; then
    _log_and_tee "ERROR: build did not complete, keeping current WebUI"
    return
  fi
  # Hot-swap: kill old WebUI process, replace files, restart
  _log_and_tee "Hot-swapping: stopping old WebUI, replacing files..."
  OLD_WEBUI_PID=$(pgrep -f "node index.js" 2>/dev/null | head -1)
  if [ -n "$OLD_WEBUI_PID" ]; then
    kill "$OLD_WEBUI_PID" 2>/dev/null
    sleep 2
    # Force kill if still running
    kill -9 "$OLD_WEBUI_PID" 2>/dev/null
    _log_and_tee "Killed old WebUI PID=$OLD_WEBUI_PID"
  fi
  # Install new files
  rm -rf "$WEBUI_INSTALL" "$WEBUI_CLIENT"
  mkdir -p "$WEBUI_INSTALL" "$WEBUI_CLIENT"
  cp -r "$BUILD_TMP/dist/server/"* "$WEBUI_INSTALL/"
  cp -r "$BUILD_TMP/dist/client/"* "$WEBUI_CLIENT/"
  cp "$BUILD_TMP/package.json" "$WEBUI_INSTALL/package.json"
  # Install production-only node_modules
  cd "$BUILD_TMP" || return
  npm prune --omit=dev 2>&1 | tail -3
  cp -r node_modules "$WEBUI_INSTALL/node_modules"
  # Save new version (tag on line 1, update timestamp on line 2)
  echo "$LATEST_TAG" > "$VERSION_FILE"
  echo "$(date '+%Y-%m-%d %H:%M:%S')" >> "$VERSION_FILE"
  # Restart WebUI (this function runs as a background job, so the cd and
  # exports below cannot leak into the main script)
  cd "$WEBUI_INSTALL" || return
  export PORT=6060 UPSTREAM=http://127.0.0.1:8642 HERMES_HOME=/root/.hermes
  export AUTH_TOKEN="${AUTH_TOKEN:-hermes-bot-2026}" CORS_ORIGINS="*" NODE_ENV=production
  node index.js >> /data/hermes/logs/webui.log 2>&1 &
  NEW_PID=$!
  _log_and_tee "WebUI upgraded to $LATEST_TAG (new PID: $NEW_PID)"
  # Verify
  sleep 3
  if curl -sf http://127.0.0.1:6060/health > /dev/null 2>&1; then
    _log_and_tee "$LATEST_TAG is running and healthy"
  else
    _log_and_tee "WARNING: health check failed after upgrade"
  fi
  rm -rf "$BUILD_TMP"
  _log_and_tee "=== WebUI auto-update complete ==="
}
# ── Start hermes-web-ui Node.js BFF server on :6060 ──
echo "[$(date)] Starting hermes-web-ui BFF..."
# BFF environment: listen port, upstream gateway, auth token, CORS
export PORT=6060 UPSTREAM=http://127.0.0.1:8642
export HERMES_HOME=/root/.hermes
export AUTH_TOKEN="${AUTH_TOKEN:-hermes-bot-2026}"
export CORS_ORIGINS="*" NODE_ENV=production
cd /app/webui-server
node index.js >> /data/hermes/logs/webui.log 2>&1 &
WEBUI_PID=$!
echo "[$(date)] WebUI BFF PID: $WEBUI_PID"
# ── Force-correct version display ──
# Ensure __init__.py shows semver (e.g. 0.12.0) not git tag date (e.g. 2026.4.30)
# This runs after any potential auto-update has changed the files
HERMES_INIT="/app/hermes-agent/hermes_cli/__init__.py"
if [ -f "$HERMES_INIT" ]; then
  # Read the git tag from version file to map date → semver
  CURRENT_TAG="$(head -1 /data/hermes/agent.version 2>/dev/null)"
  CURRENT_TAG="${CURRENT_TAG:-v2026.4.30}"
  # Build version mapping: date-tag → semver
  case "$CURRENT_TAG" in
    v2026.4.30) SEMVER="0.12.0"; RDATE="2026.4.30" ;;
    *) SEMVER=""; RDATE="" ;;
  esac
  if [ -n "$SEMVER" ]; then
    # BUGFIX: the previous sed used [^"] inside a double-quoted shell string;
    # the inner quote terminated the shell quoting and produced an
    # unterminated bracket expression, so sed errored on EVERY run and the
    # version was never actually patched. The quote must be escaped: [^\"].
    # (\s is a GNU sed extension; fine on this Linux image.)
    sed -i "s/__version__\s*=\s*\"[^\"]*\"/__version__ = \"$SEMVER\"/" "$HERMES_INIT"
    sed -i "s/__release_date__\s*=\s*\"[^\"]*\"/__release_date__ = \"$RDATE\"/" "$HERMES_INIT"
    echo "Version patched: v$SEMVER ($RDATE)"
  fi
fi
# Trigger hermes-agent auto-update in background (framework first, then UI)
update_hermes_agent_background &
# Trigger WebUI auto-update in background (non-blocking)
# Will check GitHub, build if newer, and hot-swap
update_webui_background &
# Wait for WebUI BFF to be ready (up to 15 tries x 2s)
echo "[$(date)] Waiting for WebUI BFF to start..."
bff_try=0
while [ "$bff_try" -lt 15 ]; do
  if curl -s http://127.0.0.1:6060/health > /dev/null 2>&1; then
    echo "[$(date)] WebUI BFF is ready on :6060"
    break
  fi
  sleep 2
  bff_try=$((bff_try + 1))
done
# ── Auto-setup WebUI credentials if not configured ──
AUTH_TOKEN="${AUTH_TOKEN:-hermes-bot-2026}"
WEBUI_USER="${WEBUI_USERNAME:-admin}"
# SECURITY NOTE: these defaults are well-known fallbacks — override
# WEBUI_PASSWORD (and AUTH_TOKEN) via HF Space Secrets in any real deployment.
WEBUI_PASS="${WEBUI_PASSWORD:-Hermes2026}"
AUTH_STATUS=$(curl -s http://127.0.0.1:6060/api/auth/status 2>/dev/null)
# Python prints "True"/"False"; empty means curl or JSON parsing failed.
HAS_PW=$(echo "$AUTH_STATUS" | python3 -c "import json,sys; print(json.load(sys.stdin).get('hasPasswordLogin',False))" 2>/dev/null)
if [ "$HAS_PW" = "False" ]; then
  echo "[$(date)] WebUI: No credentials configured, auto-setting up..."
  # NOTE(review): the JSON body is built by interpolation — a password
  # containing a double quote would break it; defaults and typical secrets
  # are safe, but worth hardening if arbitrary passwords are expected.
  SETUP_RESULT=$(curl -s -w "\n%{http_code}" -X POST http://127.0.0.1:6060/api/auth/setup \
    -H "Content-Type: application/json" \
    -H "Authorization: Bearer $AUTH_TOKEN" \
    -d "{\"username\":\"$WEBUI_USER\",\"password\":\"$WEBUI_PASS\"}" 2>/dev/null)
  SETUP_CODE=$(echo "$SETUP_RESULT" | tail -1)
  if [ "$SETUP_CODE" = "200" ]; then
    echo "[$(date)] WebUI: Credentials auto-configured (user: $WEBUI_USER)"
  else
    echo "[$(date)] WebUI: Auto-setup failed: $SETUP_RESULT"
  fi
elif [ "$HAS_PW" = "True" ]; then
  echo "[$(date)] WebUI: Credentials already configured"
else
  # BUGFIX: an empty/unparsable status response used to fall through to the
  # "already configured" message, which was misleading when the BFF was
  # simply not reachable yet.
  echo "[$(date)] WebUI: Could not determine auth status (BFF unreachable?), skipping auto-setup"
fi
echo ""
echo "=== All services started ==="
echo " Gateway: http://127.0.0.1:8642 (with Python watchdog in entry.py)"
echo " WebUI: http://127.0.0.1:6060"
echo " Proxy: http://0.0.0.0:7860"
echo " Auth Token: $(_mask_val "$AUTH_TOKEN")"
echo ""
# Start Python proxy on :7860 (main HF Space port)
# entry.py contains a Python-based gateway watchdog that will auto-restart
# the gateway if it dies, regardless of what happens to this shell script
exec python3 /app/entry.py