Z User committed on
Commit
e10e850
·
1 Parent(s): 99efc6a

fix: webui auto-update silent failure + bump to v0.5.9

Browse files

Root cause of auto-update never working:

1. Missing NODE_OPTIONS=--max-old-space-size=4096
The upstream Dockerfile sets this option. CPU-only HF Spaces have limited RAM,
and the Vite build (vue-tsc + vite + esbuild) runs out of memory silently without it.

2. Missing npm rebuild node-pty
node-pty is a native C++ module. 'npm install --ignore-scripts'
skips compilation. Without 'npm rebuild node-pty', the module
has no compiled .node binary at runtime.

3. All error output went to stdout (lost after 'exec python3 entry.py')
The auto-update runs in the background, and then exec replaces the shell.
All error messages are silently discarded, leaving no way to diagnose failures.
Fix: write everything to /data/hermes/logs/auto-update.log

4. No retry on transient failure
A single transient error (network timeout, disk pressure, etc.) aborted the whole update, because only one attempt was made.
Fix: 2-attempt retry with a 10s backoff.

Changes:
- Dockerfile: v0.5.8 -> v0.5.9 + NODE_OPTIONS + npm rebuild node-pty
- start.sh: Complete rewrite of update_webui_background()
- Full logging to auto-update.log
- NODE_OPTIONS=--max-old-space-size=4096
- npm rebuild node-pty step
- timeout on npm install (120s) and build (180s)
- 2-attempt retry loop
- Clean hot-swap (kill old, rm old dirs, install new, restart)
- Proper environment variable export on restart

Files changed (2) hide show
  1. Dockerfile +8 -6
  2. start.sh +91 -54
Dockerfile CHANGED
@@ -82,13 +82,15 @@ RUN mkdir -p /usr/share/fonts/truetype/noto && \
82
  RUN git clone --depth 1 https://github.com/jnMetaCode/agency-agents-zh.git /app/agency-agents && \
83
  echo "agency-agents-zh cloned ($(find /app/agency-agents -name '*.md' ! -name 'README*' ! -name 'CATALOG*' ! -name 'AGENT-LIST*' ! -name 'CONTRIBUTING*' ! -name 'LICENSE*' ! -name 'UPSTREAM*' | wc -l) agent files)"
84
 
85
- # Build hermes-web-ui v0.5.8
 
86
  RUN rm -rf /tmp/hermes-web-ui && \
87
- git clone --depth 1 --branch v0.5.8 https://github.com/EKKOLearnAI/hermes-web-ui.git /tmp/hermes-web-ui && \
88
  cd /tmp/hermes-web-ui && \
89
- echo "build-v0.5.8-$(date +%Y%m%d)" > .buildstamp && \
90
  npm install --ignore-scripts 2>&1 | tail -5 && \
91
- npm run build 2>&1 | tail -10 && \
 
92
  mkdir -p /app/webui-server && \
93
  cp -r dist/server/* /app/webui-server/ && \
94
  mkdir -p /app/webui-client && \
@@ -97,8 +99,8 @@ RUN rm -rf /tmp/hermes-web-ui && \
97
  npm prune --omit=dev --prefix /tmp/hermes-web-ui 2>&1 | tail -3 && \
98
  cp -r node_modules /app/webui-server/node_modules && \
99
  rm -rf /tmp/hermes-web-ui && \
100
- echo "v0.5.8" > /app/webui.version && \
101
- echo "hermes-web-ui v0.5.8 build done"
102
 
103
  # Create hermes home
104
  RUN mkdir -p /root/.hermes/plugins/image_gen/pollinations
 
82
  RUN git clone --depth 1 https://github.com/jnMetaCode/agency-agents-zh.git /app/agency-agents && \
83
  echo "agency-agents-zh cloned ($(find /app/agency-agents -name '*.md' ! -name 'README*' ! -name 'CATALOG*' ! -name 'AGENT-LIST*' ! -name 'CONTRIBUTING*' ! -name 'LICENSE*' ! -name 'UPSTREAM*' | wc -l) agent files)"
84
 
85
+ # Build hermes-web-ui v0.5.9
86
+ # Aligned with upstream Dockerfile: NODE_OPTIONS + npm rebuild node-pty
87
  RUN rm -rf /tmp/hermes-web-ui && \
88
+ git clone --depth 1 --branch v0.5.9 https://github.com/EKKOLearnAI/hermes-web-ui.git /tmp/hermes-web-ui && \
89
  cd /tmp/hermes-web-ui && \
90
+ echo "build-v0.5.9-$(date +%Y%m%d)" > .buildstamp && \
91
  npm install --ignore-scripts 2>&1 | tail -5 && \
92
+ npm rebuild node-pty 2>&1 | tail -5 && \
93
+ NODE_OPTIONS=--max-old-space-size=4096 npm run build 2>&1 | tail -10 && \
94
  mkdir -p /app/webui-server && \
95
  cp -r dist/server/* /app/webui-server/ && \
96
  mkdir -p /app/webui-client && \
 
99
  npm prune --omit=dev --prefix /tmp/hermes-web-ui 2>&1 | tail -3 && \
100
  cp -r node_modules /app/webui-server/node_modules && \
101
  rm -rf /tmp/hermes-web-ui && \
102
+ echo "v0.5.9" > /app/webui.version && \
103
+ echo "hermes-web-ui v0.5.9 build done"
104
 
105
  # Create hermes home
106
  RUN mkdir -p /root/.hermes/plugins/image_gen/pollinations
start.sh CHANGED
@@ -622,7 +622,7 @@ update_hermes_agent_background() {
622
 
623
  # ── Auto-update hermes-web-ui if newer release exists ──
624
  # Runs asynchronously so it doesn't block startup.
625
- # Check: GitHub releases/latest -> compare with saved version -> build if newer.
626
  # Set WEBUI_AUTO_UPDATE=false to disable.
627
  update_webui_background() {
628
  [ "${WEBUI_AUTO_UPDATE}" = "false" ] && return
@@ -634,6 +634,13 @@ update_webui_background() {
634
  WEBUI_INSTALL="/app/webui-server"
635
  WEBUI_CLIENT="/app/webui-client"
636
  API_URL="https://api.github.com/repos/${WEBUI_REPO}/releases/latest"
 
 
 
 
 
 
 
637
 
638
  # ── Detect rebuild: Dockerfile writes /app/webui.version, persistent is /data/hermes/ ──
639
  BUILD_VERSION="$(cat "$BUILD_VERSION_FILE" 2>/dev/null | head -1)"
@@ -641,10 +648,8 @@ update_webui_background() {
641
  NEED_FORCE=false
642
 
643
  if [ -n "$BUILD_VERSION" ] && [ -n "$RECORDED_VERSION" ] && [ "$BUILD_VERSION" != "$RECORDED_VERSION" ]; then
644
- echo "[$(date)] WebUI auto-update: REBUILD DETECTED (Dockerfile=$BUILD_VERSION, recorded=$RECORDED_VERSION)"
645
- echo "[$(date)] WebUI auto-update: code was reset to $BUILD_VERSION by container rebuild"
646
  NEED_FORCE=true
647
- # Reset baseline so version comparison works correctly
648
  CURRENT_VERSION="$BUILD_VERSION"
649
  elif [ -n "$RECORDED_VERSION" ]; then
650
  CURRENT_VERSION="$RECORDED_VERSION"
@@ -655,95 +660,124 @@ update_webui_background() {
655
  echo "$CURRENT_VERSION" > "$VERSION_FILE"
656
  fi
657
 
658
- echo "[$(date)] WebUI auto-update: checking (current: $CURRENT_VERSION, Dockerfile: $BUILD_VERSION, latest: querying...)"
659
 
660
  # Query GitHub API for latest release tag
661
  LATEST_JSON=$(curl -sf --connect-timeout 10 --max-time 20 "$API_URL" 2>/dev/null)
662
  if [ $? -ne 0 ] || [ -z "$LATEST_JSON" ]; then
663
- echo "[$(date)] WebUI auto-update: failed to reach GitHub API, skipping"
664
  return
665
  fi
666
 
667
  LATEST_TAG=$(echo "$LATEST_JSON" | python3 -c "import json,sys; print(json.load(sys.stdin).get('tag_name',''))" 2>/dev/null)
668
  if [ -z "$LATEST_TAG" ]; then
669
- echo "[$(date)] WebUI auto-update: could not parse latest tag, skipping"
670
  return
671
  fi
672
 
673
- echo "[$(date)] WebUI auto-update: latest release is $LATEST_TAG"
674
 
675
- # Compare versions (strip leading 'v' for comparison)
676
  CURRENT_NUM="${CURRENT_VERSION#v}"
677
  LATEST_NUM="${LATEST_TAG#v}"
678
 
679
- # Skip if same or current is newer (unless rebuild detected)
680
  if [ "$CURRENT_NUM" = "$LATEST_NUM" ] && [ "$NEED_FORCE" = "false" ]; then
681
- echo "[$(date)] WebUI auto-update: already on latest ($CURRENT_VERSION)"
682
  return
683
  fi
684
 
685
- # Simple version comparison: split by dots and compare
686
  update_needed=false
687
  IFS='.' read -ra C <<< "$CURRENT_NUM"
688
  IFS='.' read -ra L <<< "$LATEST_NUM"
689
  for i in 0 1 2; do
690
  c=${C[$i]:-0}; l=${L[$i]:-0}
691
- if [ "$l" -gt "$c" ] 2>/dev/null; then
692
- update_needed=true; break
693
- elif [ "$l" -lt "$c" ] 2>/dev/null; then
694
- break
695
- fi
696
  done
697
 
698
  if [ "$update_needed" = "false" ] && [ "$NEED_FORCE" = "false" ]; then
699
- echo "[$(date)] WebUI auto-update: current $CURRENT_VERSION is up to date"
700
  return
701
  fi
702
 
703
  if [ "$NEED_FORCE" = "true" ] && [ "$update_needed" = "false" ]; then
704
- echo "[$(date)] WebUI auto-update: re-applying $LATEST_TAG after rebuild (code was reset to $BUILD_VERSION)"
705
  else
706
- echo "[$(date)] WebUI auto-update: upgrading $CURRENT_VERSION $LATEST_TAG ..."
707
  fi
708
 
709
- # Build in temp directory
710
- rm -rf "$BUILD_TMP"
711
- if ! git clone --depth 1 --branch "$LATEST_TAG" "https://github.com/${WEBUI_REPO}.git" "$BUILD_TMP" 2>&1 | tail -3; then
712
- echo "[$(date)] WebUI auto-update: git clone failed, aborting"
713
- rm -rf "$BUILD_TMP"
714
- return
715
- fi
716
 
717
- cd "$BUILD_TMP"
718
- if ! npm install --ignore-scripts 2>&1 | tail -5; then
719
- echo "[$(date)] WebUI auto-update: npm install failed, aborting"
720
  rm -rf "$BUILD_TMP"
721
- return
722
- fi
 
 
 
 
723
 
724
- if ! npm run build 2>&1 | tail -10; then
725
- echo "[$(date)] WebUI auto-update: npm build failed, aborting"
726
- rm -rf "$BUILD_TMP"
727
- return
728
- fi
729
 
730
- # Verify build output exists
731
- if [ ! -d "$BUILD_TMP/dist/server" ] || [ ! -d "$BUILD_TMP/dist/client" ]; then
732
- echo "[$(date)] WebUI auto-update: build output missing, aborting"
733
- rm -rf "$BUILD_TMP"
734
- return
735
- fi
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
736
 
737
  # Hot-swap: kill old WebUI process, replace files, restart
738
- echo "[$(date)] WebUI auto-update: replacing files and restarting BFF..."
739
  OLD_WEBUI_PID=$(pgrep -f "node index.js" 2>/dev/null | head -1)
740
- [ -n "$OLD_WEBUI_PID" ] && kill "$OLD_WEBUI_PID" 2>/dev/null && sleep 1
741
-
742
- # Backup old, install new
743
- cp -r "$BUILD_TMP/dist/server/"* "$WEBUI_INSTALL/" 2>/dev/null
744
- cp -r "$BUILD_TMP/dist/client/"* "$WEBUI_CLIENT/" 2>/dev/null
 
 
 
 
 
 
 
 
745
  cp "$BUILD_TMP/package.json" "$WEBUI_INSTALL/package.json"
746
- cp -rn "$BUILD_TMP/node_modules/"* "$WEBUI_INSTALL/node_modules/" 2>/dev/null
 
 
 
 
747
 
748
  # Save new version
749
  echo "$LATEST_TAG" > "$VERSION_FILE"
@@ -751,19 +785,22 @@ update_webui_background() {
751
 
752
  # Restart WebUI
753
  cd "$WEBUI_INSTALL"
 
 
754
  node index.js >> /data/hermes/logs/webui.log 2>&1 &
755
  NEW_PID=$!
756
- echo "[$(date)] WebUI auto-update: upgraded to $LATEST_TAG (new PID: $NEW_PID)"
757
 
758
  # Verify
759
  sleep 3
760
- if curl -s http://127.0.0.1:6060/health > /dev/null 2>&1; then
761
- echo "[$(date)] WebUI auto-update: $LATEST_TAG is running and healthy"
762
  else
763
- echo "[$(date)] WebUI auto-update: WARNING - new version health check failed"
764
  fi
765
 
766
  rm -rf "$BUILD_TMP"
 
767
  }
768
 
769
  # ── Start hermes-web-ui Node.js BFF server on :6060 ──
 
622
 
623
  # ── Auto-update hermes-web-ui if newer release exists ──
624
  # Runs asynchronously so it doesn't block startup.
625
+ # All output goes to /data/hermes/logs/auto-update.log (not stdout, which gets eaten by exec).
626
  # Set WEBUI_AUTO_UPDATE=false to disable.
627
  update_webui_background() {
628
  [ "${WEBUI_AUTO_UPDATE}" = "false" ] && return
 
634
  WEBUI_INSTALL="/app/webui-server"
635
  WEBUI_CLIENT="/app/webui-client"
636
  API_URL="https://api.github.com/repos/${WEBUI_REPO}/releases/latest"
637
+ UPDATE_LOG="/data/hermes/logs/auto-update.log"
638
+
639
+ # Tee all output to log file for diagnostics
640
+ _log() { echo "[$(date)] $*"; }
641
+ _log_and_tee() { _log "$*" | tee -a "$UPDATE_LOG"; }
642
+
643
+ _log_and_tee "=== WebUI auto-update starting ==="
644
 
645
  # ── Detect rebuild: Dockerfile writes /app/webui.version, persistent is /data/hermes/ ──
646
  BUILD_VERSION="$(cat "$BUILD_VERSION_FILE" 2>/dev/null | head -1)"
 
648
  NEED_FORCE=false
649
 
650
  if [ -n "$BUILD_VERSION" ] && [ -n "$RECORDED_VERSION" ] && [ "$BUILD_VERSION" != "$RECORDED_VERSION" ]; then
651
+ _log_and_tee "REBUILD DETECTED (Dockerfile=$BUILD_VERSION, recorded=$RECORDED_VERSION)"
 
652
  NEED_FORCE=true
 
653
  CURRENT_VERSION="$BUILD_VERSION"
654
  elif [ -n "$RECORDED_VERSION" ]; then
655
  CURRENT_VERSION="$RECORDED_VERSION"
 
660
  echo "$CURRENT_VERSION" > "$VERSION_FILE"
661
  fi
662
 
663
+ _log_and_tee "Checking: current=$CURRENT_VERSION, Dockerfile=$BUILD_VERSION, latest=?"
664
 
665
  # Query GitHub API for latest release tag
666
  LATEST_JSON=$(curl -sf --connect-timeout 10 --max-time 20 "$API_URL" 2>/dev/null)
667
  if [ $? -ne 0 ] || [ -z "$LATEST_JSON" ]; then
668
+ _log_and_tee "ERROR: failed to reach GitHub API, skipping"
669
  return
670
  fi
671
 
672
  LATEST_TAG=$(echo "$LATEST_JSON" | python3 -c "import json,sys; print(json.load(sys.stdin).get('tag_name',''))" 2>/dev/null)
673
  if [ -z "$LATEST_TAG" ]; then
674
+ _log_and_tee "ERROR: could not parse latest tag, skipping"
675
  return
676
  fi
677
 
678
+ _log_and_tee "Latest release: $LATEST_TAG"
679
 
680
+ # Compare versions
681
  CURRENT_NUM="${CURRENT_VERSION#v}"
682
  LATEST_NUM="${LATEST_TAG#v}"
683
 
 
684
  if [ "$CURRENT_NUM" = "$LATEST_NUM" ] && [ "$NEED_FORCE" = "false" ]; then
685
+ _log_and_tee "Already on latest ($CURRENT_VERSION)"
686
  return
687
  fi
688
 
 
689
  update_needed=false
690
  IFS='.' read -ra C <<< "$CURRENT_NUM"
691
  IFS='.' read -ra L <<< "$LATEST_NUM"
692
  for i in 0 1 2; do
693
  c=${C[$i]:-0}; l=${L[$i]:-0}
694
+ if [ "$l" -gt "$c" ] 2>/dev/null; then update_needed=true; break; fi
695
+ if [ "$l" -lt "$c" ] 2>/dev/null; then break; fi
 
 
 
696
  done
697
 
698
  if [ "$update_needed" = "false" ] && [ "$NEED_FORCE" = "false" ]; then
699
+ _log_and_tee "Current $CURRENT_VERSION is up to date"
700
  return
701
  fi
702
 
703
  if [ "$NEED_FORCE" = "true" ] && [ "$update_needed" = "false" ]; then
704
+ _log_and_tee "Re-applying $LATEST_TAG after rebuild (code reset to $BUILD_VERSION)"
705
  else
706
+ _log_and_tee "Upgrading $CURRENT_VERSION -> $LATEST_TAG"
707
  fi
708
 
709
+ # ── Build with retry (2 attempts) ──
710
+ for attempt in 1 2; do
711
+ _log_and_tee "Build attempt $attempt/2..."
 
 
 
 
712
 
713
+ # Clone
 
 
714
  rm -rf "$BUILD_TMP"
715
+ if ! git clone --depth 1 --branch "$LATEST_TAG" "https://github.com/${WEBUI_REPO}.git" "$BUILD_TMP" 2>&1 | tee -a "$UPDATE_LOG" | tail -3; then
716
+ _log_and_tee "ERROR: git clone failed"
717
+ rm -rf "$BUILD_TMP"
718
+ [ "$attempt" -lt 2 ] && sleep 10 && continue
719
+ return
720
+ fi
721
 
722
+ cd "$BUILD_TMP"
 
 
 
 
723
 
724
+ # Install (with timeout)
725
+ _log_and_tee "Running npm install..."
726
+ if ! timeout 120 npm install --ignore-scripts 2>&1 | tee -a "$UPDATE_LOG" | tail -5; then
727
+ _log_and_tee "ERROR: npm install failed/timed out"
728
+ rm -rf "$BUILD_TMP"
729
+ [ "$attempt" -lt 2 ] && sleep 10 && continue
730
+ return
731
+ fi
732
+
733
+ # Rebuild native modules (required by node-pty, matching upstream Dockerfile)
734
+ _log_and_tee "Running npm rebuild node-pty..."
735
+ npm rebuild node-pty 2>&1 | tee -a "$UPDATE_LOG" | tail -5
736
+
737
+ # Build (with memory limit, matching upstream Dockerfile)
738
+ _log_and_tee "Running npm run build (NODE_OPTIONS=--max-old-space-size=4096)..."
739
+ if ! timeout 180 env NODE_OPTIONS=--max-old-space-size=4096 npm run build 2>&1 | tee -a "$UPDATE_LOG" | tail -15; then
740
+ _log_and_tee "ERROR: npm build failed/timed out"
741
+ rm -rf "$BUILD_TMP"
742
+ [ "$attempt" -lt 2 ] && sleep 10 && continue
743
+ return
744
+ fi
745
+
746
+ # Verify build output
747
+ if [ ! -d "$BUILD_TMP/dist/server" ] || [ ! -d "$BUILD_TMP/dist/client" ]; then
748
+ _log_and_tee "ERROR: build output missing (no dist/server or dist/client)"
749
+ ls -la "$BUILD_TMP/dist/" 2>&1 | tee -a "$UPDATE_LOG"
750
+ rm -rf "$BUILD_TMP"
751
+ [ "$attempt" -lt 2 ] && sleep 10 && continue
752
+ return
753
+ fi
754
+
755
+ _log_and_tee "Build succeeded!"
756
+ break # exit retry loop
757
+ done
758
 
759
  # Hot-swap: kill old WebUI process, replace files, restart
760
+ _log_and_tee "Hot-swapping: stopping old WebUI, replacing files..."
761
  OLD_WEBUI_PID=$(pgrep -f "node index.js" 2>/dev/null | head -1)
762
+ if [ -n "$OLD_WEBUI_PID" ]; then
763
+ kill "$OLD_WEBUI_PID" 2>/dev/null
764
+ sleep 2
765
+ # Force kill if still running
766
+ kill -9 "$OLD_WEBUI_PID" 2>/dev/null
767
+ _log_and_tee "Killed old WebUI PID=$OLD_WEBUI_PID"
768
+ fi
769
+
770
+ # Install new files
771
+ rm -rf "$WEBUI_INSTALL" "$WEBUI_CLIENT"
772
+ mkdir -p "$WEBUI_INSTALL" "$WEBUI_CLIENT"
773
+ cp -r "$BUILD_TMP/dist/server/"* "$WEBUI_INSTALL/"
774
+ cp -r "$BUILD_TMP/dist/client/"* "$WEBUI_CLIENT/"
775
  cp "$BUILD_TMP/package.json" "$WEBUI_INSTALL/package.json"
776
+
777
+ # Install production-only node_modules
778
+ cd "$BUILD_TMP"
779
+ npm prune --omit=dev 2>&1 | tail -3
780
+ cp -r node_modules "$WEBUI_INSTALL/node_modules"
781
 
782
  # Save new version
783
  echo "$LATEST_TAG" > "$VERSION_FILE"
 
785
 
786
  # Restart WebUI
787
  cd "$WEBUI_INSTALL"
788
+ export PORT=6060 UPSTREAM=http://127.0.0.1:8642 HERMES_HOME=/root/.hermes
789
+ export AUTH_TOKEN="${AUTH_TOKEN:-hermes-bot-2026}" CORS_ORIGINS="*" NODE_ENV=production
790
  node index.js >> /data/hermes/logs/webui.log 2>&1 &
791
  NEW_PID=$!
792
+ _log_and_tee "WebUI upgraded to $LATEST_TAG (new PID: $NEW_PID)"
793
 
794
  # Verify
795
  sleep 3
796
+ if curl -sf http://127.0.0.1:6060/health > /dev/null 2>&1; then
797
+ _log_and_tee "$LATEST_TAG is running and healthy"
798
  else
799
+ _log_and_tee "WARNING: health check failed after upgrade"
800
  fi
801
 
802
  rm -rf "$BUILD_TMP"
803
+ _log_and_tee "=== WebUI auto-update complete ==="
804
  }
805
 
806
  # ── Start hermes-web-ui Node.js BFF server on :6060 ──