# personalgpt / entrypoint.sh
# PrismML Deploy
# Fix: metrics push (summary not commit_message), crash resilience + watchdog
# Commit: 0a882d1
#!/bin/bash
set -e

echo "Starting Bonsai-demo entrypoint..."

# ── Validate required secrets ─────────────────────────────────────────────────
# Refuse to start without the two mandatory Space secrets; print setup help.
if [ -z "$MODEL_REPO" ] || [ -z "$MODEL_FILE" ]; then
  cat <<'USAGE'

ERROR: MODEL_REPO and MODEL_FILE secrets must be set.
 Go to Space Settings β†’ Repository Secrets and add:
 MODEL_REPO = prism-ml/Bonsai-8B-gguf
 MODEL_FILE = Bonsai-8B.gguf

USAGE
  exit 1
fi

# All GGUF files land here; created up front so downloads have a target.
MODEL_DIR="/app/models"
mkdir -p "$MODEL_DIR"
#######################################
# Download one model file from the Hugging Face Hub into $MODEL_DIR.
# Globals:   MODEL_DIR (read)
# Arguments: $1 - repo id (e.g. prism-ml/Bonsai-8B-gguf)
#            $2 - file name inside the repo
# Outputs:   progress/diagnostics to stdout
# Exits:     terminates the whole script after $retries failed attempts
#######################################
download_model() {
  local repo file path attempt retries=5   # fix: 'attempt' was leaking as a global
  # Secrets pasted into the Space UI often carry stray whitespace — strip it.
  repo=$(echo "$1" | tr -d '[:space:]')
  file=$(echo "$2" | tr -d '[:space:]')
  path="$MODEL_DIR/$file"
  local url="https://huggingface.co/$repo/resolve/main/$file"
  for attempt in $(seq 1 "$retries"); do
    echo "Downloading (attempt $attempt/$retries): $url"
    # Fix: the old code ran `rm -f "$path"` before every attempt, which made
    # the `-C -` resume flag a no-op and restarted multi-GB transfers from
    # byte 0. Keep the partial file so curl can resume where it left off.
    if curl -fL --retry 3 --retry-delay 5 -C - -o "$path" "$url" 2>&1; then
      echo "Downloaded: $(ls -lh "$path")"
      return 0
    fi
    echo "Download failed, retrying in 10s..."
    sleep 10
  done
  rm -f "$path"   # don't leave a possibly-corrupt partial behind on hard failure
  echo "ERROR: Failed to download $url after $retries attempts"
  exit 1
}
# Primary model (required) — strip any whitespace pasted into the secrets.
MODEL_REPO=$(printf '%s' "$MODEL_REPO" | tr -d '[:space:]')
MODEL_FILE=$(printf '%s' "$MODEL_FILE" | tr -d '[:space:]')
download_model "$MODEL_REPO" "$MODEL_FILE"

# Additional models (optional): MODEL_REPO_B/MODEL_FILE_B and the _C pair.
# Both halves of a pair must be non-empty for the download to happen.
MODELS_MAX=1
for suffix in B C; do
  extra_repo_var="MODEL_REPO_${suffix}"
  extra_file_var="MODEL_FILE_${suffix}"
  extra_repo=$(tr -d '[:space:]' <<<"${!extra_repo_var:-}")
  extra_file=$(tr -d '[:space:]' <<<"${!extra_file_var:-}")
  if [[ -n "$extra_repo" && -n "$extra_file" ]]; then
    download_model "$extra_repo" "$extra_file"
    MODELS_MAX=$((MODELS_MAX + 1))
  fi
done
# ── Dashboard auth ────────────────────────────────────────────────────────────
# Write an htpasswd file for the nginx-protected dashboard path.
if [ -n "$DASHBOARD_KEY" ]; then
  # Fix: feed the secret via stdin instead of argv so it is never visible in
  # `ps` output or /proc/*/cmdline while openssl runs.
  HASH=$(printf '%s\n' "$DASHBOARD_KEY" | openssl passwd -apr1 -stdin)
  echo "admin:$HASH" > /tmp/.htpasswd
  echo "Dashboard auth: enabled (user=admin)"
else
  echo "WARNING: DASHBOARD_KEY not set, /dash-2e215f981f3f is unprotected"
  # NOTE(review): '$apr1$open$open' is not a well-formed apr1 hash of any
  # password, so if nginx actually enforces this file nothing would match —
  # confirm nginx.conf disables auth on this path when the key is unset.
  printf 'admin:$apr1$open$open\n' > /tmp/.htpasswd
fi

# ── nginx temp dirs ───────────────────────────────────────────────────────────
# nginx runs unprivileged here, so its scratch dirs must live under /tmp.
mkdir -p /tmp/nginx-{client-body,proxy,fastcgi,uwsgi,scgi}
# ── Detect GPUs and start one llama-server per GPU ───────────────────────────
# Count "GPU n: ..." lines from nvidia-smi -L.
# Fix: the old `nvidia-smi -L | wc -l || echo 1` fallback never fired, because
# `wc -l` succeeds (printing 0) even when nvidia-smi is missing — GPU_COUNT
# became 0 and `seq 0 -1` started zero backends. `grep -c` exits non-zero on
# zero matches, so the || fallback actually engages.
GPU_COUNT=$(nvidia-smi -L 2>/dev/null | grep -c '^GPU') || GPU_COUNT=1
echo "GPUs detected: $GPU_COUNT"

BACKENDS=""
for i in $(seq 0 $((GPU_COUNT - 1))); do
  PORT=$((7861 + i))
  echo "Starting llama-server on GPU $i β†’ port $PORT"
  # One backend per GPU, pinned via CUDA_VISIBLE_DEVICES; nginx load-balances
  # across them with least_conn (upstream block written below).
  CUDA_VISIBLE_DEVICES=$i /app/bin/llama-server \
    -m "$MODEL_DIR/$MODEL_FILE" \
    --host 127.0.0.1 \
    --port "$PORT" \
    -ngl 99 \
    -fa on \
    -np 4 \
    -c 262144 \
    --metrics \
    --temp 0.5 --top-p 0.85 --top-k 20 --min-p 0 \
    --reasoning-budget 0 --reasoning-format none \
    --chat-template-kwargs '{"enable_thinking": false}' \
    --log-disable &
  BACKENDS="${BACKENDS} server 127.0.0.1:$PORT;\n"
done

# %b expands the \n escapes stored in BACKENDS; keeping the variable out of
# the printf FORMAT string avoids accidental %-directive interpretation.
printf 'upstream llama_backends {\n least_conn;\n%b}\n' "$BACKENDS" > /tmp/nginx-upstream.conf
# ── Write stub files so /gpu and /analytics never 404 before first tick ──────
# Placeholder payloads; the metrics pusher overwrites each on its first cycle.
printf '%s\n' '{"ts":null,"gpus":[]}' > /tmp/gpu-stats.json
printf '%s\n' '# waiting for first metrics scrape...' > /tmp/llama-metrics.txt
printf '%s\n' '{"updated_at":null,"summary_24h":{"requests":0,"unique_users":0},"summary_7d":{"requests":0,"unique_users":0},"summary_total":{"requests":0,"unique_users":0},"requests_by_hour":[],"requests_by_day":[],"top_users":[]}' > /tmp/analytics.json
# ── Start metrics pusher with watchdog ────────────────────────────────────────
# Supervision loop: relaunch metrics_pusher.py whenever it exits, forever.
# `|| true` keeps a crash of the pusher from tripping the script's `set -e`.
start_metrics_pusher() {
  local restart_delay=5
  while :; do
    echo "[watchdog] Starting metrics_pusher.py..."
    python3 /app/metrics_pusher.py || true
    echo "[watchdog] metrics_pusher.py exited β€” restarting in 5s..."
    sleep "$restart_delay"
  done
}
# Run the metrics watchdog in the background for the container's lifetime.
start_metrics_pusher &
echo ""
echo "=== Bonsai-demo ==="
# List every downloaded .gguf basename, comma-joined (trailing comma stripped).
echo " Models: $(ls "$MODEL_DIR"/*.gguf | xargs -n1 basename | tr '\n' ', ' | sed 's/,$//')"
echo " GPUs: $GPU_COUNT"
echo " Port: 7860 (nginx β†’ llama-server)"
echo ""
# exec replaces this shell with nginx, so nginx inherits the script's PID and
# receives termination signals directly (no bash process left in between).
exec nginx -c /app/nginx.conf