#!/bin/bash
# Launch script: download the BitNet GGUF model if missing, start the
# bitnet.cpp llama-server in the background, wait for it to become healthy,
# then exec the Gradio front-end (which becomes PID of this shell).
set -euo pipefail

readonly MODEL_DIR="/home/user/app/models"
readonly MODEL_PATH="$MODEL_DIR/ggml-model-i2_s.gguf"
readonly SERVER_BIN="/home/user/app/llama-server"
readonly SERVER_HOST="127.0.0.1"
readonly SERVER_PORT="8080"
readonly HEALTH_URL="http://${SERVER_HOST}:${SERVER_PORT}/health"
readonly MAX_WAIT=120   # seconds to wait for the server health endpoint

# Download model if not present (runtime download to avoid build timeout).
if [[ ! -f "$MODEL_PATH" ]]; then
  echo "Downloading BitNet b1.58 2B4T GGUF model (1.1 GB)..."
  # Pass MODEL_DIR through the environment instead of interpolating it into
  # the Python source: safe even if the path contains quotes or backslashes.
  MODEL_DIR="$MODEL_DIR" python - <<'PY'
import os
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id='microsoft/bitnet-b1.58-2B-4T-gguf',
    filename='ggml-model-i2_s.gguf',
    local_dir=os.environ['MODEL_DIR'],
)
print(f'Downloaded to: {path}')
PY
  echo "Model downloaded!"
fi

# Start llama-server in background.
echo "Starting bitnet.cpp llama-server..."
"$SERVER_BIN" \
  -m "$MODEL_PATH" \
  --host "$SERVER_HOST" \
  --port "$SERVER_PORT" \
  -t 2 \
  -c 4096 \
  --log-disable &
SERVER_PID=$!

# Wait for the server to answer /health. Fail fast if the server process
# dies during startup instead of sleeping out the whole timeout.
echo "Waiting for server to start..."
for (( i = 1; i <= MAX_WAIT; i++ )); do
  if ! kill -0 "$SERVER_PID" 2>/dev/null; then
    echo "ERROR: llama-server exited during startup" >&2
    exit 1
  fi
  # -f: treat HTTP errors (e.g. 500 while loading) as not-ready.
  if curl -sf "$HEALTH_URL" > /dev/null 2>&1; then
    echo "Server ready! (took ${i}s)"
    break
  fi
  if (( i == MAX_WAIT )); then
    echo "ERROR: Server failed to start after ${MAX_WAIT}s" >&2
    exit 1
  fi
  sleep 1
done

# Replace this shell with the Gradio app; llama-server keeps running in the
# background as its sibling process.
echo "Starting Gradio app..."
exec python app.py