#!/bin/bash
# Launch script: download the BitNet GGUF model if missing, start the
# bitnet.cpp llama-server in the background, wait for it to become healthy,
# then exec the Gradio front-end (which becomes PID of this shell).
set -euo pipefail

readonly MODEL_DIR="/home/user/app/models"
readonly MODEL_PATH="$MODEL_DIR/ggml-model-i2_s.gguf"
readonly SERVER_BIN="/home/user/app/llama-server"
readonly SERVER_HOST="127.0.0.1"
readonly SERVER_PORT="8080"
readonly HEALTH_URL="http://${SERVER_HOST}:${SERVER_PORT}/health"
readonly MAX_WAIT=120   # seconds to wait for the server health endpoint

# Download model if not present (runtime download to avoid build timeout).
if [[ ! -f "$MODEL_PATH" ]]; then
  echo "Downloading BitNet b1.58 2B4T GGUF model (1.1 GB)..."
  # Pass MODEL_DIR through the environment instead of interpolating it into
  # the Python source: safe even if the path contains quotes or backslashes.
  MODEL_DIR="$MODEL_DIR" python - <<'PY'
import os
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id='microsoft/bitnet-b1.58-2B-4T-gguf',
    filename='ggml-model-i2_s.gguf',
    local_dir=os.environ['MODEL_DIR'],
)
print(f'Downloaded to: {path}')
PY
  echo "Model downloaded!"
fi

# Start llama-server in background.
echo "Starting bitnet.cpp llama-server..."
"$SERVER_BIN" \
  -m "$MODEL_PATH" \
  --host "$SERVER_HOST" \
  --port "$SERVER_PORT" \
  -t 2 \
  -c 4096 \
  --log-disable &
SERVER_PID=$!

# Wait for the server to answer /health. Fail fast if the server process
# dies during startup instead of sleeping out the whole timeout.
echo "Waiting for server to start..."
for (( i = 1; i <= MAX_WAIT; i++ )); do
  if ! kill -0 "$SERVER_PID" 2>/dev/null; then
    echo "ERROR: llama-server exited during startup" >&2
    exit 1
  fi
  # -f: treat HTTP errors (e.g. 500 while loading) as not-ready.
  if curl -sf "$HEALTH_URL" > /dev/null 2>&1; then
    echo "Server ready! (took ${i}s)"
    break
  fi
  if (( i == MAX_WAIT )); then
    echo "ERROR: Server failed to start after ${MAX_WAIT}s" >&2
    exit 1
  fi
  sleep 1
done

# Replace this shell with the Gradio app; llama-server keeps running in the
# background as its sibling process.
echo "Starting Gradio app..."
exec python app.py