import os
import json
import subprocess
import sys
import gradio as gr
from huggingface_hub import hf_hub_download

# ── Build llama.cpp with ZAYA1 Support (OPTIMIZED) ──────────────────────────

print("🔨 Building llama.cpp with ZAYA1/CCA support...")

build_dir = "/tmp/llama-cpp-build"
os.makedirs(build_dir, exist_ok=True)

if not os.path.exists(f"{build_dir}/llama.cpp"):
    print("📦 Cloning Zyphra/llama.cpp fork...")
    result = subprocess.run([
        "git", "clone", 
        "--depth", "1",
        "https://github.com/Zyphra/llama.cpp.git",
        f"{build_dir}/llama.cpp"
    ], capture_output=True, text=True)
    
    if result.returncode != 0:
        print(f"❌ Clone failed: {result.stderr[:500]}")
        sys.exit(1)
    print("✅ Clone complete")

os.chdir(f"{build_dir}/llama.cpp")
print("🔨 Compiling with CMake (30 minutes max)...")

os.makedirs("build", exist_ok=True)
os.chdir("build")

cpu_count = os.cpu_count() or 4

print("   📋 Configuring...")
configure = subprocess.run(
    [
        "cmake", "..",
        "-DCMAKE_BUILD_TYPE=Release",
        "-DCMAKE_CXX_FLAGS=-O3",
        "-DCMAKE_C_FLAGS=-O3",
        "-DBUILD_SHARED_LIBS=OFF",
        "-DLLAMA_CUDA=OFF",
        "-DLLAMA_BLAS=OFF",
    ],
    capture_output=True,
    text=True,
    timeout=300
)

print(f"   ⚙️  Building with {cpu_count} cores...")
result = subprocess.run(
    ["cmake", "--build", ".", "-j", str(cpu_count)],
    capture_output=True,
    text=True,
    timeout=1800
)

llama_cli_path = f"{build_dir}/llama.cpp/build/bin/llama-cli"
if not os.path.exists(llama_cli_path):
    llama_cli_path = f"{build_dir}/llama.cpp/build/llama-cli"
    if not os.path.exists(llama_cli_path):
        print(f"❌ Build failed!")
        sys.exit(1)

print(f"✅ llama-cli built successfully!")
os.chmod(llama_cli_path, 0o755)

# ── Download GGUF ────────────────────────────────────────────────────────────

GGUF_REPO_ID = "Abiray/ZAYA1-8B-GGUF"
GGUF_FILE = "ZAYA1-8B-Q4_K_M.gguf"

print(f"📥 Downloading ZAYA1-8B (~5.5GB)...")
model_path = hf_hub_download(repo_id=GGUF_REPO_ID, filename=GGUF_FILE)
print(f"✅ Ready!")

# ── Gradio Server ────────────────────────────────────────────────────────────

app = gr.Blocks(title="ZAYA1-8B")

# ── Generate Function ────────────────────────────────────────────────────────

def generate_response(message, history, system_prompt, temperature, top_p, max_tokens):
    if not message.strip():
        return "Please enter a message."
    
    prompt_parts = []
    
    if system_prompt:
        prompt_parts.append(f"System: {system_prompt}\n")
    
    if history:
        for turn in history:
            prompt_parts.append(f"User: {turn['content']}\n")
    
    prompt_parts.append(f"User: {message}\nAssistant:")
    full_prompt = "".join(prompt_parts)
    
    cmd = [
        llama_cli_path,
        "-m", model_path,
        "-p", full_prompt,
        "-n", str(max_tokens),
        "-t", "2",
        "-c", "2048",
        "--mmap",
        "--temp", str(temperature),
        "--top-p", str(top_p),
        "--no-display-prompt"
    ]
    
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=600
        )
        
        if result.returncode != 0:
            return f"Error: {result.stderr[:300]}"
        
        output = result.stdout.strip()
        return output if output else "No output generated"
        
    except subprocess.TimeoutExpired:
        return "Timeout"
    except Exception as e:
        return f"Error: {str(e)}"

# ── Gradio UI ────────────────────────────────────────────────────────────────

with app:
    gr.Markdown("""
    # 🔥 ZAYA1-8B Reasoning Model
    
    **Model:** ZAYA1-8B-Q4_K_M (5.57GB)  
    **Architecture:** Sparse MoE + CCA  
    ⏱️ **Speed:** 1-5 minutes per response
    """)
    
    with gr.Row():
        with gr.Column():
            chatbot = gr.Chatbot(label="Chat", height=400)
            message_input = gr.Textbox(label="Message", lines=2)
            submit_btn = gr.Button("Send", variant="primary")
        
        with gr.Column():
            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are ZAYA1-8B, a highly capable reasoning assistant built by Zyphra.",
                lines=3
            )
            temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, value=1.0, step=0.1)
            top_p = gr.Slider(label="Top-P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
            max_tokens = gr.Slider(label="Max Tokens", minimum=100, maximum=2048, value=512, step=100)
    
    def handle_submit(message, chat_history, system_prompt, temperature, top_p, max_tokens):
        if not message.strip():
            return chat_history, ""
        
        response = generate_response(
            message,
            chat_history,
            system_prompt,
            temperature,
            top_p,
            max_tokens
        )
        
        chat_history.append([message, response])
        return chat_history, ""
    
    submit_btn.click(
        handle_submit,
        inputs=[message_input, chatbot, system_prompt, temperature, top_p, max_tokens],
        outputs=[chatbot, message_input]
    )
    
    message_input.submit(
        handle_submit,
        inputs=[message_input, chatbot, system_prompt, temperature, top_p, max_tokens],
        outputs=[chatbot, message_input]
    )

if __name__ == "__main__":
    print("\n" + "="*70)
    print("🚀 ZAYA1-8B is launching...")
    print("="*70 + "\n")
    app.launch(show_error=True)