"""Gradio chat front-end for ZAYA1-8B, served through a locally built llama-cli.

At import time this script clones and compiles the Zyphra llama.cpp fork,
downloads the quantized GGUF weights from the Hugging Face Hub, and builds the
Gradio UI.  Each chat request spawns one ``llama-cli`` process.
"""

import os
import json
import subprocess
import sys

import gradio as gr
from huggingface_hub import hf_hub_download


# ── Build llama.cpp with ZAYA1 Support (OPTIMIZED) ──────────────────────────
def _run_build_step(label, cmd, timeout=None, fatal=True):
    """Run one build command and return its ``CompletedProcess``.

    A timeout or a missing executable always terminates the script with a
    readable message instead of a traceback.  A non-zero exit status
    terminates the script only when ``fatal`` is true; otherwise the caller
    decides what to do with the result.
    """
    try:
        completed = subprocess.run(
            cmd, capture_output=True, text=True, timeout=timeout
        )
    except subprocess.TimeoutExpired:
        print(f"❌ {label} timed out after {timeout}s")
        sys.exit(1)
    except OSError as exc:
        # Typically: the executable (git / cmake) is not installed.
        print(f"❌ {label} failed: {exc}")
        sys.exit(1)
    if fatal and completed.returncode != 0:
        # The tail of stderr is where the actual error usually is.
        print(f"❌ {label} failed: {completed.stderr[-2000:]}")
        sys.exit(1)
    return completed


print("🔨 Building llama.cpp with ZAYA1/CCA support...")
build_dir = "/tmp/llama-cpp-build"
source_dir = f"{build_dir}/llama.cpp"
cmake_dir = f"{source_dir}/build"
os.makedirs(build_dir, exist_ok=True)

if not os.path.exists(source_dir):
    print("📦 Cloning Zyphra/llama.cpp fork...")
    _run_build_step(
        "Clone",
        [
            "git", "clone", "--depth", "1",
            "https://github.com/Zyphra/llama.cpp.git",
            source_dir,
        ],
    )
    print("✅ Clone complete")

print("🔨 Compiling with CMake (30 minutes max)...")
os.makedirs(cmake_dir, exist_ok=True)
# The original script runs the rest of its life from the CMake build
# directory; keep that so relative paths behave the same.
os.chdir(cmake_dir)

cpu_count = os.cpu_count() or 4

print("  📋 Configuring...")
configure = _run_build_step(
    "Configure",
    [
        "cmake", "..",
        "-DCMAKE_BUILD_TYPE=Release",
        "-DCMAKE_CXX_FLAGS=-O3",
        "-DCMAKE_C_FLAGS=-O3",
        "-DBUILD_SHARED_LIBS=OFF",
        "-DLLAMA_CUDA=OFF",
        "-DLLAMA_BLAS=OFF",
    ],
    timeout=300,
)

print(f"  ⚙️ Building with {cpu_count} cores...")
# A non-zero exit is not fatal on its own: only the llama-cli binary is
# needed, so success is judged by whether that binary exists.
result = _run_build_step(
    "Build",
    ["cmake", "--build", ".", "-j", str(cpu_count)],
    timeout=1800,
    fatal=False,
)

llama_cli_path = f"{cmake_dir}/bin/llama-cli"
if not os.path.exists(llama_cli_path):
    # Fallback location when the binary is not placed under bin/.
    llama_cli_path = f"{cmake_dir}/llama-cli"

if not os.path.exists(llama_cli_path):
    print("❌ Build failed!")
    if result.stderr:
        print(result.stderr[-2000:])
    sys.exit(1)

print("✅ llama-cli built successfully!")
os.chmod(llama_cli_path, 0o755)

# ── Download GGUF ────────────────────────────────────────────────────────────
GGUF_REPO_ID = "Abiray/ZAYA1-8B-GGUF"
GGUF_FILE = "ZAYA1-8B-Q4_K_M.gguf"

print("📥 Downloading ZAYA1-8B (~5.5GB)...")
model_path = hf_hub_download(repo_id=GGUF_REPO_ID, filename=GGUF_FILE)
print("✅ Ready!")

# ── Gradio Server ────────────────────────────────────────────────────────────
app = gr.Blocks(title="ZAYA1-8B")

HEADER_MARKDOWN = """
# 🔥 ZAYA1-8B Reasoning Model

**Model:** ZAYA1-8B-Q4_K_M (5.57GB)

**Architecture:** Sparse MoE + CCA

⏱️ **Speed:** 1-5 minutes per response
"""

# Prompt label used for each role of a dict-style history entry.
_ROLE_LABELS = {"assistant": "Assistant", "system": "System"}


# ── Generate Function ────────────────────────────────────────────────────────
def _history_to_prompt_parts(history):
    """Turn chat history into ``"Speaker: text\\n"`` prompt fragments.

    Two history shapes are accepted:

    * ``[user_text, assistant_text]`` pairs, which is what ``handle_submit``
      appends;
    * ``{"role": ..., "content": ...}`` dicts.

    Empty or ``None`` parts are skipped, and entries of any other shape are
    ignored rather than raising.
    """
    parts = []
    for turn in history or []:
        if isinstance(turn, dict):
            content = turn.get("content")
            if content:
                speaker = _ROLE_LABELS.get(turn.get("role"), "User")
                parts.append(f"{speaker}: {content}\n")
        elif isinstance(turn, (list, tuple)) and len(turn) == 2:
            user_text, assistant_text = turn
            if user_text:
                parts.append(f"User: {user_text}\n")
            if assistant_text:
                parts.append(f"Assistant: {assistant_text}\n")
    return parts


def generate_response(message, history, system_prompt, temperature, top_p, max_tokens):
    """Run llama-cli once for ``message`` and return the generated text.

    Args:
        message: The new user message.
        history: Previous turns, as pairs or role/content dicts (may be
            empty or ``None``).
        system_prompt: Optional text placed first in the prompt.
        temperature: Sampling temperature passed to ``--temp``.
        top_p: Nucleus sampling value passed to ``--top-p``.
        max_tokens: Maximum number of tokens to generate (``-n``).

    Returns:
        The stripped stdout of llama-cli, or a short human-readable error
        string; this function does not raise on subprocess failures.
    """
    if not message or not message.strip():
        return "Please enter a message."

    prompt_parts = []
    if system_prompt:
        prompt_parts.append(f"System: {system_prompt}\n")
    prompt_parts.extend(_history_to_prompt_parts(history))
    prompt_parts.append(f"User: {message}\nAssistant:")
    full_prompt = "".join(prompt_parts)

    cmd = [
        llama_cli_path,
        "-m", model_path,
        "-p", full_prompt,
        # Sliders can hand back floats; "-n" needs a plain integer.
        "-n", str(int(max_tokens)),
        "-t", "2",
        "-c", "2048",
        "--mmap",
        "--temp", str(temperature),
        "--top-p", str(top_p),
        "--no-display-prompt",
    ]

    try:
        result = subprocess.run(
            cmd, capture_output=True, text=True, timeout=600
        )
    except subprocess.TimeoutExpired:
        return "Timeout"
    except Exception as e:
        # UI boundary: report the problem in the chat instead of crashing
        # the request handler.
        return f"Error: {str(e)}"

    if result.returncode != 0:
        # Show the tail of stderr, since the head is typically startup
        # logging rather than the actual failure.
        return f"Error: {result.stderr[-300:]}"

    output = result.stdout.strip()
    return output if output else "No output generated"


def handle_submit(message, chat_history, system_prompt, temperature, top_p, max_tokens):
    """Gradio callback: answer ``message`` and append the exchange to the chat.

    Returns the updated history and an empty string that clears the input
    box.  Blank messages leave the history untouched.
    """
    # The history may be None before the first exchange.
    chat_history = list(chat_history or [])
    if not message or not message.strip():
        return chat_history, ""

    response = generate_response(
        message, chat_history, system_prompt, temperature, top_p, max_tokens
    )
    chat_history.append([message, response])
    return chat_history, ""


# ── Gradio UI ────────────────────────────────────────────────────────────────
with app:
    gr.Markdown(HEADER_MARKDOWN)

    with gr.Row():
        with gr.Column():
            chatbot = gr.Chatbot(label="Chat", height=400)
            message_input = gr.Textbox(label="Message", lines=2)
            submit_btn = gr.Button("Send", variant="primary")

        with gr.Column():
            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are ZAYA1-8B, a highly capable reasoning assistant built by Zyphra.",
                lines=3,
            )
            temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, value=1.0, step=0.1)
            top_p = gr.Slider(label="Top-P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
            max_tokens = gr.Slider(label="Max Tokens", minimum=100, maximum=2048, value=512, step=100)

    # The button click and pressing Enter in the textbox share one handler.
    submit_btn.click(
        handle_submit,
        inputs=[message_input, chatbot, system_prompt, temperature, top_p, max_tokens],
        outputs=[chatbot, message_input],
    )

    message_input.submit(
        handle_submit,
        inputs=[message_input, chatbot, system_prompt, temperature, top_p, max_tokens],
        outputs=[chatbot, message_input],
    )

if __name__ == "__main__":
    print("\n" + "=" * 70)
    print("🚀 ZAYA1-8B is launching...")
    print("=" * 70 + "\n")
    app.launch(show_error=True)