# Spaces: Running
import json
import os
import subprocess
import sys

import gradio as gr
from huggingface_hub import hf_hub_download
# ── Build llama.cpp with ZAYA1 Support (OPTIMIZED) ──────────────────────────
# Import-time setup: clone the Zyphra fork of llama.cpp into /tmp (skipped when
# the checkout already exists), configure a static Release build with CUDA and
# BLAS switched off, compile it, and locate the resulting llama-cli binary.
# The process exits with status 1 if the clone fails or no binary is produced.
print("🔨 Building llama.cpp with ZAYA1/CCA support...")
build_dir = "/tmp/llama-cpp-build"
os.makedirs(build_dir, exist_ok=True)


def _run_build_step(label, cmd, timeout):
    """Run one build command, reporting (not raising) any failure.

    Args:
        label: Human-readable step name used in log messages.
        cmd: Argument list passed to ``subprocess.run``.
        timeout: Maximum run time in seconds.

    Returns:
        The ``subprocess.CompletedProcess`` on completion (even with a
        non-zero exit code), or ``None`` if the command timed out or could
        not be started.
    """
    try:
        completed = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        print(f"⚠️ {label} timed out after {timeout}s")
        return None
    except OSError as exc:
        # Typically FileNotFoundError when the tool is not installed.
        print(f"⚠️ {label} could not be started: {exc}")
        return None
    if completed.returncode != 0:
        # The tail of stderr is where compiler/CMake errors end up.
        print(
            f"⚠️ {label} exited with code {completed.returncode}:\n"
            f"{completed.stderr[-1000:]}"
        )
    return completed


if not os.path.exists(f"{build_dir}/llama.cpp"):
    print("📦 Cloning Zyphra/llama.cpp fork...")
    result = subprocess.run(
        [
            "git", "clone",
            "--depth", "1",
            "https://github.com/Zyphra/llama.cpp.git",
            f"{build_dir}/llama.cpp",
        ],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        print(f"❌ Clone failed: {result.stderr[:500]}")
        sys.exit(1)
    print("✅ Clone complete")

# The working directory is deliberately left inside the build tree, as before.
os.chdir(f"{build_dir}/llama.cpp")
print("🔨 Compiling with CMake (30 minutes max)...")
os.makedirs("build", exist_ok=True)
os.chdir("build")
cpu_count = os.cpu_count() or 4

print(" 📋 Configuring...")
configure = _run_build_step(
    "CMake configure",
    [
        "cmake", "..",
        "-DCMAKE_BUILD_TYPE=Release",
        "-DCMAKE_CXX_FLAGS=-O3",
        "-DCMAKE_C_FLAGS=-O3",
        "-DBUILD_SHARED_LIBS=OFF",
        "-DLLAMA_CUDA=OFF",
        "-DLLAMA_BLAS=OFF",
    ],
    timeout=300,
)

print(f" ⚙️ Building with {cpu_count} cores...")
result = _run_build_step(
    "CMake build",
    ["cmake", "--build", ".", "-j", str(cpu_count)],
    timeout=1800,
)

# Success is judged by the binary existing, not by cmake's exit code: an
# unrelated target failing must not block the app when llama-cli was built.
llama_cli_path = f"{build_dir}/llama.cpp/build/bin/llama-cli"
if not os.path.exists(llama_cli_path):
    llama_cli_path = f"{build_dir}/llama.cpp/build/llama-cli"
if not os.path.exists(llama_cli_path):
    print("❌ Build failed!")
    sys.exit(1)
print("✅ llama-cli built successfully!")
os.chmod(llama_cli_path, 0o755)
# ── Download GGUF ────────────────────────────────────────────────────────────
# Hugging Face Hub coordinates of the quantized model file.
GGUF_REPO_ID = "Abiray/ZAYA1-8B-GGUF"
GGUF_FILE = "ZAYA1-8B-Q4_K_M.gguf"

print("📥 Downloading ZAYA1-8B (~5.5GB)...")
# hf_hub_download returns the local filesystem path of the fetched file.
model_path = hf_hub_download(
    repo_id=GGUF_REPO_ID,
    filename=GGUF_FILE,
)
print("✅ Ready!")

# ── Gradio Server ────────────────────────────────────────────────────────────
# Root Blocks container; the UI is attached to it further down the file.
app = gr.Blocks(title="ZAYA1-8B")
# ── Generate Function ────────────────────────────────────────────────────────
def _format_history(history):
    """Render earlier chat turns as ``User:`` / ``Assistant:`` transcript lines.

    Accepts both chatbot history layouts: ``[user, assistant]`` pairs and
    ``{"role": ..., "content": ...}`` dicts. Entries whose text is missing or
    not a string are skipped.
    """
    lines = []
    for turn in history or []:
        if isinstance(turn, dict):
            speaker = "Assistant" if turn.get("role") == "assistant" else "User"
            labelled = [(speaker, turn.get("content"))]
        elif isinstance(turn, (list, tuple)):
            labelled = list(zip(("User", "Assistant"), turn))
        else:
            continue
        for speaker, text in labelled:
            if isinstance(text, str) and text:
                lines.append(f"{speaker}: {text}\n")
    return lines


def generate_response(message, history, system_prompt, temperature, top_p, max_tokens):
    """Run llama-cli once on a plain-text transcript and return its output.

    Args:
        message: The new user message.
        history: Earlier turns, as ``[user, assistant]`` pairs or as
            role/content dicts; may be empty or ``None``.
        system_prompt: Optional text placed first as a ``System:`` line.
        temperature: Sampling temperature passed as ``--temp``.
        top_p: Nucleus-sampling threshold passed as ``--top-p``.
        max_tokens: Maximum number of tokens to generate (``-n``).

    Returns:
        The stripped stdout of llama-cli, or a short human-readable string
        when the message is empty, the process fails, or it times out.
    """
    if not message or not message.strip():
        return "Please enter a message."

    prompt_parts = []
    if system_prompt:
        prompt_parts.append(f"System: {system_prompt}\n")
    prompt_parts.extend(_format_history(history))
    prompt_parts.append(f"User: {message}\nAssistant:")
    full_prompt = "".join(prompt_parts)

    cmd = [
        llama_cli_path,
        "-m", model_path,
        "-p", full_prompt,
        # Sliders may hand back floats; "-n" expects an integer.
        "-n", str(int(max_tokens)),
        "-t", "2",
        "-c", "2048",
        "--mmap",
        "--temp", str(temperature),
        "--top-p", str(top_p),
        "--no-display-prompt",
    ]

    try:
        result = subprocess.run(
            cmd,
            # Never let the child wait on the server's stdin.
            stdin=subprocess.DEVNULL,
            capture_output=True,
            text=True,
            timeout=600,
        )
    except subprocess.TimeoutExpired:
        return "Timeout"
    except (OSError, ValueError) as e:
        # OSError: binary missing or not executable.
        # ValueError: invalid argument (e.g. NUL byte) or undecodable output.
        return f"Error: {str(e)}"

    if result.returncode != 0:
        # The tail of stderr holds the actual error; the head is load logging.
        return f"Error: {result.stderr[-300:]}"
    output = result.stdout.strip()
    return output if output else "No output generated"
# ── Gradio UI ────────────────────────────────────────────────────────────────
# Two-column layout: chat widgets on the left, generation settings on the
# right. Both the Send button and Enter in the textbox trigger handle_submit.
with app:
    gr.Markdown("""
    # 🔥 ZAYA1-8B Reasoning Model
    **Model:** ZAYA1-8B-Q4_K_M (5.57GB)
    **Architecture:** Sparse MoE + CCA
    ⏱️ **Speed:** 1-5 minutes per response
    """)
    with gr.Row():
        with gr.Column():
            chatbot = gr.Chatbot(label="Chat", height=400)
            message_input = gr.Textbox(label="Message", lines=2)
            submit_btn = gr.Button("Send", variant="primary")
        with gr.Column():
            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are ZAYA1-8B, a highly capable reasoning assistant built by Zyphra.",
                lines=3,
            )
            temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, value=1.0, step=0.1)
            top_p = gr.Slider(label="Top-P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
            max_tokens = gr.Slider(label="Max Tokens", minimum=100, maximum=2048, value=512, step=100)

    def handle_submit(message, chat_history, system_prompt, temperature, top_p, max_tokens):
        """Generate a reply and append the exchange to the chat history.

        Args:
            message: Text from the message box.
            chat_history: Current chatbot value; may be ``None`` or empty.
            system_prompt, temperature, top_p, max_tokens: Generation
                settings forwarded unchanged to ``generate_response``.

        Returns:
            ``(updated_history, "")`` — the new chatbot value and an empty
            string that clears the message box.
        """
        # Work on a copy so the value handed in by Gradio is never mutated.
        chat_history = list(chat_history or [])
        if not message or not message.strip():
            return chat_history, ""

        # generate_response reads turn["content"], so expand each
        # [user, assistant] pair into role/content dicts before calling it.
        prior_turns = []
        for turn in chat_history:
            if isinstance(turn, dict):
                prior_turns.append(turn)
            elif isinstance(turn, (list, tuple)):
                for role, text in zip(("user", "assistant"), turn):
                    if text is not None:
                        prior_turns.append({"role": role, "content": text})

        response = generate_response(
            message,
            prior_turns,
            system_prompt,
            temperature,
            top_p,
            max_tokens,
        )
        chat_history.append([message, response])
        return chat_history, ""

    # Same handler, inputs and outputs for the button click and Enter key.
    for register in (submit_btn.click, message_input.submit):
        register(
            handle_submit,
            inputs=[message_input, chatbot, system_prompt, temperature, top_p, max_tokens],
            outputs=[chatbot, message_input],
        )
if __name__ == "__main__":
    # Print a banner framed by blank lines, then start the Gradio server.
    rule = "=" * 70
    print(f"\n{rule}")
    print("🚀 ZAYA1-8B is launching...")
    print(f"{rule}\n")
    app.launch(show_error=True)