# ZAYA1-8B / app.py
# janhetzler's picture
# Update app.py
# 935ffbb verified
import os
import json
import subprocess
import sys
import gradio as gr
from huggingface_hub import hf_hub_download
# ── Build llama.cpp with ZAYA1 Support (OPTIMIZED) ──────────────────────────
# Runs at import time: clones the Zyphra llama.cpp fork (only if it is not
# already present under build_dir), configures it with CMake and compiles it.
# The script exits with status 1 if the clone fails, if configure fails or
# times out, if the build times out, or if no llama-cli binary is produced.
# On success `llama_cli_path` points at the executable.
print("🔨 Building llama.cpp with ZAYA1/CCA support...")
build_dir = "/tmp/llama-cpp-build"
os.makedirs(build_dir, exist_ok=True)

if not os.path.exists(f"{build_dir}/llama.cpp"):
    print("📦 Cloning Zyphra/llama.cpp fork...")
    result = subprocess.run(
        [
            "git", "clone",
            "--depth", "1",
            "https://github.com/Zyphra/llama.cpp.git",
            f"{build_dir}/llama.cpp",
        ],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        print(f"❌ Clone failed: {result.stderr[:500]}")
        sys.exit(1)
    print("✅ Clone complete")

# NOTE: the working directory is deliberately left inside the build tree,
# exactly as before, so anything later in the file that relies on the
# process cwd keeps seeing the same location.
os.chdir(f"{build_dir}/llama.cpp")
print("🔨 Compiling with CMake (30 minutes max)...")
os.makedirs("build", exist_ok=True)
os.chdir("build")
cpu_count = os.cpu_count() or 4

print(" 📋 Configuring...")
try:
    configure = subprocess.run(
        [
            "cmake", "..",
            "-DCMAKE_BUILD_TYPE=Release",
            "-DCMAKE_CXX_FLAGS=-O3",
            "-DCMAKE_C_FLAGS=-O3",
            "-DBUILD_SHARED_LIBS=OFF",
            "-DLLAMA_CUDA=OFF",
            "-DLLAMA_BLAS=OFF",
        ],
        capture_output=True,
        text=True,
        timeout=300,
    )
except subprocess.TimeoutExpired:
    # Previously an unhandled exception; report it like every other failure.
    print("❌ CMake configure timed out after 300 seconds")
    sys.exit(1)
if configure.returncode != 0:
    # Building without a successful configure cannot work, so stop here and
    # show the tail of stderr (where CMake prints the actual error).
    print(f"❌ CMake configure failed:\n{configure.stderr[-2000:]}")
    sys.exit(1)

print(f" ⚙️ Building with {cpu_count} cores...")
try:
    result = subprocess.run(
        ["cmake", "--build", ".", "-j", str(cpu_count)],
        capture_output=True,
        text=True,
        timeout=1800,
    )
except subprocess.TimeoutExpired:
    print("❌ Build timed out after 1800 seconds")
    sys.exit(1)
if result.returncode != 0:
    # Not fatal by itself: the binary check below stays the final decision,
    # as in the original flow. The compiler output is surfaced so a failure
    # can actually be diagnosed from the logs.
    print(f"⚠️ cmake --build exited with code {result.returncode}:\n{result.stderr[-2000:]}")

# The binary location depends on the CMake layout; try bin/ first, then the
# build root.
llama_cli_path = f"{build_dir}/llama.cpp/build/bin/llama-cli"
if not os.path.exists(llama_cli_path):
    llama_cli_path = f"{build_dir}/llama.cpp/build/llama-cli"
if not os.path.exists(llama_cli_path):
    print("❌ Build failed!")
    sys.exit(1)
print("✅ llama-cli built successfully!")
os.chmod(llama_cli_path, 0o755)
# ── Download GGUF ────────────────────────────────────────────────────────────
# The quantized model file is fetched from the Hugging Face Hub at import
# time; `model_path` holds the value returned by hf_hub_download and is what
# llama-cli later receives through its -m option.
GGUF_REPO_ID = "Abiray/ZAYA1-8B-GGUF"
GGUF_FILE = "ZAYA1-8B-Q4_K_M.gguf"

print("📥 Downloading ZAYA1-8B (~5.5GB)...")
model_path = hf_hub_download(
    repo_id=GGUF_REPO_ID,
    filename=GGUF_FILE,
)
print("✅ Ready!")

# ── Gradio Server ────────────────────────────────────────────────────────────
# Top-level Blocks container; the UI is attached to it further down the file.
app = gr.Blocks(title="ZAYA1-8B")
# ── Generate Function ────────────────────────────────────────────────────────
def _content_to_text(content):
    """Return the plain text of one history entry, or "" if it has none."""
    if isinstance(content, str):
        return content
    if isinstance(content, (list, tuple)):
        # NOTE(review): newer Gradio releases appear to deliver message
        # content as a list of {"type": "text", "text": ...} blocks — confirm
        # against the installed version. Non-text blocks are ignored.
        return "".join(
            part["text"]
            for part in content
            if isinstance(part, dict) and isinstance(part.get("text"), str)
        )
    return ""


def _history_to_prompt_lines(history):
    """Flatten chat history into ``Speaker: text`` prompt lines.

    Accepts both history shapes this file can produce or receive:
    ``[user, assistant]`` pairs (what ``handle_submit`` appends) and
    ``{"role": ..., "content": ...}`` dicts.
    """
    lines = []
    for turn in history or []:
        if isinstance(turn, dict):
            speaker = "Assistant" if turn.get("role") == "assistant" else "User"
            entries = [(speaker, turn.get("content"))]
        elif isinstance(turn, (list, tuple)):
            # zip() tolerates short pairs (e.g. a turn with no reply yet).
            entries = list(zip(("User", "Assistant"), turn))
        else:
            continue
        for speaker, content in entries:
            text = _content_to_text(content)
            if text:
                lines.append(f"{speaker}: {text}\n")
    return lines


def generate_response(message, history, system_prompt, temperature, top_p, max_tokens):
    """Generate one assistant reply by running llama-cli as a subprocess.

    Args:
        message: The new user message. Empty, whitespace-only or ``None``
            input short-circuits with a prompt to enter a message.
        history: Previous turns, either ``[user, assistant]`` pairs or
            ``{"role", "content"}`` dicts; ``None`` is treated as empty.
        system_prompt: Optional text placed first in the prompt.
        temperature: Value forwarded to ``--temp``.
        top_p: Value forwarded to ``--top-p``.
        max_tokens: Value forwarded to ``-n`` (coerced to an integer).

    Returns:
        The stripped stdout of llama-cli, or a human-readable status string
        ("No output generated", "Timeout", or "Error: ...") — this function
        reports failures as text rather than raising.
    """
    if not message or not message.strip():
        return "Please enter a message."

    prompt_parts = []
    if system_prompt:
        prompt_parts.append(f"System: {system_prompt}\n")
    # The original indexed every turn with turn['content'], which raised
    # TypeError for the list pairs the UI stores, and labelled assistant
    # replies as "User:".
    prompt_parts.extend(_history_to_prompt_lines(history))
    prompt_parts.append(f"User: {message}\nAssistant:")
    full_prompt = "".join(prompt_parts)

    cmd = [
        llama_cli_path,
        "-m", model_path,
        "-p", full_prompt,
        # Sliders may hand back floats; -n needs an integer token count.
        "-n", str(int(max_tokens)),
        "-t", "2",
        "-c", "2048",
        "--mmap",
        "--temp", str(temperature),
        "--top-p", str(top_p),
        "--no-display-prompt",
    ]
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=600,
        )
    except subprocess.TimeoutExpired:
        return "Timeout"
    except Exception as e:
        # UI boundary: surface launch failures as text instead of crashing
        # the request handler.
        return f"Error: {e}"

    if result.returncode != 0:
        return f"Error: {result.stderr[:300]}"
    output = result.stdout.strip()
    return output if output else "No output generated"
# ── Gradio UI ────────────────────────────────────────────────────────────────
# Two-column layout: the chat transcript, message box and send button on one
# side; the system prompt and sampling controls on the other.
with app:
    gr.Markdown("""
# 🔥 ZAYA1-8B Reasoning Model
**Model:** ZAYA1-8B-Q4_K_M (5.57GB)
**Architecture:** Sparse MoE + CCA
⏱️ **Speed:** 1-5 minutes per response
""")

    with gr.Row():
        with gr.Column():
            chatbot = gr.Chatbot(label="Chat", height=400)
            message_input = gr.Textbox(label="Message", lines=2)
            submit_btn = gr.Button("Send", variant="primary")

        with gr.Column():
            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are ZAYA1-8B, a highly capable reasoning assistant built by Zyphra.",
                lines=3,
            )
            temperature = gr.Slider(
                label="Temperature",
                minimum=0.1,
                maximum=2.0,
                value=1.0,
                step=0.1,
            )
            top_p = gr.Slider(
                label="Top-P",
                minimum=0.0,
                maximum=1.0,
                value=0.95,
                step=0.05,
            )
            max_tokens = gr.Slider(
                label="Max Tokens",
                minimum=100,
                maximum=2048,
                value=512,
                step=100,
            )

    def handle_submit(message, chat_history, system_prompt, temperature, top_p, max_tokens):
        """Handle one send action.

        Blank messages leave the transcript untouched. Otherwise the reply
        from generate_response is appended to chat_history as a
        [message, reply] pair. The second return value clears the input box.
        """
        if message.strip():
            reply = generate_response(
                message,
                chat_history,
                system_prompt,
                temperature,
                top_p,
                max_tokens,
            )
            # NOTE(review): pairs are the legacy tuple-style Chatbot history;
            # a messages-style Chatbot expects role/content dicts — confirm
            # against the installed Gradio version.
            chat_history.append([message, reply])
        return chat_history, ""

    # The button click and pressing Enter in the textbox run the same handler
    # with the same inputs and outputs.
    _submit_inputs = [message_input, chatbot, system_prompt, temperature, top_p, max_tokens]
    _submit_outputs = [chatbot, message_input]
    for _register in (submit_btn.click, message_input.submit):
        _register(
            handle_submit,
            inputs=_submit_inputs,
            outputs=_submit_outputs,
        )
if __name__ == "__main__":
    # Print a framed launch banner, then start the Gradio server with
    # error reporting enabled.
    banner = "=" * 70
    print("\n" + banner)
    print("🚀 ZAYA1-8B is launching...")
    print(banner + "\n")
    app.launch(show_error=True)