ZAYA1-8B

Running

App Files Files Community

ZAYA1-8B / app.py

janhetzler

Update app.py

935ffbb verified 8 days ago

raw

history blame contribute delete

6.2 kB

	import os
	import json
	import subprocess
	import sys
	import gradio as gr
	from huggingface_hub import hf_hub_download

	# ── Build llama.cpp with ZAYA1 Support (OPTIMIZED) ──────────────────────────
	#
	# Runs at import time: clones the Zyphra llama.cpp fork when it is not already
	# present, configures it with CMake, compiles it and locates the resulting
	# llama-cli binary. Every unrecoverable failure prints a diagnostic and ends
	# the process with exit status 1.

	print("🔨 Building llama.cpp with ZAYA1/CCA support...")

	build_dir = "/tmp/llama-cpp-build"
	os.makedirs(build_dir, exist_ok=True)

	if not os.path.exists(f"{build_dir}/llama.cpp"):
	    print("📦 Cloning Zyphra/llama.cpp fork...")
	    result = subprocess.run(
	        [
	            "git", "clone",
	            "--depth", "1",
	            "https://github.com/Zyphra/llama.cpp.git",
	            f"{build_dir}/llama.cpp",
	        ],
	        capture_output=True,
	        text=True,
	    )

	    if result.returncode != 0:
	        print(f"❌ Clone failed: {result.stderr[:500]}")
	        sys.exit(1)
	    print("✅ Clone complete")

	# NOTE: these chdir calls change the working directory of the whole process,
	# so everything executed after this section starts from the CMake build
	# directory. Kept as-is because later code may rely on it.
	os.chdir(f"{build_dir}/llama.cpp")
	print("🔨 Compiling with CMake (30 minutes max)...")

	os.makedirs("build", exist_ok=True)
	os.chdir("build")

	# os.cpu_count() may return None; fall back to 4 parallel jobs.
	cpu_count = os.cpu_count() or 4

	print(" 📋 Configuring...")
	try:
	    configure = subprocess.run(
	        [
	            "cmake", "..",
	            "-DCMAKE_BUILD_TYPE=Release",
	            "-DCMAKE_CXX_FLAGS=-O3",
	            "-DCMAKE_C_FLAGS=-O3",
	            "-DBUILD_SHARED_LIBS=OFF",
	            "-DLLAMA_CUDA=OFF",
	            "-DLLAMA_BLAS=OFF",
	        ],
	        capture_output=True,
	        text=True,
	        timeout=300,
	    )
	except subprocess.TimeoutExpired:
	    print("❌ CMake configure timed out after 300 seconds")
	    sys.exit(1)
	except OSError as exc:
	    # Raised when the cmake executable itself cannot be started.
	    print(f"❌ Could not run cmake: {exc}")
	    sys.exit(1)

	if configure.returncode != 0:
	    # Fail fast: without a successful configure step the build below cannot
	    # be trusted. Errors are usually at the end of the output, so show the tail.
	    print(f"❌ Configure failed: {(configure.stderr or configure.stdout)[-2000:]}")
	    sys.exit(1)

	print(f" ⚙️ Building with {cpu_count} cores...")
	try:
	    result = subprocess.run(
	        ["cmake", "--build", ".", "-j", str(cpu_count)],
	        capture_output=True,
	        text=True,
	        timeout=1800,
	    )
	except subprocess.TimeoutExpired:
	    print("❌ Build timed out after 1800 seconds")
	    sys.exit(1)
	except OSError as exc:
	    print(f"❌ Could not run cmake: {exc}")
	    sys.exit(1)

	if result.returncode != 0:
	    # Not fatal on its own: whether start-up continues is decided by the
	    # presence of the llama-cli binary below, exactly as before.
	    print(f"⚠️ cmake --build exited with status {result.returncode}")

	# The binary is looked up in build/bin first, then directly in build/.
	llama_cli_path = f"{build_dir}/llama.cpp/build/bin/llama-cli"
	if not os.path.exists(llama_cli_path):
	    llama_cli_path = f"{build_dir}/llama.cpp/build/llama-cli"
	    if not os.path.exists(llama_cli_path):
	        print("❌ Build failed!")
	        # Surface the captured build output so the failure can be diagnosed.
	        print((result.stderr or result.stdout or "")[-2000:])
	        sys.exit(1)

	print("✅ llama-cli built successfully!")
	os.chmod(llama_cli_path, 0o755)

	# ── Download GGUF ────────────────────────────────────────────────────────────

	# Hub repository and file name of the model that is fetched at start-up.
	GGUF_REPO_ID = "Abiray/ZAYA1-8B-GGUF"
	GGUF_FILE = "ZAYA1-8B-Q4_K_M.gguf"

	print("📥 Downloading ZAYA1-8B (~5.5GB)...")
	# The returned value is what generate_response hands to llama-cli via "-m".
	model_path = hf_hub_download(
	    filename=GGUF_FILE,
	    repo_id=GGUF_REPO_ID,
	)
	print("✅ Ready!")

	# ── Gradio Server ────────────────────────────────────────────────────────────

	# Container for the UI; it is populated in the "Gradio UI" section.
	app = gr.Blocks(title="ZAYA1-8B")

	# ── Generate Function ────────────────────────────────────────────────────────

	def _content_to_text(content):
	    """Flatten one history ``content`` value into plain text ("" for None)."""
	    if content is None:
	        return ""
	    if isinstance(content, str):
	        return content
	    if isinstance(content, (list, tuple)):
	        # NOTE(review): assumes list content is made of plain strings or of
	        # dicts carrying their text under a "text" key — confirm against the
	        # installed Gradio version.
	        return "".join(
	            _content_to_text(part.get("text") if isinstance(part, dict) else part)
	            for part in content
	        )
	    return str(content)


	def _history_to_prompt_lines(history):
	    """Render earlier chat turns as ``User:`` / ``Assistant:`` prompt lines.

	    Two history shapes are accepted: dicts with ``role`` and ``content`` keys,
	    and ``[user_text, assistant_text]`` pairs (the shape handle_submit
	    appends). Turns of any other shape and empty texts are skipped.
	    """
	    lines = []
	    for turn in history or []:
	        if isinstance(turn, dict):
	            speaker = "Assistant" if turn.get("role") == "assistant" else "User"
	            spoken = [(speaker, turn.get("content"))]
	        elif isinstance(turn, (list, tuple)):
	            # Pad so that a short pair still unpacks into two values.
	            user_part, assistant_part = (list(turn) + [None, None])[:2]
	            spoken = [("User", user_part), ("Assistant", assistant_part)]
	        else:
	            continue
	        for speaker, content in spoken:
	            text = _content_to_text(content)
	            if text:
	                lines.append(f"{speaker}: {text}\n")
	    return lines


	def generate_response(message, history, system_prompt, temperature, top_p, max_tokens):
	    """Run llama-cli once on the conversation and return its text output.

	    Args:
	        message: The new user message.
	        history: Earlier turns, as role/content dicts or [user, assistant]
	            pairs; may be empty or None.
	        system_prompt: Optional text placed first in the prompt.
	        temperature: Value passed to ``--temp``.
	        top_p: Value passed to ``--top-p``.
	        max_tokens: Value passed to ``-n``.

	    Returns:
	        The stripped stdout of llama-cli, or a short human-readable message
	        when the input is blank, the process fails, times out, or produces
	        no output. Errors are reported as strings, not raised.
	    """
	    if not message or not message.strip():
	        return "Please enter a message."

	    prompt_parts = []
	    if system_prompt:
	        prompt_parts.append(f"System: {system_prompt}\n")
	    prompt_parts.extend(_history_to_prompt_lines(history))
	    prompt_parts.append(f"User: {message}\nAssistant:")
	    full_prompt = "".join(prompt_parts)

	    try:
	        cmd = [
	            llama_cli_path,
	            "-m", model_path,
	            "-p", full_prompt,
	            # A slider may deliver a float; -n expects a whole number.
	            "-n", str(int(max_tokens)),
	            "-t", "2",
	            "-c", "2048",
	            "--mmap",
	            "--temp", str(temperature),
	            "--top-p", str(top_p),
	            "--no-display-prompt",
	        ]
	        result = subprocess.run(
	            cmd,
	            capture_output=True,
	            text=True,
	            timeout=600,
	        )
	    except subprocess.TimeoutExpired:
	        return "Timeout"
	    except Exception as exc:
	        # UI boundary: report the problem in the chat instead of raising.
	        return f"Error: {exc}"

	    if result.returncode != 0:
	        return f"Error: {result.stderr[:300]}"

	    output = result.stdout.strip()
	    return output if output else "No output generated"

	# ── Gradio UI ────────────────────────────────────────────────────────────────

	# Populate the Blocks container: header text, chat column, settings column
	# and the submit wiring.
	with app:
	    gr.Markdown("""
	# 🔥 ZAYA1-8B Reasoning Model

	Model: ZAYA1-8B-Q4_K_M (5.57GB)
	Architecture: Sparse MoE + CCA
	⏱️ Speed: 1-5 minutes per response
	""")

	    with gr.Row():
	        with gr.Column():
	            chatbot = gr.Chatbot(label="Chat", height=400)
	            message_input = gr.Textbox(label="Message", lines=2)
	            submit_btn = gr.Button("Send", variant="primary")

	        with gr.Column():
	            system_prompt = gr.Textbox(
	                label="System Prompt",
	                value="You are ZAYA1-8B, a highly capable reasoning assistant built by Zyphra.",
	                lines=3
	            )
	            temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, value=1.0, step=0.1)
	            top_p = gr.Slider(label="Top-P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
	            max_tokens = gr.Slider(label="Max Tokens", minimum=100, maximum=2048, value=512, step=100)

	    def handle_submit(message, chat_history, system_prompt, temperature, top_p, max_tokens):
	        """Answer one message and return ``(updated_history, "")``.

	        The empty string clears the message box. A blank message leaves the
	        history unchanged.
	        """
	        # Work on a copy: the history may arrive as None, and the list handed
	        # in by the component should not be mutated in place.
	        updated_history = list(chat_history or [])

	        if not message or not message.strip():
	            return updated_history, ""

	        response = generate_response(
	            message,
	            updated_history,
	            system_prompt,
	            temperature,
	            top_p,
	            max_tokens
	        )

	        # NOTE(review): turns are stored as [user, assistant] pairs — confirm
	        # that the installed Gradio version's Chatbot accepts this format.
	        updated_history.append([message, response])
	        return updated_history, ""

	    # The button and the Enter key trigger the same handler.
	    submit_btn.click(
	        handle_submit,
	        inputs=[message_input, chatbot, system_prompt, temperature, top_p, max_tokens],
	        outputs=[chatbot, message_input]
	    )

	    message_input.submit(
	        handle_submit,
	        inputs=[message_input, chatbot, system_prompt, temperature, top_p, max_tokens],
	        outputs=[chatbot, message_input]
	    )

	# Start the web UI only when this file is executed as a script.
	if __name__ == "__main__":
	    # Launch notice framed by two 70-character rules.
	    print(f"\n{'=' * 70}")
	    print("🚀 ZAYA1-8B is launching...")
	    print(f"{'=' * 70}\n")
	    app.launch(show_error=True)