# deneme / app.py — Hugging Face Space entry point (revision 11b54c5)
import os
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# Hub repository hosting the GGUF quantizations of the distilled model.
MODEL_REPO = "Jackrong/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-GGUF"
# Try the more practical quant files first: higher-quality quants at the top,
# smaller fallbacks below for when a file is missing or too large to load.
MODEL_CANDIDATES = [
    "Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-Q4_K_M.gguf",
    "Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-Q4_K_S.gguf",
    "Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-Q3_K_M.gguf",
    "Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-Q2_K.gguf",
]
# Lazily-initialized module state: the loaded Llama instance and the name of
# the GGUF file that was actually downloaded (shown in chat responses).
llm = None
loaded_model_file = None
def download_first_available_model(token: str | None):
    """Fetch the first GGUF candidate that can be downloaded from the hub.

    Walks ``MODEL_CANDIDATES`` in order and returns a ``(local_path,
    filename)`` tuple for the first file that downloads successfully.
    Raises ``RuntimeError`` (carrying the last underlying error) when every
    candidate fails.
    """
    failure = None
    for candidate in MODEL_CANDIDATES:
        try:
            local_path = hf_hub_download(
                repo_id=MODEL_REPO,
                filename=candidate,
                token=token,
            )
        except Exception as err:  # remember the error, try the next quant
            failure = err
        else:
            return local_path, candidate
    raise RuntimeError(
        "Uygun GGUF dosyası indirilemedi. "
        f"Denenen dosyalar: {', '.join(MODEL_CANDIDATES)}. "
        f"Son hata: {failure}"
    )
def build_model(model_path: str, *, n_ctx: int = 4096, n_batch: int = 128):
    """Construct a CPU-only ``Llama`` instance for the GGUF at *model_path*.

    Generalized: context size and batch size are now keyword-only parameters
    (defaults unchanged), so callers can tune them without editing this
    function.

    Parameters
    ----------
    model_path:
        Local filesystem path of the downloaded GGUF file.
    n_ctx:
        Context window in tokens; the default is deliberately conservative
        for a shared CPU Space.
    n_batch:
        Prompt-processing batch size.
    """
    cpu_count = os.cpu_count() or 2
    # Conservative settings for a CPU Space: clamp threads to [1, 8].
    n_threads = max(1, min(8, cpu_count))
    return Llama(
        model_path=model_path,
        n_ctx=n_ctx,
        n_threads=n_threads,
        n_batch=n_batch,
        n_gpu_layers=0,  # CPU-only inference
        verbose=False,
    )
def get_model(hf_token: gr.OAuthToken | None):
    """Return the process-wide ``Llama`` instance, loading it on first use.

    Subsequent calls reuse the cached model regardless of the token passed.
    """
    global llm, loaded_model_file
    if llm is None:
        access_token = None if hf_token is None else hf_token.token
        model_path, filename = download_first_available_model(access_token)
        llm = build_model(model_path)
        loaded_model_file = filename
    return llm
def normalize_history(history):
    """Convert Gradio chat history into OpenAI-style message dicts.

    Accepts both the "messages" format (dicts carrying ``role``/``content``)
    and the legacy pair format (``[user, assistant]`` two-item sequences).
    Entries of any other shape, and dict entries with an unknown role, are
    silently dropped.
    """
    normalized = []
    allowed_roles = ("user", "assistant", "system")
    for entry in history or []:
        if isinstance(entry, dict):
            if entry.get("role") in allowed_roles:
                normalized.append(
                    {
                        "role": entry.get("role"),
                        "content": str(entry.get("content", "")),
                    }
                )
        elif isinstance(entry, (list, tuple)) and len(entry) == 2:
            # Legacy pair format: skip falsy halves (e.g. pending reply).
            for role, text in zip(("user", "assistant"), entry):
                if text:
                    normalized.append({"role": role, "content": str(text)})
    return normalized
def respond(
    message,
    history,
    system_message,
    max_tokens,
    temperature,
    top_p,
    hf_token: gr.OAuthToken | None,
):
    """Stream a chat completion for *message*, yielding the growing reply.

    Generator used by ``gr.ChatInterface``: on success it yields the
    accumulated response (prefixed with the loaded model file name) after
    every streamed token; on failure it yields Turkish-language error text
    instead of raising, so the UI always shows something actionable.
    """
    global loaded_model_file
    try:
        model = get_model(hf_token)
    except Exception as e:
        yield (
            "Model yüklenemedi.\n\n"
            f"Hata: {e}\n\n"
            "Olası nedenler:\n"
            "- Space RAM kapasitesi yetersiz\n"
            "- GGUF dosya adı değişmiş\n"
            "- Model erişimi için yetkili Hugging Face hesabı gerekiyor\n"
            "- llama-cpp-python bu ortamda düzgün kurulmadı"
        )
        return

    # Build the prompt: system message, prior turns, then the new user turn.
    messages = [{"role": "system", "content": str(system_message)}]
    messages.extend(normalize_history(history))
    messages.append({"role": "user", "content": str(message)})

    response = ""
    header = f"[Model: {loaded_model_file}]\n\n"
    try:
        stream = model.create_chat_completion(
            messages=messages,
            max_tokens=int(max_tokens),
            temperature=float(temperature),
            top_p=float(top_p),
            stream=True,
        )
        for chunk in stream:
            token = ""
            choices = chunk.get("choices", [])
            if choices:
                # Streamed chunks carry incremental text under "delta".
                token = choices[0].get("delta", {}).get("content", "") or ""
            if token:
                response += token
                # FIX: the original tracked a `first_token` flag whose two
                # branches were byte-identical; a single yield suffices.
                yield header + response
        if not response:
            yield header + "(Model yanıt üretmedi.)"
    except Exception as e:
        # Surface whatever was generated before the failure, then the error.
        partial = header + response if response else header
        yield (
            partial
            + "\n\nÜretim sırasında hata oluştu.\n"
            f"Hata: {e}\n\n"
            "Daha düşük max_tokens veya daha küçük quant dosyası deneyebilirsin."
        )
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# GGUF Chat Demo (Fallback)")
    with gr.Sidebar():
        # Login is needed when the model repo is private/gated; the OAuth
        # token reaches respond() through its gr.OAuthToken parameter.
        gr.LoginButton()
        gr.Markdown(
            "Model private veya gated ise giriş yapman gerekebilir. "
            "Uygun GGUF dosyası otomatik seçilmeye çalışılır."
        )
    # Chat UI; additional_inputs map 1:1 onto respond()'s parameters after
    # (message, history): system_message, max_tokens, temperature, top_p.
    chatbot = gr.ChatInterface(
        fn=respond,
        additional_inputs=[
            gr.Textbox(
                value="You are a friendly Chatbot.",
                label="System message",
            ),
            gr.Slider(
                minimum=1,
                maximum=1024,
                value=256,
                step=1,
                label="Max new tokens",
            ),
            gr.Slider(
                minimum=0.1,
                maximum=1.5,
                value=0.7,
                step=0.1,
                label="Temperature",
            ),
            gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="Top-p",
            ),
        ],
    )
    # NOTE(review): a ChatInterface constructed inside a Blocks context may
    # already render itself, which would make this explicit render() call
    # redundant or duplicating — confirm against the Gradio version in use.
    chatbot.render()

if __name__ == "__main__":
    demo.launch()