import os

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
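
# Gradio chat demo that serves a GGUF model with llama-cpp-python on CPU,
# falling back across several quantizations of the repo below.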

MODEL_REPO = "Jackrong/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-GGUF"

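# Candidate GGUF files, ordered from highest to lowest quality. The first one
# that downloads successfully is used (see download_first_available_model).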
MODEL_CANDIDATES = [
    "Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-Q4_K_M.gguf",
    "Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-Q4_K_S.gguf",
    "Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-Q3_K_M.gguf",
    "Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-Q2_K.gguf",
]

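# Lazily initialized globals: the model is downloaded and loaded on the first
# request, then cached for the lifetime of the process.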
llm = None
loaded_model_file = None


def download_first_available_model(token: str | None):
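    """Return (local_path, filename) for the first candidate file that downloads."""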
    last_error = None

    for filename in MODEL_CANDIDATES:
        try:
            model_path = hf_hub_download(
                repo_id=MODEL_REPO,
                filename=filename,
                token=token,
            )
            return model_path, filename
        except Exception as e:
            last_error = e

    raise RuntimeError(
        "No suitable GGUF file could be downloaded. "
        f"Files tried: {', '.join(MODEL_CANDIDATES)}. "
        f"Last error: {last_error}"
    )


def build_model(model_path: str):
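    """Load the GGUF file into a CPU-only llama.cpp instance."""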
    cpu_count = os.cpu_count() or 2

    # Use every available core, capped at 8 threads.
    n_threads = max(1, min(8, cpu_count))

    return Llama(
        model_path=model_path,
        n_ctx=4096,
        n_threads=n_threads,
        n_batch=128,
        n_gpu_layers=0,  # CPU-only: no layers offloaded to a GPU
        verbose=False,
    )


def get_model(hf_token: gr.OAuthToken | None):
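    """Return the cached Llama instance, loading it on first use."""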
    global llm, loaded_model_file

    if llm is not None:
        return llm

    token = hf_token.token if hf_token is not None else None

    model_path, filename = download_first_available_model(token)
    llm = build_model(model_path)
    loaded_model_file = filename
    return llm


def normalize_history(history):
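    """Convert Gradio chat history (role dicts or (user, assistant) pairs)
    into an OpenAI-style messages list."""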
    messages = []

    for item in history or []:
        if isinstance(item, dict):
            role = item.get("role")
            content = item.get("content", "")
            if role in ("user", "assistant", "system"):
                messages.append({"role": role, "content": str(content)})
        elif isinstance(item, (list, tuple)) and len(item) == 2:
            user_msg, assistant_msg = item
            if user_msg:
                messages.append({"role": "user", "content": str(user_msg)})
            if assistant_msg:
                messages.append({"role": "assistant", "content": str(assistant_msg)})

    return messages


def respond(
    message,
    history,
    system_message,
    max_tokens,
    temperature,
    top_p,
    hf_token: gr.OAuthToken | None,
):
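    """Stream a chat completion, yielding the accumulated text after each new token."""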
    try:
        model = get_model(hf_token)
    except Exception as e:
        yield (
            "Failed to load the model.\n\n"
            f"Error: {e}\n\n"
            "Possible causes:\n"
            "- The Space does not have enough RAM\n"
            "- The GGUF filenames in the repo have changed\n"
            "- The model requires an authorized Hugging Face account\n"
            "- llama-cpp-python did not install correctly in this environment"
        )
        return

| messages = [{"role": "system", "content": str(system_message)}] |
| messages.extend(normalize_history(history)) |
| messages.append({"role": "user", "content": str(message)}) |
|
|
| response = "" |
| header = f"[Model: {loaded_model_file}]\n\n" |
|
|
    try:
        stream = model.create_chat_completion(
            messages=messages,
            max_tokens=int(max_tokens),
            temperature=float(temperature),
            top_p=float(top_p),
            stream=True,
        )

        # Each chunk carries a delta; accumulate it and re-yield the full text
        # so far, prefixed with the loaded model filename.
        for chunk in stream:
            token = ""
            choices = chunk.get("choices", [])
            if choices:
                delta = choices[0].get("delta", {})
                token = delta.get("content", "") or ""

            if token:
                response += token
                yield header + response

        if not response:
            yield header + "(The model produced no response.)"

    except Exception as e:
        partial = header + response if response else header
        yield (
            partial
            + "\n\nAn error occurred during generation.\n"
            f"Error: {e}\n\n"
            "Try a lower max_tokens or a smaller quant file."
        )


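# The chat UI is created before entering the Blocks context and rendered into
# it afterwards: components instantiated inside an active Blocks context
# auto-render at the page root, which can leave the additional inputs outside
# the chat panel.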
chatbot = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Textbox(
            value="You are a friendly Chatbot.",
            label="System message",
        ),
        gr.Slider(
            minimum=1,
            maximum=1024,
            value=256,
            step=1,
            label="Max new tokens",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.5,
            value=0.7,
            step=0.1,
            label="Temperature",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.9,
            step=0.05,
            label="Top-p",
        ),
    ],
)

with gr.Blocks() as demo:
    gr.Markdown("# GGUF Chat Demo (Fallback)")

    with gr.Sidebar():
        gr.LoginButton()
        gr.Markdown(
            "If the model is private or gated, you may need to log in. "
            "A suitable GGUF file is selected automatically."
        )

    chatbot.render()


if __name__ == "__main__":
    demo.launch()