import os

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

MODEL_REPO = "Jackrong/Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-GGUF"

# Try the smaller, more practical quant files first.
MODEL_CANDIDATES = [
    "Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-Q4_K_M.gguf",
    "Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-Q4_K_S.gguf",
    "Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-Q3_K_M.gguf",
    "Qwen3.5-27B-Claude-4.6-Opus-Reasoning-Distilled-Q2_K.gguf",
]

# Lazily loaded model and the name of the GGUF file that was actually used.
llm = None
loaded_model_file = None


def download_first_available_model(token: str | None):
    """Download the first candidate GGUF file available in the repo."""
    last_error = None
    for filename in MODEL_CANDIDATES:
        try:
            model_path = hf_hub_download(
                repo_id=MODEL_REPO,
                filename=filename,
                token=token,
            )
            return model_path, filename
        except Exception as e:
            last_error = e
    raise RuntimeError(
        "No suitable GGUF file could be downloaded. "
        f"Files tried: {', '.join(MODEL_CANDIDATES)}. "
        f"Last error: {last_error}"
    )


def build_model(model_path: str):
    """Create a llama.cpp model with conservative settings for a CPU-only Space."""
    cpu_count = os.cpu_count() or 2
    n_threads = max(1, min(8, cpu_count))
    return Llama(
        model_path=model_path,
        n_ctx=4096,
        n_threads=n_threads,
        n_batch=128,
        n_gpu_layers=0,
        verbose=False,
    )


def get_model(hf_token: gr.OAuthToken | None):
    """Return the cached model, downloading and loading it on first use."""
    global llm, loaded_model_file
    if llm is not None:
        return llm
    token = hf_token.token if hf_token is not None else None
    model_path, filename = download_first_available_model(token)
    llm = build_model(model_path)
    loaded_model_file = filename
    return llm


def normalize_history(history):
    """Convert Gradio chat history (message dicts or (user, assistant) pairs) to OpenAI-style messages."""
    messages = []
    for item in history or []:
        if isinstance(item, dict):
            role = item.get("role")
            content = item.get("content", "")
            if role in ("user", "assistant", "system"):
                messages.append({"role": role, "content": str(content)})
        elif isinstance(item, (list, tuple)) and len(item) == 2:
            user_msg, assistant_msg = item
            if user_msg:
                messages.append({"role": "user", "content": str(user_msg)})
            if assistant_msg:
                messages.append({"role": "assistant", "content": str(assistant_msg)})
    return messages


def respond(
    message,
    history,
    system_message,
    max_tokens,
    temperature,
    top_p,
    hf_token: gr.OAuthToken | None,
):
    """Stream a chat completion, yielding the accumulated response for ChatInterface."""
    try:
        model = get_model(hf_token)
    except Exception as e:
        yield (
            "The model could not be loaded.\n\n"
            f"Error: {e}\n\n"
            "Possible causes:\n"
            "- The Space does not have enough RAM\n"
            "- The GGUF file name has changed\n"
            "- Accessing the model requires an authorized Hugging Face account\n"
            "- llama-cpp-python is not installed correctly in this environment"
        )
        return

    messages = [{"role": "system", "content": str(system_message)}]
    messages.extend(normalize_history(history))
    messages.append({"role": "user", "content": str(message)})

    response = ""
    header = f"[Model: {loaded_model_file}]\n\n"
    try:
        stream = model.create_chat_completion(
            messages=messages,
            max_tokens=int(max_tokens),
            temperature=float(temperature),
            top_p=float(top_p),
            stream=True,
        )
        for chunk in stream:
            token = ""
            choices = chunk.get("choices", [])
            if choices:
                delta = choices[0].get("delta", {})
                token = delta.get("content", "") or ""
            if token:
                response += token
                yield header + response
        if not response:
            yield header + "(The model produced no output.)"
    except Exception as e:
        partial = (header + response) if response else header
        yield (
            partial
            + "\n\nAn error occurred during generation.\n"
            f"Error: {e}\n\n"
            "Try a lower max_tokens or a smaller quant file."
        )


with gr.Blocks() as demo:
    gr.Markdown("# GGUF Chat Demo (Fallback)")
    with gr.Sidebar():
        gr.LoginButton()
        gr.Markdown(
            "If the model is private or gated, you may need to sign in. "
            "A suitable GGUF file will be selected automatically."
        )
    chatbot = gr.ChatInterface(
        fn=respond,
        additional_inputs=[
            gr.Textbox(
                value="You are a friendly Chatbot.",
                label="System message",
            ),
            gr.Slider(
                minimum=1,
                maximum=1024,
                value=256,
                step=1,
                label="Max new tokens",
            ),
            gr.Slider(
                minimum=0.1,
                maximum=1.5,
                value=0.7,
                step=0.1,
                label="Temperature",
            ),
            gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="Top-p",
            ),
        ],
    )
    chatbot.render()


if __name__ == "__main__":
    demo.launch()