import gradio as gr
from llama_cpp import Llama

# Path to the quantized GGUF model bundled inside the Space.
MODEL_PATH = "finance-chat.Q2_K.gguf"

# Load the model once at startup with llama.cpp.
llm = Llama(
    model_path=MODEL_PATH,
    n_threads=2,  # Free HF CPU tier provides ~2 threads
    n_ctx=4096,
    verbose=False,
)


def generate(prompt, max_tokens=256, temperature=0.7, top_p=0.9):
    """Run a single text completion against the loaded GGUF model.

    Args:
        prompt: User prompt text. Empty/whitespace-only input short-circuits
            to "" so we never spend an inference call on nothing.
        max_tokens: Maximum number of tokens to generate (default 256).
        temperature: Sampling temperature (default 0.7).
        top_p: Nucleus-sampling cutoff (default 0.9).

    Returns:
        The generated completion text, stripped of surrounding whitespace.
    """
    if not prompt or not prompt.strip():
        return ""
    output = llm(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
    )
    # llama-cpp-python returns an OpenAI-style completion dict.
    return output["choices"][0]["text"].strip()


demo = gr.Interface(
    fn=generate,
    inputs="text",
    outputs="text",
    title="Finance Chat LLM (GGUF Q2_K - Free Space)",
)

# Guarding launch() keeps the module importable (e.g. by the HF Spaces
# runtime, which imports the file and serves `demo` itself) while still
# starting the server when run directly as a script.
if __name__ == "__main__":
    demo.launch()