import gradio as gr
from llama_cpp import Llama

# Path to the quantized GGUF model bundled inside the Space.
MODEL_PATH = "finance-chat.Q2_K.gguf"

# Load the model once at startup with llama.cpp.
llm = Llama(
    model_path=MODEL_PATH,
    n_threads=2,  # Free HF CPU tier provides ~2 threads
    n_ctx=4096,
    verbose=False,
)


def generate(prompt, max_tokens=256, temperature=0.7, top_p=0.9):
    """Run a single text completion against the loaded GGUF model.

    Args:
        prompt: User prompt text. Empty/whitespace-only input short-circuits
            to "" so we never spend an inference call on nothing.
        max_tokens: Maximum number of tokens to generate (default 256).
        temperature: Sampling temperature (default 0.7).
        top_p: Nucleus-sampling cutoff (default 0.9).

    Returns:
        The generated completion text, stripped of surrounding whitespace.
    """
    if not prompt or not prompt.strip():
        return ""
    output = llm(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
    )
    # llama-cpp-python returns an OpenAI-style completion dict.
    return output["choices"][0]["text"].strip()


demo = gr.Interface(
    fn=generate,
    inputs="text",
    outputs="text",
    title="Finance Chat LLM (GGUF Q2_K - Free Space)",
)

# Guarding launch() keeps the module importable (e.g. by the HF Spaces
# runtime, which imports the file and serves `demo` itself) while still
# starting the server when run directly as a script.
if __name__ == "__main__":
    demo.launch()