import os
import threading

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import pipeline

app = FastAPI()

# You can change the model: on Hugging Face, open "Models" and copy the ID of
# the model you want, for example:
#   Nanbeige/Nanbeige4.1-3B
# Be careful about how big the model is, as HF free resources are limited.
MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"

# Process-wide pipeline instance, created on first use by get_pipe().
_pipe = None
# Guards creation of _pipe so it is only built once.
_pipe_lock = threading.Lock()


class Request(BaseModel):
    """JSON body accepted by the /generate endpoint."""

    # Text handed to the model as the prompt.
    prompt: str
    # Values <= 0 select greedy decoding; values > 0 enable sampling.
    temperature: float = 0.0
    # Forwarded to the pipeline as max_new_tokens.
    # You can pass this parameter in the request.
    max_tokens: int = 50


@app.get("/")
def health():
    """Report that the service is up and whether the pipeline has been created."""
    return {"status": "running", "model_loaded": _pipe is not None}


def get_pipe():
    """Return the shared text-generation pipeline, creating it on first call.

    The pipeline for MODEL_ID is built at most once and cached in the
    module-level ``_pipe``; later calls return the cached object.
    """
    global _pipe
    if _pipe is None:
        with _pipe_lock:
            # Check again while holding the lock: another thread may have
            # finished building the pipeline while this one was waiting.
            if _pipe is None:
                _pipe = pipeline(
                    "text-generation",
                    model=MODEL_ID,
                    device=-1,  # -1 selects the CPU in transformers pipelines
                )
    return _pipe


@app.post("/generate")  # this is the endpoint that you call in the notebook
def generate(req: Request):
    """Generate a completion for ``req.prompt``.

    Returns:
        ``{"response": <generated text, stripped of surrounding whitespace>}``.
        Only the newly generated text is returned, not the prompt.

    Raises:
        HTTPException: status 500 with the error message as ``detail`` if
            loading the pipeline or running generation fails.
    """
    try:
        pipe = get_pipe()
        do_sample = req.temperature > 0
        gen_kwargs = {
            "max_new_tokens": req.max_tokens,
            "do_sample": do_sample,
            "return_full_text": False,
        }
        # Temperature only has meaning when sampling, and a non-positive value
        # is not a valid sampling temperature, so forward it only in that mode.
        if do_sample:
            gen_kwargs["temperature"] = req.temperature
        out = pipe(req.prompt, **gen_kwargs)
        return {"response": out[0]["generated_text"].strip()}
    except Exception as e:
        # Endpoint boundary: surface any failure as a 500 and keep the cause.
        raise HTTPException(status_code=500, detail=str(e)) from e


# Build the pipeline at import time rather than on the first request
# (presumably so the first call to /generate does not pay the load cost).
get_pipe()