| import os |
| import threading |
| from fastapi import FastAPI, HTTPException |
| from pydantic import BaseModel |
| from transformers import pipeline |
|
|
# FastAPI application instance; routes are registered via decorators below.
app = FastAPI()


# Hugging Face model id loaded by the text-generation pipeline.
MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"


# Lazily-created transformers pipeline; None until first use (see get_pipe()).
_pipe = None
# Guards one-time initialization of _pipe when concurrent requests race.
_pipe_lock = threading.Lock()
|
|
class Request(BaseModel):
    """Request body schema for POST /generate."""

    # Text prompt to complete.
    prompt: str
    # 0.0 selects greedy decoding; > 0 enables sampling (see generate()).
    temperature: float = 0.0
    # Upper bound on newly generated tokens.
    max_tokens: int = 50
|
|
|
|
@app.get("/")
def health():
    """Liveness probe: report server status and whether the model is loaded."""
    loaded = _pipe is not None
    return {"status": "running", "model_loaded": loaded}
|
|
|
|
def get_pipe():
    """Return the shared text-generation pipeline, loading it on first call.

    Uses double-checked locking: the fast path returns the cached pipeline
    without touching the lock; the slow path re-checks under _pipe_lock so
    concurrent first callers load the model exactly once. device=-1 pins
    inference to CPU.
    """
    global _pipe
    if _pipe is not None:
        return _pipe
    with _pipe_lock:
        if _pipe is None:
            _pipe = pipeline("text-generation", model=MODEL_ID, device=-1)
    return _pipe
|
|
|
|
@app.post("/generate")
def generate(req: Request):
    """Generate a completion for req.prompt.

    Decoding is greedy when req.temperature == 0 and sampled otherwise.
    Returns {"response": <generated text, stripped>}. Any failure (model
    load, generation) surfaces as HTTP 500 with the error message as detail.
    """
    try:
        pipe = get_pipe()

        do_sample = req.temperature > 0

        gen_kwargs = {
            "max_new_tokens": int(req.max_tokens),
            "do_sample": do_sample,
            "return_full_text": False,
        }
        # Only forward `temperature` when sampling: transformers warns (and
        # newer versions reject) a temperature setting with do_sample=False.
        if do_sample:
            gen_kwargs["temperature"] = float(req.temperature)

        out = pipe(req.prompt, **gen_kwargs)

        return {"response": out[0]["generated_text"].strip()}
    except Exception as e:
        # Boundary handler: convert any internal error into a 500 response,
        # preserving the original traceback via exception chaining.
        raise HTTPException(status_code=500, detail=str(e)) from e
| |
| get_pipe() |