# app/main.py
"""FastAPI app with llama.cpp backend."""
import asyncio
from contextlib import asynccontextmanager

from fastapi import FastAPI
from fastapi.responses import StreamingResponse

from app.model import load_model, generate_stream, generate
from app.prompt import build_prompt
from app.schemas import GenerationRequest, GenerationResponse


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Startup: download and load the GGUF model before serving requests."""
    print("=" * 50)
    print("Starting up - Loading GGUF model...")
    print("Model: Mungert/Nanbeige4-3B-Thinking-2511-GGUF")
    print("=" * 50)
    load_model()  # Pre-load on startup so the first request doesn't pay the load cost
    print("Ready for requests!")
    yield
    print("Shutting down...")


app = FastAPI(
    title="Nanbeige4-3B-Thinking-GGUF API",
    description="Fast CPU inference with llama.cpp (iq2_m quantized)",
    version="2.0.0",
    lifespan=lifespan,
)


@app.get("/")
async def health_check():
    return {
        "status": "ok",
        "model": "Nanbeige4-3B-Thinking-2511-iq2_m",
        "backend": "llama.cpp",
        "device": "cpu",
        "quantization": "iq2_m",
        "optimized": True,
    }


@app.post("/generate")
async def generate_text(request: GenerationRequest):
    final_prompt = build_prompt(request.prompt)

    if request.stream:

        async def stream_generator():
            # generate_stream() is a blocking llama.cpp generator. Pull each
            # chunk on a worker thread so token generation does not stall the
            # event loop while other requests are being served.
            sync_iter = generate_stream(
                final_prompt,
                temperature=request.temperature,
                max_tokens=request.max_tokens,
            )
            while True:
                chunk = await asyncio.to_thread(next, sync_iter, None)
                if chunk is None:  # generator exhausted
                    break
                if chunk:  # skip empty chunks
                    yield f"data: {chunk}\n\n"
            yield "data: [DONE]\n\n"

        return StreamingResponse(
            stream_generator(),
            media_type="text/event-stream",
        )

    # Non-streaming path: run the blocking call off the event loop as well.
    result = await asyncio.to_thread(
        generate,
        final_prompt,
        temperature=request.temperature,
        max_tokens=request.max_tokens,
    )
    return GenerationResponse(text=result)
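
# A minimal sketch of running and exercising this app, assuming uvicorn is
# installed and the default port 8000 is free. The JSON fields mirror the
# GenerationRequest attributes used above (prompt, temperature, max_tokens,
# stream); the sampling values in the example are illustrative only.
#
# Example streaming request (-N disables curl's output buffering so SSE
# chunks print as they arrive):
#   curl -N -X POST http://localhost:8000/generate \
#     -H "Content-Type: application/json" \
#     -d '{"prompt": "Hello", "temperature": 0.7, "max_tokens": 256, "stream": true}'
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)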