"""FastAPI demo server for the nanoWhale-100m causal language model.

Routes:
    GET  /          -- serves the contents of ``index.html``.
    POST /generate  -- takes a JSON object ``{"prompt": "..."}`` and returns
                       ``{"response": "..."}`` on success or
                       ``{"error": "..."}`` on failure.

The model and tokenizer are loaded once, at import time, and shared by all
requests.
"""

import logging
import threading
from pathlib import Path

import torch
from fastapi import FastAPI, Request
from fastapi.concurrency import run_in_threadpool
from fastapi.responses import HTMLResponse
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

logger = logging.getLogger(__name__)

MODEL_ID = "HuggingFaceTB/nanowhale-100m"
_PLACEHOLDER_HTML = "\nLoading...\n"

app = FastAPI()

HTML_FILE = Path("index.html")

# Create a placeholder page on first run so that GET / always has a file to
# serve.  Written as UTF-8 because get_html() reads the file back as UTF-8.
if not HTML_FILE.exists():
    HTML_FILE.write_text(_PLACEHOLDER_HTML, encoding="utf-8")


def get_html() -> str:
    """Return the current contents of ``HTML_FILE``, decoded as UTF-8.

    The file is re-read on every call, so edits to it are picked up without
    restarting the server.
    """
    return HTML_FILE.read_text(encoding="utf-8")


print("Loading nanoWhale-100m model...")

# NOTE: trust_remote_code=True runs Python code shipped in the model
# repository; only use it with repositories you trust.
config = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)

# Build the architecture from its config (float32), then load the checkpoint
# weights explicitly from the safetensors file.
model = AutoModelForCausalLM.from_config(config, trust_remote_code=True).float()
weights_path = hf_hub_download(MODEL_ID, "model.safetensors")
state_dict = load_file(weights_path)
# strict=True: fail loudly if the checkpoint keys do not match the model.
model.load_state_dict(state_dict, strict=True)
model = model.eval()

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
print(f"Model loaded on {device}")

# Generation runs in worker threads (see generate_text).  This lock keeps the
# shared model and tokenizer in use by one request at a time, exactly as when
# generation ran inline on the event loop.
_GENERATION_LOCK = threading.Lock()


def _generate_response(user_prompt):
    """Run one chat-style generation for ``user_prompt`` and return the text.

    Blocking; intended to be called from a worker thread.  Any exception
    raised by the tokenizer or the model propagates to the caller.
    """
    messages = [{"role": "user", "content": user_prompt}]
    with _GENERATION_LOCK:
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        # NOTE(review): encode() adds special tokens by default; if the chat
        # template already emits them they may be duplicated here.  Left
        # unchanged to preserve current model inputs -- confirm against the
        # tokenizer before passing add_special_tokens=False.
        input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

        # Grad mode is thread-local, so no_grad() must be entered in the
        # thread that actually runs generate().
        with torch.no_grad():
            output = model.generate(
                input_ids,
                max_new_tokens=30,
                temperature=0.3,
                top_p=0.9,
                repetition_penalty=1.0,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )

        # generate() returns prompt + continuation; keep only the new tokens.
        generated = output[0][input_ids.shape[1]:]
        return tokenizer.decode(generated, skip_special_tokens=True)


@app.get("/", response_class=HTMLResponse)
async def get_index():
    """Serve the contents of ``index.html`` as an HTML response."""
    return HTMLResponse(content=get_html(), status_code=200)


@app.post("/generate")
async def generate_text(request: Request):
    """Generate a short model reply for the prompt in the JSON request body.

    Expects a JSON object with a ``"prompt"`` key.  Returns
    ``{"response": <text>}`` on success, or ``{"error": <message>}`` when the
    body is not valid JSON, is not a JSON object, has no prompt, or when
    generation fails.
    """
    try:
        data = await request.json()
    except ValueError:
        # Covers json.JSONDecodeError and UnicodeDecodeError (both ValueError
        # subclasses) so a malformed body yields an error payload, not a 500.
        return {"error": "Request body must be valid JSON"}

    if not isinstance(data, dict):
        return {"error": "Request body must be a JSON object"}

    user_prompt = data.get("prompt", "")
    if not user_prompt:
        return {"error": "No prompt provided"}

    try:
        # Generation is slow and blocking; run it off the event loop so other
        # requests (e.g. GET /) stay responsive while it executes.
        response_text = await run_in_threadpool(_generate_response, user_prompt)
    except Exception as exc:
        # Top-level boundary: log the traceback, then report the failure to
        # the client in the same shape as before.
        logger.exception("Text generation failed")
        return {"error": str(exc)}

    return {"response": response_text}