Spaces:
Sleeping
Sleeping
| import torch | |
| from fastapi import FastAPI, Request | |
| from fastapi.responses import HTMLResponse | |
| from safetensors.torch import load_file | |
| from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM | |
| from huggingface_hub import hf_hub_download | |
| from pathlib import Path | |
# FastAPI application instance for this module.
app = FastAPI()

# HTML page returned by get_index() via get_html(); the relative path is
# resolved against the process working directory.
HTML_FILE = Path("index.html")

# Write a minimal placeholder page when none exists so get_html() always has
# a file to read.
if not HTML_FILE.exists():
    # Encode explicitly as UTF-8: the page contains a non-ASCII emoji and
    # get_html() decodes the file as UTF-8, so relying on the platform default
    # encoding can raise UnicodeEncodeError or produce mojibake.
    HTML_FILE.write_text(
        """<!DOCTYPE html>
<html>
<head><title>nanoWhale-100m</title></head>
<body style="background:#0b1120; color:white; font-family:sans-serif; padding:20px;">
<h1>🐳 nanoWhale-100m</h1>
<p>Loading...</p>
</body>
</html>""",
        encoding="utf-8",
    )
def get_html() -> str:
    """Return the full contents of ``HTML_FILE`` decoded as UTF-8 text.

    The file is re-read on every call, so edits to the page on disk are
    picked up without restarting the process.
    """
    return HTML_FILE.read_text(encoding="utf-8")
# Load the model and tokenizer once at import time; the request handlers in
# this module reuse these module-level objects.
print("Loading nanoWhale-100m model...")

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): trust_remote_code=True allows transformers to execute Python
# code shipped in the model repository; no revision is pinned here.
config = AutoConfig.from_pretrained(
    "HuggingFaceTB/nanowhale-100m",
    trust_remote_code=True,
)

# Instantiate the architecture from the config and cast it to float32, then
# populate it from the safetensors checkpoint downloaded from the Hub.
model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
model = model.float()
weights_path = hf_hub_download(
    repo_id="HuggingFaceTB/nanowhale-100m",
    filename="model.safetensors",
)
state_dict = load_file(weights_path)
# strict=True: fail loudly if checkpoint keys and model parameters disagree.
model.load_state_dict(state_dict, strict=True)

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/nanowhale-100m")

# Switch to evaluation mode and move the parameters to the selected device.
model = model.eval().to(device)
print(f"Model loaded on {device}")
# NOTE(review): no route decorator is visible on this handler in this file;
# confirm that it is registered on `app` (e.g. as the index route).
async def get_index():
    """Return the page read by ``get_html()`` as an HTML response (status 200)."""
    page = get_html()
    return HTMLResponse(content=page, status_code=200)
# NOTE(review): no route decorator is visible on this handler in this file;
# confirm that it is registered on `app` and which path the page posts to.
async def generate_text(request: Request):
    """Generate a short chat completion for the prompt in a JSON request body.

    The body is expected to be a JSON object with a ``"prompt"`` key. The
    prompt is wrapped as a single user turn, rendered through the tokenizer's
    chat template, sampled from the module-level ``model`` (at most 30 new
    tokens) and decoded back to text.

    Args:
        request: Incoming request whose body carries the JSON payload.

    Returns:
        ``{"response": <generated text>}`` on success, otherwise
        ``{"error": <message>}`` when the body is not valid JSON, is not a
        JSON object, has no (or an empty) prompt, or generation raises.
    """
    # Parse defensively: a malformed body should produce the same
    # {"error": ...} shape as every other failure instead of an unhandled
    # exception. JSON decoding and byte-decoding errors are ValueError
    # subclasses.
    try:
        data = await request.json()
    except ValueError:
        return {"error": "Invalid JSON body"}
    if not isinstance(data, dict):
        return {"error": "Request body must be a JSON object"}

    user_prompt = data.get("prompt", "")
    if not user_prompt:
        return {"error": "No prompt provided"}

    try:
        messages = [{"role": "user", "content": user_prompt}]
        prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        # The rendered template already contains the special tokens it needs;
        # letting encode() add its own would duplicate them.
        input_ids = tokenizer.encode(
            prompt,
            add_special_tokens=False,
            return_tensors="pt",
        ).to(device)

        # NOTE(review): generate() is a synchronous call inside an async
        # handler, so the event loop is blocked while it runs.
        with torch.no_grad():
            output = model.generate(
                input_ids,
                max_new_tokens=30,
                temperature=0.3,
                top_p=0.9,
                repetition_penalty=1.0,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )

        # generate() returns prompt + continuation; keep only the new tokens.
        generated = output[0][input_ids.shape[1]:]
        response_text = tokenizer.decode(generated, skip_special_tokens=True)
        return {"response": response_text}
    except Exception as e:
        # Top-level boundary for the handler: report the failure to the
        # client in the same JSON shape rather than raising.
        return {"error": str(e)}