Spaces:

usermma
/

NanoWhale-0.1B

Sleeping

File size: 2,495 Bytes

a3a93be
 
 
 
 
 
b1f8d2a
a3a93be
 
 
b1f8d2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a3a93be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b1f8d2a
 
a3a93be
 
 
 
 
 
 
 
 
 
 
 
 
b1f8d2a
a3a93be
 
 
6036846
e205106
a3a93be
e205106
cd4f81e
a3a93be

import torch
from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse
from safetensors.torch import load_file
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import hf_hub_download
from pathlib import Path

# ASGI application object; the route decorators further down register on it.
app = FastAPI()

# Page served at "/". A minimal placeholder is written only when no
# index.html exists in the current working directory, so a real page that
# is already present is never overwritten.
HTML_FILE = Path("index.html")
if not HTML_FILE.exists():
    # Write explicitly as UTF-8: the placeholder contains a non-ASCII emoji
    # and the file is read back as UTF-8, so relying on the platform default
    # encoding could raise UnicodeEncodeError or yield an undecodable file.
    with open(HTML_FILE, "w", encoding="utf-8") as f:
        f.write("""<!DOCTYPE html>
<html>
<head><title>nanoWhale-100m</title></head>
<body style="background:#0b1120; color:white; font-family:sans-serif; padding:20px;">
<h1>🐳 nanoWhale-100m</h1>
<p>Loading...</p>
</body>
</html>""")

def get_html():
    """Return the text of the index page.

    The file at ``HTML_FILE`` is re-read from disk on every call and decoded
    as UTF-8.
    """
    return HTML_FILE.read_text(encoding="utf-8")

# One-time startup: build the architecture from its published config, load
# the safetensors checkpoint into it, then move it to the selected device.
MODEL_REPO = "HuggingFaceTB/nanowhale-100m"

print("Loading nanoWhale-100m model...")
config = AutoConfig.from_pretrained(MODEL_REPO, trust_remote_code=True)
model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
model = model.float()

# Fetch the checkpoint file from the Hub and load it with strict key
# matching, so any mismatch between checkpoint and model raises here.
weights_path = hf_hub_download(MODEL_REPO, "model.safetensors")
state_dict = load_file(weights_path)
model.load_state_dict(state_dict, strict=True)
model = model.eval()

tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)

# Prefer CUDA when torch reports it as available; otherwise stay on CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model = model.to(device)
print(f"Model loaded on {device}")

@app.get("/", response_class=HTMLResponse)
async def get_index():
    """Serve the index page as an HTML response with status 200."""
    page = get_html()
    return HTMLResponse(status_code=200, content=page)

@app.post("/generate")
async def generate_text(request: Request):
    """Generate a short chat completion for the prompt in a JSON request body.

    The body is expected to be a JSON object with a string ``"prompt"``
    field. The prompt is wrapped as a single user message, rendered through
    the tokenizer's chat template, and up to 30 new tokens are sampled from
    the model.

    Returns:
        ``{"response": <generated text>}`` on success, or
        ``{"error": <message>}`` when the body is not valid JSON, is not a
        JSON object, the prompt is missing/empty/not a string, or generation
        raises. Errors are reported in the response body, not as an HTTP
        error status.
    """
    # Parse defensively: a malformed body would otherwise escape as an
    # unhandled exception, and a non-object body would fail on ``.get``.
    try:
        data = await request.json()
    except ValueError:  # covers JSONDecodeError and UnicodeDecodeError
        return {"error": "Invalid JSON body"}
    if not isinstance(data, dict):
        return {"error": "Request body must be a JSON object"}

    user_prompt = data.get("prompt", "")
    if not user_prompt:
        return {"error": "No prompt provided"}
    if not isinstance(user_prompt, str):
        return {"error": "Prompt must be a string"}

    try:
        messages = [{"role": "user", "content": user_prompt}]
        prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        # The rendered chat template already contains whatever special
        # tokens it needs; tokenizing it again with the default
        # add_special_tokens=True could prepend/append duplicates.
        input_ids = tokenizer.encode(
            prompt, return_tensors="pt", add_special_tokens=False
        ).to(device)

        # NOTE(review): generate() is a synchronous call inside an async
        # handler, so other requests wait while it runs. It is deliberately
        # not offloaded to a worker thread here because the thread-safety of
        # the remote-code model is not known - confirm before changing.
        with torch.no_grad():
            output = model.generate(
                input_ids,
                max_new_tokens=30,
                temperature=0.3,
                top_p=0.9,
                repetition_penalty=1.0,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )

        # Drop the leading prompt tokens so only the continuation is decoded.
        generated = output[0][input_ids.shape[1]:]
        response_text = tokenizer.decode(generated, skip_special_tokens=True)
        return {"response": response_text}

    except Exception as e:
        # Request boundary: report any tokenization/generation failure to
        # the client in the same {"error": ...} shape as the checks above.
        return {"error": str(e)}