File size: 2,495 Bytes
a3a93be
 
 
 
 
 
b1f8d2a
a3a93be
 
 
b1f8d2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a3a93be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b1f8d2a
 
a3a93be
 
 
 
 
 
 
 
 
 
 
 
 
b1f8d2a
a3a93be
 
 
6036846
e205106
a3a93be
e205106
cd4f81e
a3a93be
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import torch
from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse
from safetensors.torch import load_file
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import hf_hub_download
from pathlib import Path

# ASGI application object; the route decorators further down register their
# handlers on this instance.
app = FastAPI()

# Page returned for "/". A minimal placeholder is written on first start so
# the app can still serve something when no front-end file is present.
HTML_FILE = Path("index.html")
if not HTML_FILE.exists():
    # The placeholder contains a non-ASCII emoji, so the encoding is pinned to
    # UTF-8 instead of the platform default (e.g. cp1252 on Windows cannot
    # encode it). This also matches the UTF-8 decoding used when the file is
    # read back for serving.
    with open(HTML_FILE, "w", encoding="utf-8") as f:
        f.write("""<!DOCTYPE html>
<html>
<head><title>nanoWhale-100m</title></head>
<body style="background:#0b1120; color:white; font-family:sans-serif; padding:20px;">
<h1>🐳 nanoWhale-100m</h1>
<p>Loading...</p>
</body>
</html>""")

def get_html():
    """Return the current contents of ``HTML_FILE`` decoded as UTF-8.

    The file is re-read on every call rather than cached.
    """
    return HTML_FILE.read_text(encoding="utf-8")

# One-time model start-up. This runs at import time, so importing the module
# triggers the hub downloads and the weight load below.
print("Loading nanoWhale-100m model...")
# Build the architecture from the hub config alone (from_config does not load
# weights), then cast it with .float(). trust_remote_code=True allows model
# code shipped in the hub repository to be executed.
config = AutoConfig.from_pretrained("HuggingFaceTB/nanowhale-100m", trust_remote_code=True)
model = AutoModelForCausalLM.from_config(config, trust_remote_code=True).float()

# Fetch the checkpoint file and load it into the freshly built model.
# strict=True makes any missing or unexpected parameter name raise instead of
# being silently ignored.
weights_path = hf_hub_download("HuggingFaceTB/nanowhale-100m", "model.safetensors")
state_dict = load_file(weights_path)
model.load_state_dict(state_dict, strict=True)
model = model.eval()  # switch modules such as dropout to evaluation behavior

# NOTE(review): the tokenizer is loaded without trust_remote_code while the
# model uses it; confirm the tokenizer needs no custom code.
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/nanowhale-100m")
# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
print(f"Model loaded on {device}")

@app.get("/", response_class=HTMLResponse)
async def get_index():
    """Serve the landing page: the HTML file's current contents with status 200."""
    page = get_html()
    return HTMLResponse(content=page, status_code=200)

@app.post("/generate")
async def generate_text(request: Request):
    """Generate a short chat reply for the prompt in the JSON request body.

    The body is expected to be a JSON object with a string ``"prompt"`` field.
    The prompt is wrapped as a single user turn, rendered through the
    tokenizer's chat template, and up to 30 new tokens are sampled.

    Returns:
        ``{"response": <generated text>}`` on success, otherwise
        ``{"error": <message>}`` when the body is not valid JSON, the prompt
        is missing, empty or not a string, or generation fails. Failures are
        reported in the payload rather than raised.
    """
    import logging  # function-scope import keeps this handler self-contained

    # A malformed body previously escaped as an unhandled exception; report it
    # through the same {"error": ...} channel as every other failure.
    try:
        data = await request.json()
    except ValueError:  # covers json.JSONDecodeError and UnicodeDecodeError
        return {"error": "Request body must be valid JSON"}

    # Valid JSON is not necessarily an object (it may be a list, number, ...).
    user_prompt = data.get("prompt", "") if isinstance(data, dict) else ""

    if not user_prompt:
        return {"error": "No prompt provided"}
    if not isinstance(user_prompt, str):
        return {"error": "Prompt must be a string"}
    if not user_prompt.strip():
        # Whitespace-only input carries no prompt either.
        return {"error": "No prompt provided"}

    try:
        messages = [{"role": "user", "content": user_prompt}]
        prompt = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        # The rendered template already contains the special tokens it needs,
        # so the tokenizer must not add its own again (avoids e.g. a doubled
        # BOS token at the start of the sequence).
        input_ids = tokenizer.encode(
            prompt, return_tensors="pt", add_special_tokens=False
        ).to(device)

        # NOTE(review): generate() is a synchronous call inside an async
        # handler, so the event loop is blocked until it returns. Consider
        # moving it to a worker thread if concurrent requests matter.
        with torch.no_grad():
            output = model.generate(
                input_ids,
                max_new_tokens=30,
                temperature=0.3,
                top_p=0.9,
                repetition_penalty=1.0,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )

        # Drop the echoed prompt tokens; keep only the newly generated tail.
        generated = output[0][input_ids.shape[1]:]
        response_text = tokenizer.decode(generated, skip_special_tokens=True)
        return {"response": response_text}

    except Exception as e:
        # Top-level boundary: keep the existing error payload for clients, but
        # record the traceback so failures are not silently swallowed.
        logging.getLogger(__name__).exception("Text generation failed")
        return {"error": str(e)}