Spaces:

usermma
/

NanoWhale-0.1B

Sleeping

App Files Files Community

NanoWhale-0.1B / app.py

usermma

Update app.py

e205106 verified 14 days ago

raw

history blame contribute delete

2.5 kB

	import torch
	from fastapi import FastAPI, Request
	from fastapi.responses import HTMLResponse
	from safetensors.torch import load_file
	from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
	from huggingface_hub import hf_hub_download
	from pathlib import Path

	# FastAPI application instance; the @app.get / @app.post decorators in this
	# file register their route handlers on this object.
	app = FastAPI()

	# Path of the page that get_html() reads back; resolved relative to the
	# process's current working directory.
	HTML_FILE = Path("index.html")
	if not HTML_FILE.exists():
	    # Seed a minimal placeholder page so that reading HTML_FILE never fails
	    # on a fresh checkout. The encoding is pinned to UTF-8 because the markup
	    # contains a non-ASCII emoji and the file is later read back as UTF-8;
	    # relying on the platform default could raise UnicodeEncodeError or
	    # produce mojibake on non-UTF-8 locales.
	    with open(HTML_FILE, "w", encoding="utf-8") as f:
	        f.write("""<!DOCTYPE html>
	<html>
	<head><title>nanoWhale-100m</title></head>
	<body style="background:#0b1120; color:white; font-family:sans-serif; padding:20px;">
	<h1>🐳 nanoWhale-100m</h1>
	<p>Loading...</p>
	</body>
	</html>""")

	def get_html():
	    """Return the full text of the page stored at HTML_FILE, decoded as UTF-8.

	    The file is re-read on every call, so edits to it on disk are picked up
	    without restarting the process.
	    """
	    return HTML_FILE.read_text(encoding="utf-8")

	# ---- One-time model bootstrap (runs when this module is imported) ----
	print("Loading nanoWhale-100m model...")

	# Instantiate the architecture from its config only; the weights are loaded
	# separately below. trust_remote_code=True allows the repository's own
	# modelling code to be executed.
	config = AutoConfig.from_pretrained(
	    "HuggingFaceTB/nanowhale-100m",
	    trust_remote_code=True,
	)
	model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
	model = model.float()  # cast floating-point parameters to float32

	# Download (or reuse from the local hub cache) the checkpoint and load it.
	# strict=True makes any missing or unexpected key raise instead of being
	# silently ignored.
	weights_path = hf_hub_download(
	    repo_id="HuggingFaceTB/nanowhale-100m",
	    filename="model.safetensors",
	)
	state_dict = load_file(weights_path)
	model.load_state_dict(state_dict, strict=True)
	model.eval()  # inference mode; eval() returns the same module object

	tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/nanowhale-100m")

	# Use the GPU when one is visible to torch, otherwise stay on the CPU.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"
	model = model.to(device)
	print(f"Model loaded on {device}")

	@app.get("/", response_class=HTMLResponse)
	async def get_index():
	    """Serve the index page: the current contents of the HTML file, HTTP 200."""
	    page = get_html()
	    return HTMLResponse(content=page, status_code=200)

	def _generate_reply(user_prompt):
	    """Blocking helper: run one single-turn chat completion and return its text.

	    Wraps ``user_prompt`` as a single user message, renders it through the
	    tokenizer's chat template, samples up to 30 new tokens and returns only
	    the newly generated text with special tokens stripped.
	    """
	    messages = [{"role": "user", "content": user_prompt}]
	    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
	    # NOTE(review): encode() adds special tokens by default; if this model's
	    # chat template already emits them, they would be duplicated here.
	    # Confirm against the tokenizer before changing.
	    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

	    # Gradient mode is thread-local, so no_grad must be entered in the thread
	    # that actually runs generate(), i.e. here rather than in the async caller.
	    with torch.no_grad():
	        output = model.generate(
	            input_ids,
	            max_new_tokens=30,
	            temperature=0.3,
	            top_p=0.9,
	            repetition_penalty=1.0,
	            do_sample=True,
	            pad_token_id=tokenizer.eos_token_id,
	        )

	    # Drop the first len(prompt) tokens so only the continuation is decoded.
	    generated = output[0][input_ids.shape[1]:]
	    return tokenizer.decode(generated, skip_special_tokens=True)


	@app.post("/generate")
	async def generate_text(request: Request):
	    """Generate a short model reply for the ``prompt`` field of a JSON body.

	    Args:
	        request: Incoming request whose body should be a JSON object with a
	            string ``prompt`` key.

	    Returns:
	        ``{"response": <text>}`` on success, otherwise ``{"error": <message>}``.
	        Errors are reported in the body (not via the HTTP status) to stay
	        compatible with existing clients of this endpoint.
	    """
	    import logging

	    from fastapi.concurrency import run_in_threadpool

	    # A malformed or empty body makes request.json() raise a ValueError
	    # subclass; report it in the same shape as the other errors instead of
	    # letting it surface as an unhandled 500.
	    try:
	        data = await request.json()
	    except ValueError:
	        return {"error": "Invalid JSON body"}
	    if not isinstance(data, dict):
	        return {"error": "Request body must be a JSON object"}

	    user_prompt = data.get("prompt", "")
	    if not user_prompt:
	        return {"error": "No prompt provided"}
	    if not isinstance(user_prompt, str):
	        return {"error": "Prompt must be a string"}

	    try:
	        # generate() is blocking compute; running it in a worker thread keeps
	        # the event loop free to serve other requests in the meantime.
	        response_text = await run_in_threadpool(_generate_reply, user_prompt)
	    except Exception as e:
	        # Boundary handler: keep the traceback in the server log, and return
	        # the message to the caller as the original endpoint did.
	        logging.getLogger(__name__).exception("Text generation failed")
	        return {"error": str(e)}
	    return {"response": response_text}