# ── StoryGPT / app.py (Hugging Face Space entry point) ────────────────────────
import gradio as gr
import torch
import torch.nn.functional as F
# ── Inline all model code so the Space is self-contained ──────────────────────
import torch.nn as nn
import math
# ── RMSNorm ───────────────────────────────────────────────────────────────────
class RMSNorm(nn.Module):
    """Root-mean-square layer norm: rescale to unit RMS, then apply a learned gain."""
    def __init__(self, cfg, eps=1e-8):
        super().__init__()
        self.eps = eps
        self.gamma = nn.Parameter(torch.ones(cfg["emb_dim"]))

    def forward(self, x):
        rms = x.pow(2).mean(dim=-1, keepdim=True).sqrt()
        return (x / (rms + self.eps)) * self.gamma
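# Quick sanity check (illustrative, not executed at startup): with gamma at its
# initial value of ones, the output should have RMS close to 1 along the last dim.
#   norm = RMSNorm({"emb_dim": 512})
#   y = norm(torch.randn(2, 10, 512))
#   print(y.pow(2).mean(dim=-1).sqrt())  # values close to 1.0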
# ── SwiGLU FFN ────────────────────────────────────────────────────────────────
class SwiGLUFFN(nn.Module):
    """Feed-forward network with a SwiGLU gate: silu(w1(x)) * w2(x), projected back by w3."""
    def __init__(self, cfg, hidden_dim):
        super().__init__()
        self.w1 = nn.Linear(cfg["emb_dim"], hidden_dim)
        self.w2 = nn.Linear(cfg["emb_dim"], hidden_dim)
        self.w3 = nn.Linear(hidden_dim, cfg["emb_dim"])

    def forward(self, x):
        return self.w3(F.silu(self.w1(x)) * self.w2(x))
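# Shape check (illustrative): the FFN maps (..., emb_dim) -> (..., emb_dim),
# with hidden_dim as the width of the gated intermediate layer.
#   ffn = SwiGLUFFN({"emb_dim": 512}, hidden_dim=1376)
#   assert ffn(torch.randn(2, 10, 512)).shape == (2, 10, 512)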
# ── RoPE ──────────────────────────────────────────────────────────────────────
def precompute_rope_freqs(dim, max_seq_len, theta=10000.0):
    """Precompute cos/sin tables for rotary position embeddings (RoPE)."""
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2).float() / dim))
    positions = torch.arange(max_seq_len).float()
    angles = torch.outer(positions, freqs)  # (max_seq_len, dim // 2)
    return angles.cos(), angles.sin()

def apply_rope(x, cos, sin):
    """Rotate each (even, odd) channel pair of x by a position-dependent angle."""
    x_even = x[..., 0::2]
    x_odd = x[..., 1::2]
    cos = cos[:x.shape[2]].unsqueeze(0).unsqueeze(0)  # broadcast over batch and heads
    sin = sin[:x.shape[2]].unsqueeze(0).unsqueeze(0)
    out = torch.stack([x_even * cos - x_odd * sin,
                       x_even * sin + x_odd * cos], dim=-1).flatten(-2)
    return out
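# Sanity check (illustrative): RoPE is a pure rotation, so it preserves both the
# tensor shape and the norm along the head dimension. x is laid out as
# (batch, heads, seq, head_dim).
#   cos, sin = precompute_rope_freqs(dim=64, max_seq_len=128)
#   x = torch.randn(1, 8, 16, 64)
#   y = apply_rope(x, cos, sin)
#   assert y.shape == x.shape
#   assert torch.allclose(x.norm(dim=-1), y.norm(dim=-1), atol=1e-5)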
# ── Grouped Query Attention ───────────────────────────────────────────────────
class GroupedQueryAttention(nn.Module):
    """Causal self-attention where num_heads query heads share n_kv_heads key/value heads."""
    def __init__(self, d_in, d_out, num_heads, n_kv_heads, context_length, dropout):
        super().__init__()
        assert d_out % num_heads == 0
        assert num_heads % n_kv_heads == 0
        self.d_out = d_out
        self.num_heads = num_heads
        self.n_kv_heads = n_kv_heads
        self.dim_head = d_out // num_heads
        self.n_rep = num_heads // n_kv_heads  # query heads per KV head
        kv_dim = n_kv_heads * self.dim_head
        self.W_Query = nn.Linear(d_in, d_out, bias=False)
        self.W_Key = nn.Linear(d_in, kv_dim, bias=False)
        self.W_Value = nn.Linear(d_in, kv_dim, bias=False)
        self.dropout = nn.Dropout(dropout)
        # Upper-triangular mask hides future positions (causal attention).
        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))
        cos, sin = precompute_rope_freqs(self.dim_head, context_length)
        self.register_buffer("rope_cos", cos)
        self.register_buffer("rope_sin", sin)

    def forward(self, x):
        b, t, _ = x.shape
        Q = self.W_Query(x).view(b, t, self.num_heads, self.dim_head).transpose(1, 2)
        K = self.W_Key(x).view(b, t, self.n_kv_heads, self.dim_head).transpose(1, 2)
        V = self.W_Value(x).view(b, t, self.n_kv_heads, self.dim_head).transpose(1, 2)
        Q = apply_rope(Q, self.rope_cos, self.rope_sin)
        K = apply_rope(K, self.rope_cos, self.rope_sin)
        # Duplicate each KV head n_rep times so every query head has a matching K/V.
        K = K.repeat_interleave(self.n_rep, dim=1)
        V = V.repeat_interleave(self.n_rep, dim=1)
        scores = Q @ K.transpose(-2, -1)
        mask = self.mask[:t, :t].unsqueeze(0).unsqueeze(0)
        scores = scores.masked_fill(mask.bool(), -torch.inf)
        w = self.dropout(torch.softmax(scores / self.dim_head ** 0.5, dim=-1))
        out = (w @ V).transpose(1, 2).contiguous().view(b, t, self.d_out)
        return out
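# Usage sketch (illustrative): with the config below, 8 query heads share 4 KV
# heads, so the K/V projections are half the size of the Q projection.
#   gqa = GroupedQueryAttention(512, 512, num_heads=8, n_kv_heads=4,
#                               context_length=512, dropout=0.0)
#   assert gqa(torch.randn(2, 16, 512)).shape == (2, 16, 512)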
# ── Transformer Block ─────────────────────────────────────────────────────────
class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention and FFN sublayers, each with a residual connection."""
    def __init__(self, cfg):
        super().__init__()
        self.norm1 = RMSNorm(cfg)
        self.GQA = GroupedQueryAttention(cfg["emb_dim"], cfg["emb_dim"],
                                         cfg["n_heads"], cfg["n_kv_heads"],
                                         cfg["context_length"], cfg["dropout"])
        self.norm2 = RMSNorm(cfg)
        self.FF = SwiGLUFFN(cfg, cfg["ffn_hidden"])
        self.drop = nn.Dropout(cfg["dropout"])

    def forward(self, x):
        x = x + self.drop(self.GQA(self.norm1(x)))
        x = x + self.drop(self.FF(self.norm2(x)))
        return x
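# Note (illustrative): normalization happens *before* each sublayer (pre-norm),
# so the residual stream itself is never normalized; this is the usual
# LLaMA-style layout and tends to be more stable to train than post-norm.
#   blk = TransformerBlock(MODEL_CONFIG)  # MODEL_CONFIG is defined below
#   assert blk(torch.randn(2, 16, 512)).shape == (2, 16, 512)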
# ── GPT ───────────────────────────────────────────────────────────────────────
class GPT(nn.Module):
    """Decoder-only LM: token embedding -> stacked transformer blocks -> tied output head."""
    def __init__(self, cfg):
        super().__init__()
        self.token_embedding = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.dropout = nn.Dropout(cfg["dropout"])
        self.trf_blocks = nn.Sequential(*[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])
        self.final_norm = RMSNorm(cfg)
        self.output_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)
        # Weight tying: the output projection shares the token-embedding matrix.
        self.output_head.weight = self.token_embedding.weight

    def forward(self, x):
        x = self.dropout(self.token_embedding(x))
        x = self.trf_blocks(x)
        return self.output_head(self.final_norm(x))
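# Forward-pass sketch (illustrative): token ids in, next-token logits out.
#   model = GPT(MODEL_CONFIG)  # MODEL_CONFIG is defined below
#   ids = torch.randint(0, MODEL_CONFIG["vocab_size"], (1, 12))
#   assert model(ids).shape == (1, 12, MODEL_CONFIG["vocab_size"])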
# ── Config ────────────────────────────────────────────────────────────────────
MODEL_CONFIG = {
    "vocab_size": 16384,
    "context_length": 512,
    "emb_dim": 512,
    "n_heads": 8,
    "n_kv_heads": 4,
    "n_layers": 8,
    "ffn_hidden": 1376,
    "dropout": 0.0,
    "norm_eps": 1e-6,  # note: RMSNorm above uses its own eps default (1e-8); this entry is unused
}
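# Parameter-count sketch (illustrative, not executed at startup); tied weights
# are counted once by .parameters():
#   m = GPT(MODEL_CONFIG)
#   print(sum(p.numel() for p in m.parameters()) / 1e6, "M parameters")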
# ── Load model & tokenizer once at startup ────────────────────────────────────
from huggingface_hub import hf_hub_download
from tokenizers import Tokenizer

REPO_ID = "ziadkassem/StoryGPT"
print("Downloading model weights...")
weights_path = hf_hub_download(repo_id=REPO_ID, filename="best_model.pt")
tok_path = hf_hub_download(repo_id=REPO_ID, filename="storygpt_tokenizer.json")
tokenizer = Tokenizer.from_file(tok_path)

device = torch.device("cpu")  # Spaces free tier is CPU only
model = GPT(MODEL_CONFIG)
weights = torch.load(weights_path, map_location=device)
# Checkpoints saved from a DataParallel/DistributedDataParallel wrapper prefix
# every key with "module."; strip it so the keys match this plain module.
if list(weights.keys())[0].startswith("module."):
    weights = {k.replace("module.", ""): v for k, v in weights.items()}
model.load_state_dict(weights)
model.eval()
print("Model loaded!")
# ── Generation logic ──────────────────────────────────────────────────────────
def generate_story(prompt: str, max_tokens: int, temperature: float, top_k: int) -> str:
    max_tokens, top_k = int(max_tokens), int(top_k)  # sliders may deliver floats
    bos_id = tokenizer.token_to_id("<bos>")
    eos_id = tokenizer.token_to_id("<eos>")
    ids = [bos_id] + tokenizer.encode(prompt).ids
    idx = torch.tensor(ids, dtype=torch.long).unsqueeze(0)
    with torch.no_grad():
        for _ in range(max_tokens):
            idx_cond = idx[:, -MODEL_CONFIG["context_length"]:]  # crop to context window
            logits = model(idx_cond)[:, -1, :]                   # next-token logits only
            logits = logits / temperature
            # Top-k filtering: mask out everything below the k-th largest logit.
            v, _ = torch.topk(logits, top_k)
            logits[logits < v[:, [-1]]] = -float("Inf")
            probs = torch.softmax(logits, dim=-1)
            next_id = torch.multinomial(probs, num_samples=1)
            if next_id.item() == eos_id:
                break
            idx = torch.cat([idx, next_id], dim=1)
    return tokenizer.decode(idx.squeeze(0).tolist())
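# Example call (illustrative), mirroring the defaults wired into the UI below:
#   story = generate_story("Once upon a time,", max_tokens=200,
#                          temperature=0.8, top_k=40)
#   print(story)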
# ── Gradio UI ─────────────────────────────────────────────────────────────────
with gr.Blocks(title="StoryGPT", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# 📖 StoryGPT
A **50M parameter** LLaMA-style language model pre-trained from scratch on TinyStories.

Architecture: **GQA · RoPE · RMSNorm · SwiGLU** | Trained with **AMP + Cosine LR** | Perplexity: **4.09**
""")
    with gr.Row():
        with gr.Column(scale=2):
            prompt_box = gr.Textbox(
                label="Story Prompt",
                placeholder="Once upon a time,",
                lines=2,
            )
            with gr.Row():
                max_tokens = gr.Slider(50, 400, value=200, step=10, label="Max Tokens")
                temperature = gr.Slider(0.5, 1.5, value=0.8, step=0.05, label="Temperature")
                top_k = gr.Slider(5, 100, value=40, step=5, label="Top-K")
            btn = gr.Button("✨ Generate Story", variant="primary")
        with gr.Column(scale=3):
            output_box = gr.Textbox(label="Generated Story", lines=12, interactive=False)

    btn.click(
        fn=generate_story,
        inputs=[prompt_box, max_tokens, temperature, top_k],
        outputs=output_box,
    )

    gr.Examples(
        examples=[
            ["Once upon a time, there was a little girl named Lily who"],
            ["One sunny day, a puppy named Max found a"],
            ["The brave knight looked at the dragon and said"],
        ],
        inputs=prompt_box,
    )

demo.launch()