Update Llama2.py

c462c28 verified almost 2 years ago

12.8 kB

	import tensorflow as tf
	from tensorflow.keras.layers import Dense,Dropout
	from tensorflow.keras.initializers import RandomNormal
	from tensorflow.keras.regularizers import L2
	from tensorflow.keras import Model
	import math
	from dataclasses import dataclass
	from typing import Optional


	@dataclass
	class ModelArgs:
	# default hyperparameters for the Llama 7B model
	dim: int = 4096
	n_layers: int = 32
	n_heads: int = 32
	n_kv_heads: Optional[int] = None
	vocab_size: int = 32000
	hidden_dim: Optional[int] = None
	multiple_of: int = 256 # MLP hidden layer size will be multiple of
	norm_eps: float = 1e-5
	max_seq_len: int = 2048
	dropout: float = 0.0
	weight_decay: float = 0.1


	class RMSNorm(tf.keras.layers.Layer):
	def __init__(self, dim: int, eps: float):
	self.eps = eps
	self.weight = self.add_weight(
	name='weight',
	shape=(self.dim,),
	initializer=tf.keras.initializers.Ones(),
	trainable=True
	)

	def _norm(self, x):
	return x * tf.math.rsqrt(tf.reduce_mean(tf.math.pow(x, 2), -1, keepdims=True) + self.eps)

	def __call__(self, x):
	output = tf.cast(self._norm(tf.cast(x, 'float32')), x.dtype)
	return output * self.weight


	def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
	freqs = 1.0 / (theta ** (tf.cast(tf.range(0, dim, 2)[: (dim // 2)], 'float32') / dim))
	t = tf.range(end) # type: ignore
	freqs = tf.cast(tf.experimental.numpy.outer(t, freqs), 'float32') # type: ignore
	freqs_cos = tf.math.cos(freqs) # real part
	freqs_sin = tf.math.sin(freqs) # imaginary part
	return freqs_cos, freqs_sin

	def reshape_for_broadcast(freqs_cis, x):
	ndim = x.ndim
	assert 0 <= 1 < ndim
	assert freqs_cis.shape == (x.shape[1], x.shape[-1])
	shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
	return tf.reshape(freqs_cis, shape)

	def apply_rotary_emb(
	xq,
	xk,
	freqs_cos,
	freqs_sin
	):

	# reshape xq and xk to match the complex representation
	xq_r, xq_i = tf.unstack(tf.reshape(tf.cast(xq, 'float32'), (xq.shape[:-1] + (xq.shape[-1] // 2, 2))), axis=-1)
	xk_r, xk_i = tf.unstack(tf.reshape(tf.cast(xk, 'float32'), (xk.shape[:-1] + (xk.shape[-1] // 2, 2))), axis=-1)

	# reshape freqs_cos and freqs_sin for broadcasting
	freqs_cos = reshape_for_broadcast(freqs_cos, xq_r)
	freqs_sin = reshape_for_broadcast(freqs_sin, xq_r)

	# apply rotation using real numbers
	xq_out_r = xq_r * freqs_cos - xq_i * freqs_sin
	xq_out_i = xq_r * freqs_sin + xq_i * freqs_cos
	xk_out_r = xk_r * freqs_cos - xk_i * freqs_sin
	xk_out_i = xk_r * freqs_sin + xk_i * freqs_cos

	# flatten last two dimensions
	xq_out = tf.stack([xq_out_r, xq_out_i], axis=-1)
	shape = xq_out.shape
	xq_out = tf.reshape(xq_out, [-1, shape[1], shape[2], shape[3] * shape[4]])
	xk_out = tf.stack([xk_out_r, xk_out_i], axis=-1)
	shape = xk_out.shape
	xk_out = tf.reshape(xk_out, [-1, shape[1], shape[2], shape[3] * shape[4]])

	return tf.cast(xq_out, xq.dtype), tf.cast(xk_out, xk.dtype)

	def repeat_kv(x, n_rep: int):
	bs, slen, n_kv_heads, head_dim = x.shape
	if n_rep == 1:
	return x
	return tf.reshape(tf.tile(x[:, :, :, None, :], [1, 1, 1, n_rep, 1]), (bs, slen, n_kv_heads * n_rep, head_dim))

	class Attention:
	def __init__(self, args: ModelArgs):
	self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads
	assert args.n_heads % self.n_kv_heads == 0
	model_parallel_size = 1
	self.n_local_heads = args.n_heads // model_parallel_size
	self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
	self.n_rep = self.n_local_heads // self.n_local_kv_heads
	self.head_dim = args.dim // args.n_heads
	self.wq = Dense(args.n_heads * self.head_dim, kernel_initializer=RandomNormal(stddev=0.02),
	kernel_regularizer=L2(args.weight_decay), use_bias=False)
	self.wk = Dense(self.n_kv_heads * self.head_dim, kernel_initializer=RandomNormal(stddev=0.02),
	kernel_regularizer=L2(args.weight_decay), use_bias=False)
	self.wv = Dense(self.n_kv_heads * self.head_dim, kernel_initializer=RandomNormal(stddev=0.02),
	kernel_regularizer=L2(args.weight_decay), use_bias=False)
	self.wo = Dense(args.dim, kernel_initializer=RandomNormal(stddev=0.02/math.sqrt(2 * args.n_layers)),
	kernel_regularizer=L2(args.weight_decay), use_bias=False)
	self.attn_dropout = Dropout(args.dropout)
	self.resid_dropout = Dropout(args.dropout)
	self.mask = tf.fill((args.max_seq_len, args.max_seq_len), float("-inf"))
	self.mask = tf.linalg.band_part(self.mask, 0, -1)
	self.mask = tf.linalg.set_diag(self.mask, tf.zeros(args.max_seq_len))
	self.mask = tf.reshape(self.mask, (1, 1, *self.mask.shape))

	def __call__(
	self,
	x,
	freqs_cos,
	freqs_sin,
	):
	bsz, seqlen, _ = x.shape

	# QKV
	xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
	xq = tf.reshape(xq, (bsz, seqlen, self.n_local_heads, self.head_dim))
	xk = tf.reshape(xk, (bsz, seqlen, self.n_local_kv_heads, self.head_dim))
	xv = tf.reshape(xv, (bsz, seqlen, self.n_local_kv_heads, self.head_dim))

	# RoPE relative positional embeddings
	xq, xk = apply_rotary_emb(xq, xk, freqs_cos, freqs_sin)

	# grouped multiquery attention: expand out keys and values
	xk = repeat_kv(xk, self.n_rep) # (bs, seqlen, n_local_heads, head_dim)
	xv = repeat_kv(xv, self.n_rep) # (bs, seqlen, n_local_heads, head_dim)

	# make heads into a batch dimension
	xq = tf.transpose(xq, (0, 2, 1, 3)) # (bs, n_local_heads, seqlen, head_dim)
	xk = tf.transpose(xk, (0, 2, 1, 3))
	xv = tf.transpose(xv, (0, 2, 1, 3))

	scores = tf.matmul(xq, tf.transpose(xk, (0, 1, 3, 2))) / math.sqrt(self.head_dim)
	assert hasattr(self, 'mask')
	scores = scores + self.mask[:, :, :seqlen, :seqlen] # (bs, n_local_heads, seqlen, cache_len + seqlen)
	scores = tf.cast(tf.nn.softmax(tf.cast(scores, 'float32'), axis=-1), xq.dtype)
	scores = self.attn_dropout(scores)
	output = tf.matmul(scores, xv) # (bs, n_local_heads, seqlen, head_dim)

	# restore time as batch dimension and concat heads
	output = tf.reshape(tf.transpose(output, (0, 2, 1, 3)), (bsz, seqlen, -1))

	# final projection into the residual stream
	output = self.wo(output)
	output = self.resid_dropout(output)
	return output


	class FeedForward:
	def __init__(self, dim: int, hidden_dim: int, multiple_of: int, drop_rate: float):
	if hidden_dim is None:
	hidden_dim = 4 * dim
	hidden_dim = int(2 * hidden_dim / 3)
	hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
	self.w1 = Dense(hidden_dim, kernel_initializer=RandomNormal(stddev=0.02),
	kernel_regularizer=L2(ModelArgs.weight_decay), use_bias=False)
	self.w2 = Dense(dim, kernel_initializer=RandomNormal(stddev=0.02),
	kernel_regularizer=L2(ModelArgs.weight_decay), use_bias=False)
	self.w3 = Dense(hidden_dim, kernel_initializer=RandomNormal(stddev=0.02/math.sqrt(2 * ModelArgs.n_layers)),
	kernel_regularizer=L2(ModelArgs.weight_decay), use_bias=False)
	self.dropout = Dropout(drop_rate)

	def __call__(self, x):
	return self.dropout(self.w2(tf.nn.silu(self.w1(x)) * self.w3(x)))


	class TransformerBlock:
	def __init__(self, layer_id: int, args: ModelArgs):
	self.n_heads = args.n_heads
	self.dim = args.dim
	self.head_dim = args.dim // args.n_heads
	self.attention = Attention(args)
	self.feed_forward = FeedForward(
	dim=args.dim,
	hidden_dim=args.hidden_dim,
	multiple_of=args.multiple_of,
	drop_rate=args.dropout,
	)
	self.layer_id = layer_id
	self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
	self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)

	def __call__(self, x, freqs_cos, freqs_sin):
	h = x + self.attention(self.attention_norm(x), freqs_cos, freqs_sin)
	out = h + self.feed_forward(self.ffn_norm(h))
	return out


	class Llama2(Model):
	def __init__(self, params: ModelArgs):
	super(Llama2, self).__init__()
	self.params = params
	self.vocab_size = params.vocab_size
	self.n_layers = params.n_layers

	self.dropout = Dropout(params.dropout)
	self.layers = []
	for layer_id in range(params.n_layers):
	self.layers.append(TransformerBlock(layer_id, params))
	self.norm = RMSNorm(params.dim, eps=params.norm_eps)
	self.output = Dense(params.vocab_size, kernel_initializer=RandomNormal(stddev=0.02),
	kernel_regularizer=L2(params.weight_decay), use_bias=False)

	# some useful precompute for the RoPE relative positional embeddings
	self.freqs_cos, self.freqs_sin = precompute_freqs_cis(self.params.dim // self.params.n_heads, self.params.max_seq_len)

	def __call__(self, tokens):
	_bsz, seqlen = tokens.shape
	h = tf.gather(tf.transpose(self.output.weight), tokens)
	h = self.dropout(h)
	freqs_cos = self.freqs_cos[:seqlen]
	freqs_sin = self.freqs_sin[:seqlen]

	for layer in self.layers:
	h = layer(h, freqs_cos, freqs_sin)
	h = self.norm(h)

	if self.training:
	# if we are given some desired targets also calculate the loss
	logits = self.output(h)
	else:
	# inference-time mini-optimization: only forward the output on the very last position
	logits = self.output(h[:, [-1], :]) # note: using list [-1] to preserve the time dim

	return logits

	def estimate_mfu(self, fwdbwd_per_iter, dt):
	""" estimate model flops utilization (MFU) in units of A100 bfloat16 peak FLOPS """
	# first estimate the number of flops we do per iteration.
	# see PaLM paper Appendix B as ref: https://arxiv.org/abs/2204.02311
	N = sum(p.numel() for p in self.parameters())
	cfg = self.params
	L, H, Q, T = cfg.n_layers, cfg.n_heads, cfg.dim//cfg.n_heads, cfg.max_seq_len
	flops_per_token = 6N + 12LHQ*T
	flops_per_fwdbwd = flops_per_token * T
	flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
	# express our flops throughput as ratio of A100 bfloat16 peak flops
	flops_achieved = flops_per_iter * (1.0/dt) # per second
	flops_promised = 312e12 # A100 GPU bfloat16 peak flops is 312 TFLOPS
	mfu = flops_achieved / flops_promised
	return mfu

	def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
	"""
	Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
	the sequence max_new_tokens times, feeding the predictions back into the model each time.
	Most likely you'll want to make sure to be in model.eval() mode of operation for this.
	Also note this is a super inefficient version of sampling with no key/value cache.
	"""
	for _ in range(max_new_tokens):
	# if the sequence context is growing too long we must crop it at block_size
	idx_cond = idx if idx.size(1) <= self.params.max_seq_len else idx[:, -self.params.max_seq_len:]
	# forward the model to get the logits for the index in the sequence
	logits = self(idx_cond)
	logits = logits[:, -1, :] # crop to just the final time step
	if temperature == 0.0:
	# "sample" the single most likely index
	idx_next = tf.math.argmax(logits, axis=-1)
	else:
	# pluck the logits at the final step and scale by desired temperature
	logits = logits / temperature
	# optionally crop the logits to only the top k options
	if top_k is not None:
	k = tf.minimum(top_k, logits.shape[-1])
	v, _ = tf.math.top_k(logits, k=k, sorted=True)
	logits[logits < v[:, [-1]]] = -float('Inf')
	# apply softmax to convert logits to (normalized) probabilities
	probs = tf.nn.softmax(logits, dim=-1)
	idx_next = tf.random.categorical(tf.math.log(probs), num_samples=1)
	# append sampled index to the running sequence and continue
	idx = tf.concat((idx, idx_next), axis=1)

	return idx