togethercomputer
/

m2-bert-80M-2k

Model card Files Files and versions

m2-bert-80M-2k / blockdiag_multiply.py

Dan Fu

Automodel

6471836 over 2 years ago

history blame contribute delete

2.87 kB

	# Adapted from https://github.com/HazyResearch/fly/tree/master/src/models/layers

	import numpy as np
	import torch
	from torch.nn import functional as F
	from einops import rearrange


	def blockdiag_weight_to_dense_weight(weight):
	    """Expand a stack of blocks into the equivalent dense block-diagonal matrix.

	    Arguments:
	        weight: (nblocks, out / nblocks, in / nblocks)
	    Return:
	        dense_weight: (out, in), with block k placed on the diagonal and
	            zeros everywhere else.
	    """
	    # One 2-D tensor per block, split along the leading (block) dimension.
	    blocks = torch.unbind(weight, dim=0)
	    dense_weight = torch.block_diag(*blocks)
	    return dense_weight


	def blockdiag_multiply_reference(x, weight):
	    """Multiply x by a block-diagonal weight, one block at a time (reference).

	    Slow but straightforward; intended as a correctness baseline for the
	    faster implementation.

	    Arguments:
	        x: (..., n)
	        weight: (nblocks, q, n / nblocks)
	    Outputs:
	        out: (..., nblocks * q)
	    """
	    n = x.shape[-1]
	    nblocks, _, p = weight.shape
	    assert nblocks * p == n

	    # Split the feature dimension into nblocks chunks of width p.
	    x_blocks = rearrange(x, '... (nblocks p) -> ... nblocks p', nblocks=nblocks)
	    # Chunk k is multiplied by the transpose of block k: (p,) x (q, p) -> (q,).
	    out_blocks = torch.einsum('...kp, kqp -> ...kq', x_blocks, weight)
	    # Concatenate the per-block outputs back into one feature dimension.
	    return rearrange(out_blocks, '... nblocks q -> ... (nblocks q)')


	class BlockdiagMultiply(torch.autograd.Function):
	    """Block-diagonal matrix multiply with a hand-written backward pass.

	    This is a faster implementation than the reference one, with careful
	    memory layouts so that a single batched matmul (bmm) does all the work
	    and the final reshape needs no extra copy. The backward pass is also
	    written manually in the same way.

	    Arguments:
	        x: (..., n)
	        weight: (nblocks, q, n / nblocks)
	    Outputs:
	        out: (..., nblocks * q)
	    """

	    @staticmethod
	    @torch.cuda.amp.custom_fwd(cast_inputs=torch.bfloat16)
	    def forward(ctx, x, weight):
	        """Compute out[..., k*q:(k+1)*q] = x[..., k*p:(k+1)*p] @ weight[k].T."""
	        ctx.save_for_backward(x, weight)
	        batch_shape, n = x.shape[:-1], x.shape[-1]
	        # np.prod of an empty shape is the float 1.0, which reshape/empty
	        # reject; int() keeps 1-D inputs (no batch dimensions) working.
	        batch_dim = int(np.prod(batch_shape))
	        nblocks, q, p = weight.shape
	        assert nblocks * p == n
	        # (batch, nblocks, p) -> (nblocks, batch, p): bmm batches over blocks.
	        x_reshaped = x.reshape(batch_dim, nblocks, p).transpose(0, 1)
	        # Allocate the result as (batch, nblocks, q) in memory and hand bmm a
	        # transposed view of it, so that transposing back yields a contiguous
	        # tensor and the final reshape is a view.
	        out = torch.empty(batch_dim, nblocks, q, device=x.device, dtype=x.dtype).transpose(0, 1)
	        out = torch.bmm(x_reshaped, weight.transpose(-1, -2), out=out).transpose(0, 1)
	        return out.reshape(*batch_shape, nblocks * q)

	    @staticmethod
	    @torch.cuda.amp.custom_bwd
	    def backward(ctx, dout):
	        """Return (dx, dweight); an entry is None when its gradient is not needed."""
	        x, weight = ctx.saved_tensors
	        batch_shape, n = x.shape[:-1], x.shape[-1]
	        # Same int() conversion as in forward (empty batch shape -> 1).
	        batch_dim = int(np.prod(batch_shape))
	        nblocks, q, p = weight.shape
	        assert nblocks * p == n
	        dx, dweight = None, None
	        # (batch, nblocks, q) -> (nblocks, batch, q), matching forward's layout.
	        dout_reshaped = dout.reshape(batch_dim, nblocks, q).transpose(0, 1)
	        if ctx.needs_input_grad[0]:
	            # dx[k] = dout[k] @ weight[k]; written into a (batch, nblocks, p)
	            # buffer through a transposed view, as in forward.
	            dx = torch.empty(batch_dim, nblocks, p, device=x.device, dtype=x.dtype)
	            dx = torch.bmm(dout_reshaped, weight.conj(),
	                           out=dx.transpose(0, 1)).transpose(0, 1).reshape(*batch_shape, n)
	        if ctx.needs_input_grad[1]:
	            # dweight[k] = dout[k].T @ x[k], summed over the flattened batch.
	            x_reshaped = x.reshape(batch_dim, nblocks, p).transpose(0, 1)
	            dweight = torch.bmm(dout_reshaped.transpose(-1, -2), x_reshaped.conj())
	        return dx, dweight


	# Functional entry point: binds BlockdiagMultiply.apply so the op can be
	# called as blockdiag_multiply(x, weight).
	blockdiag_multiply = BlockdiagMultiply.apply