Spaces:

Angione-Lab
/

FateFormerExplorer

Running

App Files Files Community

FateFormerExplorer / interpretation /attentions.py

kaveh

init

ef814bf 8 days ago

raw

history blame contribute delete

11.3 kB

	import numpy as np
	import torch
	from torch.utils.data import DataLoader, Subset
	from utils.helpers import create_multimodal_model
	from models import SingleTransformer
	from scipy.sparse import csr_matrix

	def filter_idx(dataset, idx):
	"""
	Filter the idx to only return the samples that none of its modalities are all zeros
	Args:
	dataset: Dataset object containing the data.
	idx: List of indices to filter.
	Returns:
	filtered_idx: List of filtered indices.
	"""
	rna = dataset.rna_data
	atac = dataset.atac_data
	flux = dataset.flux_data
	mask = (rna != 0).any(axis=1) & (atac != 0).any(axis=1) & (flux != 0).any(axis=1)
	# filter the idx if the id is in the mask
	filtered_idx = [i for i in idx if mask[i]]

	return filtered_idx


	def analyze_cls_attention(id, fold_results, dataset, model_config, device, indices,
	average_heads=True, return_flow_attention=False):
	"""
	Extracts the attention weights of the validation set of each fold
	Args:
	id: The type of data to use. Must be one of 'RNA', 'ATAC', 'Flux', 'Multi'.
	fold_results: List of dictionaries containing the results of each fold.
	dataset: Dataset object containing the data.
	model_config: Dictionary containing the model configuration.
	device: Device to run the model on.
	sample_type: The type of samples to analyze. Must be one of 'all', 'dead-end', or 'reprogramming'. Defaults to 'all'.
	average_heads: Whether to average the attention weights across heads. Defaults to True.
	Returns:
	all_attention_weights: Numpy array containing the attention weights of the validation set
	"""
	if id not in ['RNA', 'ATAC', 'Flux', 'Multi']:
	raise ValueError("id must be one of 'RNA', 'ATAC', 'Flux', 'Multi'")

	all_attention_weights = []

	for fold in fold_results:

	val_idx = fold['val_idx']
	# filter val_idx if is in indices
	val_idx = [i for i in val_idx if i in indices]

	if id == 'Multi':
	val_idx = filter_idx(dataset, val_idx)

	if len(val_idx) == 0:
	print('No samples of the specified type in the validation set. Skipping...')
	continue

	val_ds = Subset(dataset, val_idx)
	val_loader = DataLoader(val_ds, batch_size=32, shuffle=False)

	if id=='Multi':
	model = create_multimodal_model(model_config, device, use_mlm=False)
	else:
	model = SingleTransformer(id=id, **model_config).to(device)

	model_path = fold['best_model_path']
	state_dict = torch.load(model_path, map_location='cpu')
	model.load_state_dict(state_dict)
	model.eval()

	with torch.no_grad():
	for batch in val_loader:
	x, b, _ = batch
	if isinstance(x, list):
	rna = x[0].to(device)
	atac = x[1].to(device)
	flux = x[2].to(device)
	x = (rna, atac, flux)
	else:
	x = x.to(device)
	b = b.to(device)

	_, _, attention_weights = model(x, b, return_attention=True, return_flow_attention=return_flow_attention)

	if not return_flow_attention:
	if average_heads:
	attention_weights = attention_weights.squeeze(-2).mean(dim=1) # Average across heads (batch, 1, seq_len) -> (batch, seq_len)
	else:
	attention_weights = attention_weights.squeeze(-2) # (batch, num_heads, 1, seq_len) -> (batch, num_heads, seq_len)

	# if hasattr(attention_weights, 'numpy'):
	# attention_weights = attention_weights.cpu().numpy()
	all_attention_weights.append(attention_weights)

	if not return_flow_attention:
	return np.concatenate(all_attention_weights, axis=0) # (n_samples, seq_len) or (n_samples, num_heads, seq_len)
	else:
	att_w = {'rna': [], 'atac': [], 'flux': [], 'cls': []}
	# noew we have a dict. So concatenating all values for each key
	num_layers_mlm = len(all_attention_weights[0]['rna'])
	num_layers_cls = len(all_attention_weights[0]['cls']) if isinstance(all_attention_weights[0]['cls'], list) else 1

	for key in all_attention_weights[0].keys():
	key_all_attentions = []
	for batch_row in all_attention_weights:
	modality_batch_attention_layers = batch_row[key]
	if isinstance(modality_batch_attention_layers, list):
	for i, modality_attention_layers in enumerate(modality_batch_attention_layers):
	modality_batch_attention_layers[i] = modality_attention_layers.cpu()
	key_all_attentions.append(modality_batch_attention_layers)
	else:
	key_all_attentions.append([modality_batch_attention_layers.cpu()])
	# now I have a list of attention weights for each batch in each layer [[layer0_att_weights_batch1, layer1_att_weights_batch1, ...], [layer0_att_weights_batch2, layer1_att_weights_batch2, ...], ...]
	# I want to concatenate all the attention weights for each layer
	num_layers = num_layers_cls if key == 'cls' else num_layers_mlm
	att_w[key] = [torch.cat([layer[i] for layer in key_all_attentions], axis=0) for i in range(num_layers)]
	return att_w


	# def compute_attention_rollout(attention_weights):
	# num_layers = len(attention_weights)
	# combined_attention = torch.eye(attention_weights[0].size(-1)).to(attention_weights[0].device)
	# for layer in range(num_layers):
	# layer_attention = attention_weights[layer].mean(dim=1) # Average over heads
	# combined_attention = torch.matmul(layer_attention, combined_attention)
	# return combined_attention
	def compute_attention_rollout(attention_weights):
	"""
	Computes the attention rollout for a batch of samples.
	Expects attention_weights to be a list (length=num_layers) of tensors
	with shape (batch, num_heads, seq_len, seq_len). For each layer, we average
	over the heads and then compute the rollout per sample.

	Returns:
	rollout: A tensor of shape (batch, seq_len, seq_len) representing the
	effective attention from the input token (typically CLS) to all tokens.
	"""
	num_layers = len(attention_weights)
	# Get batch size and sequence length from the first layer's tensor
	batch_size, num_heads, seq_len, _ = attention_weights[0].shape

	# Initialize the combined attention as the identity matrix for each sample
	combined_attention = torch.eye(seq_len, device=attention_weights[0].device)
	combined_attention = combined_attention.unsqueeze(0).repeat(batch_size, 1, 1)

	for layer in range(num_layers):
	# Average over heads to get a (batch, seq_len, seq_len) tensor for this layer
	layer_attention = attention_weights[layer].mean(dim=1)
	# Update the rollout for each sample using batched matrix multiplication
	combined_attention = torch.bmm(layer_attention, combined_attention)
	return combined_attention
	def multimodal_attention_rollout(all_attention_weights):
	rna_rollout = compute_attention_rollout(all_attention_weights['rna'])
	atac_rollout = compute_attention_rollout(all_attention_weights['atac'])
	flux_rollout = compute_attention_rollout(all_attention_weights['flux'])

	cls_attention = all_attention_weights['cls'][0].mean(dim=1).squeeze(1) # Average over heads

	# Split CLS attention for each modality
	rna_cls_attn, atac_cls_attn, flux_cls_attn = cls_attention.split(
	[rna_rollout.size(1), atac_rollout.size(1), flux_rollout.size(1)], dim=1)

	final_rollout = torch.cat([
	rna_cls_attn.unsqueeze(1) @ rna_rollout,
	atac_cls_attn.unsqueeze(1) @ atac_rollout,
	flux_cls_attn.unsqueeze(1) @ flux_rollout
	], dim=2)

	return final_rollout.squeeze(1) # remove head dimension [samples, tokens]

	def print_top_features(attention_weights, feature_names, top_n=5, modality=None):
	print(f"\nTop {top_n} attended features ({modality} samples):")
	avg_attention = attention_weights.mean(axis=0).numpy() if hasattr(attention_weights, 'numpy') else attention_weights.mean(axis=0)
	top_indices = avg_attention.argsort()[-top_n:][::-1]
	for i in top_indices:
	print(f"{feature_names[i]}: {avg_attention[i]:.4f}")

	def get_top_features(attention_weights, feature_names, top_n=100, modality=None):
	ls = []
	avg_attention = attention_weights.mean(axis=0).numpy() if hasattr(attention_weights, 'numpy') else attention_weights.mean(axis=0)
	if top_n:
	top_indices = avg_attention.argsort()[-top_n:][::-1]
	else:
	top_indices = avg_attention.argsort()[::-1]

	for i in top_indices:
	ls.append((feature_names[i],avg_attention[i]))
	return ls

	from scipy.sparse.csgraph import maximum_flow

	def compute_attention_flow(attention_weights):
	num_layers = len(attention_weights)
	num_tokens = attention_weights[0].size(-1)

	# Create adjacency matrix for the flow network
	adj_matrix = np.zeros((num_layers * num_tokens, num_layers * num_tokens))

	for i in range(num_layers - 1):
	layer_attention = attention_weights[i].mean(dim=1).cpu().numpy() # Average over heads
	start_idx = i * num_tokens
	end_idx = (i + 1) * num_tokens
	adj_matrix[start_idx:end_idx, end_idx:(end_idx + num_tokens)] = layer_attention

	for i in range(num_layers - 1):
	start_idx = i * num_tokens
	end_idx = (i + 1) * num_tokens
	adj_matrix[start_idx:end_idx, end_idx:(end_idx + num_tokens)] += np.eye(num_tokens)

	flows = np.zeros((num_tokens, num_tokens))
	for i in range(num_tokens):
	source = i
	for j in range(num_tokens):
	sink = (num_layers - 1) * num_tokens + j
	_, flow = maximum_flow(csr_matrix(adj_matrix), source, sink)
	flows[i, j] = flow

	return torch.tensor(flows, device=attention_weights[0].device)

	def multimodal_attention_flow(all_attention_weights):
	rna_flow = compute_attention_flow(all_attention_weights['rna'])
	atac_flow = compute_attention_flow(all_attention_weights['atac'])
	flux_flow = compute_attention_flow(all_attention_weights['flux'])

	cls_attention = all_attention_weights['cls'][0].mean(dim=1).squeeze(1) # Average over heads

	# Split CLS attention for each modality
	rna_cls_attn, atac_cls_attn, flux_cls_attn = cls_attention.split(
	[rna_flow.size(1), atac_flow.size(1), flux_flow.size(1)], dim=1)

	# Normalize flows
	rna_flow = rna_flow / rna_flow.sum(dim=1, keepdim=True)
	atac_flow = atac_flow / atac_flow.sum(dim=1, keepdim=True)
	flux_flow = flux_flow / flux_flow.sum(dim=1, keepdim=True)

	final_flow = torch.cat([
	rna_cls_attn.unsqueeze(1) @ rna_flow,
	atac_cls_attn.unsqueeze(1) @ atac_flow,
	flux_cls_attn.unsqueeze(1) @ flux_flow
	], dim=2)

	return final_flow.squeeze(1)