FateFormerExplorer / data /preprocess_data.py
kaveh's picture
init
ef814bf
import scanpy as sc
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
def filter_rna_cells_genes(adata, min_genes=100, min_cells=10):
"""
Filter cells and genes in RNA data.
Parameters:
- adata (AnnData): Annotated data object containing the RNA data.
- min_genes (int): The minimum number of genes to keep a cell. Default is 100.
- min_cells (int): The minimum number of cells to keep a gene. Default is 10.
Returns:
- adata_filtered (AnnData): Annotated data object containing the filtered RNA data.
"""
sc.pp.filter_cells(adata, min_genes=min_genes)
sc.pp.filter_genes(adata, min_cells=min_cells)
return adata
def get_degs(adata, method='t-test', p_val=0.05,
batch_remove=True, batch_key='batch_no', label_key='label',
reference='dead-end', target='reprogramming'):
"""
Get differentially expressed genes (DEGs) from the RNA data.
"""
sc.pp.normalize_total(adata, target_sum=1e4, exclude_highly_expressed=False)
sc.pp.log1p(adata)
if batch_remove:
sc.pp.combat(adata, key=batch_key)
sc.tl.rank_genes_groups(adata, groupby=label_key, method=method, n_genes=adata.shape[1], use_raw=False, reference=reference)
de_results = adata.uns['rank_genes_groups']
gene_list = list(pd.DataFrame(de_results['names'])[target])
# Compute mean and std for each gene in both groups.
# These are Series indexed by gene names (from adata.var_names).
group_a_mean_expression = adata[adata.obs[label_key] == reference].to_df().mean()
group_a_std_expression = adata[adata.obs[label_key] == reference].to_df().std()
group_b_mean_expression = adata[adata.obs[label_key] == target].to_df().mean()
group_b_std_expression = adata[adata.obs[label_key] == target].to_df().std()
# Reorder (or reindex) the computed series so that they match the order in gene_list.
group_a_mean_expression = group_a_mean_expression.reindex(gene_list)
group_a_std_expression = group_a_std_expression.reindex(gene_list)
group_b_mean_expression = group_b_mean_expression.reindex(gene_list)
group_b_std_expression = group_b_std_expression.reindex(gene_list)
# Create the DEG DataFrame.
df = pd.DataFrame({
'gene': gene_list,
'mean_exp_de': group_a_mean_expression.values, # 'dead-end' (reference)
'mean_exp_re': group_b_mean_expression.values, # 'reprogramming' (target)
'std_exp_de': group_a_std_expression.values,
'std_exp_re': group_b_std_expression.values,
'pval': de_results['pvals'][target],
'pval_adj': de_results['pvals_adj'][target],
'log_fc': de_results['logfoldchanges'][target],
})
df['group'] = df.apply(lambda row: reference if row['log_fc'] < 0 else target, axis=1)
df.sort_values(by='pval_adj', inplace=True)
df.reset_index(drop=True, inplace=True)
df['pval_adj_log'] = -np.log10(df['pval_adj'])
df = df[(df.pval_adj < p_val) & ((df.log_fc < -1) | ((df.log_fc > 1) & (df.log_fc < 7)))]
return df
def get_flux_degs(adata_Flux_labelled, labels):
dead_end = adata_Flux_labelled[labels.values == "dead-end"]
reprogramming = adata_Flux_labelled[labels.values == "reprogramming"]
features = []
log_fold_changes = []
p_values = []
mean_des = []
mean_res = []
std_des = []
std_res = []
for feature in adata_Flux_labelled.columns:
mean_de = dead_end[feature].mean()
mean_re = reprogramming[feature].mean()
std_de = dead_end[feature].std()
std_re = reprogramming[feature].std()
log_fold_change = np.log2(mean_re + 1e-10) - np.log2(mean_de + 1e-10)
t_stat, p_value = ttest_ind(dead_end[feature], reprogramming[feature], nan_policy="omit")
mean_des.append(mean_de)
mean_res.append(mean_re)
std_des.append(std_de)
std_res.append(std_re)
features.append(feature)
log_fold_changes.append(log_fold_change)
p_values.append(p_value)
adjusted_p_values = multipletests(p_values, method="fdr_bh")[1]
df_flux_degs = pd.DataFrame({
"feature": features,
"mean_de": mean_des,
"mean_re": mean_res,
"mean_diff": np.array(mean_res) - np.array(mean_des),
"std_de": std_des,
"std_re": std_res,
"log_fc": log_fold_changes,
"pval": p_values,
"pval_adj": adjusted_p_values,
'pval_adj_log' : -np.log10(adjusted_p_values)
})
df_flux_degs['group'] = df_flux_degs.apply(lambda row: 'dead-end' if row['mean_de'] > row['mean_re'] else 'reprogramming', axis=1)
df_flux_degs = df_flux_degs.sort_values(by="pval_adj").reset_index(drop=True)
return df_flux_degs
def get_atac_degs(adata, method='t-test', label_key='label',
reference='dead-end', target='reprogramming'):
"""
Get differentially expressed genes (DEGs) from the ATAC data.
"""
sc.tl.rank_genes_groups(adata, groupby=label_key, method=method,
n_genes=adata.shape[1], use_raw=False, reference=reference)
group_a_mean_expression = adata[adata.obs[label_key] == reference].to_df().mean()
group_a_std_expression = adata[adata.obs[label_key] == reference].to_df().std()
group_b_mean_expression = adata[adata.obs[label_key] == target].to_df().mean()
group_b_std_expression = adata[adata.obs[label_key] == target].to_df().std()
de_results = adata.uns['rank_genes_groups']
features = list(pd.DataFrame(de_results['names'])[target])
# Reindex the mean and std Series to this feature list
mean_de = group_a_mean_expression.reindex(features)
mean_re = group_b_mean_expression.reindex(features)
std_de = group_a_std_expression.reindex(features)
std_re = group_b_std_expression.reindex(features)
min_val = min(mean_de.min(), mean_re.min())
# Determine a shift value so that the smallest value becomes a small positive number.
shift = 0
if min_val <= 0:
shift = abs(min_val) + 1e-10
df = pd.DataFrame({
'feature': list(pd.DataFrame(de_results['names'])[target]),
'pval': de_results['pvals'][target],
'pval_adj': de_results['pvals_adj'][target],
'log_fc': np.log2(mean_re + shift) - np.log2(mean_de + shift),
'mean_de': mean_de,
'mean_re': mean_re,
'mean_diff': mean_re - mean_de,
'std_de': std_de,
'std_re': std_re,
})
df['group'] = df.apply(lambda row: 'dead-end' if row['mean_de'] > row['mean_re'] else 'reprogramming', axis=1)
df.sort_values(by='pval_adj', inplace=True)
df.reset_index(drop=True, inplace=True)
df['pval_adj_log'] = -np.log10(df['pval_adj'])
return df