import scanpy as sc import pandas as pd import numpy as np from scipy.stats import ttest_ind from statsmodels.stats.multitest import multipletests def filter_rna_cells_genes(adata, min_genes=100, min_cells=10): """ Filter cells and genes in RNA data. Parameters: - adata (AnnData): Annotated data object containing the RNA data. - min_genes (int): The minimum number of genes to keep a cell. Default is 100. - min_cells (int): The minimum number of cells to keep a gene. Default is 10. Returns: - adata_filtered (AnnData): Annotated data object containing the filtered RNA data. """ sc.pp.filter_cells(adata, min_genes=min_genes) sc.pp.filter_genes(adata, min_cells=min_cells) return adata def get_degs(adata, method='t-test', p_val=0.05, batch_remove=True, batch_key='batch_no', label_key='label', reference='dead-end', target='reprogramming'): """ Get differentially expressed genes (DEGs) from the RNA data. """ sc.pp.normalize_total(adata, target_sum=1e4, exclude_highly_expressed=False) sc.pp.log1p(adata) if batch_remove: sc.pp.combat(adata, key=batch_key) sc.tl.rank_genes_groups(adata, groupby=label_key, method=method, n_genes=adata.shape[1], use_raw=False, reference=reference) de_results = adata.uns['rank_genes_groups'] gene_list = list(pd.DataFrame(de_results['names'])[target]) # Compute mean and std for each gene in both groups. # These are Series indexed by gene names (from adata.var_names). group_a_mean_expression = adata[adata.obs[label_key] == reference].to_df().mean() group_a_std_expression = adata[adata.obs[label_key] == reference].to_df().std() group_b_mean_expression = adata[adata.obs[label_key] == target].to_df().mean() group_b_std_expression = adata[adata.obs[label_key] == target].to_df().std() # Reorder (or reindex) the computed series so that they match the order in gene_list. group_a_mean_expression = group_a_mean_expression.reindex(gene_list) group_a_std_expression = group_a_std_expression.reindex(gene_list) group_b_mean_expression = group_b_mean_expression.reindex(gene_list) group_b_std_expression = group_b_std_expression.reindex(gene_list) # Create the DEG DataFrame. df = pd.DataFrame({ 'gene': gene_list, 'mean_exp_de': group_a_mean_expression.values, # 'dead-end' (reference) 'mean_exp_re': group_b_mean_expression.values, # 'reprogramming' (target) 'std_exp_de': group_a_std_expression.values, 'std_exp_re': group_b_std_expression.values, 'pval': de_results['pvals'][target], 'pval_adj': de_results['pvals_adj'][target], 'log_fc': de_results['logfoldchanges'][target], }) df['group'] = df.apply(lambda row: reference if row['log_fc'] < 0 else target, axis=1) df.sort_values(by='pval_adj', inplace=True) df.reset_index(drop=True, inplace=True) df['pval_adj_log'] = -np.log10(df['pval_adj']) df = df[(df.pval_adj < p_val) & ((df.log_fc < -1) | ((df.log_fc > 1) & (df.log_fc < 7)))] return df def get_flux_degs(adata_Flux_labelled, labels): dead_end = adata_Flux_labelled[labels.values == "dead-end"] reprogramming = adata_Flux_labelled[labels.values == "reprogramming"] features = [] log_fold_changes = [] p_values = [] mean_des = [] mean_res = [] std_des = [] std_res = [] for feature in adata_Flux_labelled.columns: mean_de = dead_end[feature].mean() mean_re = reprogramming[feature].mean() std_de = dead_end[feature].std() std_re = reprogramming[feature].std() log_fold_change = np.log2(mean_re + 1e-10) - np.log2(mean_de + 1e-10) t_stat, p_value = ttest_ind(dead_end[feature], reprogramming[feature], nan_policy="omit") mean_des.append(mean_de) mean_res.append(mean_re) std_des.append(std_de) std_res.append(std_re) features.append(feature) log_fold_changes.append(log_fold_change) p_values.append(p_value) adjusted_p_values = multipletests(p_values, method="fdr_bh")[1] df_flux_degs = pd.DataFrame({ "feature": features, "mean_de": mean_des, "mean_re": mean_res, "mean_diff": np.array(mean_res) - np.array(mean_des), "std_de": std_des, "std_re": std_res, "log_fc": log_fold_changes, "pval": p_values, "pval_adj": adjusted_p_values, 'pval_adj_log' : -np.log10(adjusted_p_values) }) df_flux_degs['group'] = df_flux_degs.apply(lambda row: 'dead-end' if row['mean_de'] > row['mean_re'] else 'reprogramming', axis=1) df_flux_degs = df_flux_degs.sort_values(by="pval_adj").reset_index(drop=True) return df_flux_degs def get_atac_degs(adata, method='t-test', label_key='label', reference='dead-end', target='reprogramming'): """ Get differentially expressed genes (DEGs) from the ATAC data. """ sc.tl.rank_genes_groups(adata, groupby=label_key, method=method, n_genes=adata.shape[1], use_raw=False, reference=reference) group_a_mean_expression = adata[adata.obs[label_key] == reference].to_df().mean() group_a_std_expression = adata[adata.obs[label_key] == reference].to_df().std() group_b_mean_expression = adata[adata.obs[label_key] == target].to_df().mean() group_b_std_expression = adata[adata.obs[label_key] == target].to_df().std() de_results = adata.uns['rank_genes_groups'] features = list(pd.DataFrame(de_results['names'])[target]) # Reindex the mean and std Series to this feature list mean_de = group_a_mean_expression.reindex(features) mean_re = group_b_mean_expression.reindex(features) std_de = group_a_std_expression.reindex(features) std_re = group_b_std_expression.reindex(features) min_val = min(mean_de.min(), mean_re.min()) # Determine a shift value so that the smallest value becomes a small positive number. shift = 0 if min_val <= 0: shift = abs(min_val) + 1e-10 df = pd.DataFrame({ 'feature': list(pd.DataFrame(de_results['names'])[target]), 'pval': de_results['pvals'][target], 'pval_adj': de_results['pvals_adj'][target], 'log_fc': np.log2(mean_re + shift) - np.log2(mean_de + shift), 'mean_de': mean_de, 'mean_re': mean_re, 'mean_diff': mean_re - mean_de, 'std_de': std_de, 'std_re': std_re, }) df['group'] = df.apply(lambda row: 'dead-end' if row['mean_de'] > row['mean_re'] else 'reprogramming', axis=1) df.sort_values(by='pval_adj', inplace=True) df.reset_index(drop=True, inplace=True) df['pval_adj_log'] = -np.log10(df['pval_adj']) return df