Spaces:

Angione-Lab
/

FateFormerExplorer

Running

File size: 13,828 Bytes

ef814bf

import os
import anndata as ad
import pandas as pd
from sklearn.preprocessing import StandardScaler
from . import preprocess_data

def load_clones(data_path):
    df_clone = pd.read_csv(data_path, index_col=["cell.bc"])
    df_clone = df_clone[["assay", 'state/fate', 'cell_type', 
                         'most_dominant_fate', 'most_dominant_fate_pct', 
                         "clone_id", "clone.size (RNA & ATAC)", 'clone.size (RNA)', 'clone.size (ATAC)', 
                         '# of D3 cells (RNA)', '# of D3 cells (ATAC)']]
    df_clone.rename({"clone.size (RNA & ATAC)": "clone_size",
                        "clone.size (RNA)": "cells_RNA",
                        'clone.size (ATAC)': "cells_ATAC",
                        '# of D3 cells (ATAC)' : "cells_ATAC_D3",
                        '# of D3 cells (RNA)' : "cells_RNA_D3",
                        'most_dominant_fate': 'label',
                        'most_dominant_fate_pct': 'pct',
                        'state/fate': 'day3_day21'}, inplace=True, axis=1)
    return df_clone

def add_clone_info(adata, clone_path, split=False):
    """
    Adds clone information to the given AnnData object.
    Parameters:
    adata (AnnData): The AnnData object to which clone information will be added.
    clone_path (str): The file path to the clone data.
    split (bool): Whether to split the data into labelled and unlabelled. Default is False.
    Returns:
    AnnData: The modified AnnData object with clone information added.
    """

    df_clone = load_clones(clone_path)
    filtered_obs = adata.obs.join(df_clone, how='inner')

    if split:
        filtered_obs = filtered_obs[(filtered_obs.label=='reprogramming') | (filtered_obs.label=='dead-end')]
        adata_labelled = adata[filtered_obs.index].copy()
        adata_labelled.obs = filtered_obs
        adata_unlabelled = adata[~adata.obs.index.isin(adata_labelled.obs.index)].copy()
        return adata_labelled, adata_unlabelled

    adata = adata[filtered_obs.index]
    adata.obs = filtered_obs
    return adata

def load_rna(data_path, return_raw=True, clone_info=False, clone_path=None):
    """
    Load RNA data from a given file path.
    Parameters:
    - data_path (str): The file path to the RNA data.
    - return_raw (bool): Whether to return the raw counts or not. Default is False.
    - add_clone_info (bool): Whether to add clone information or not. Default is True.
    - clone_path (str): The file path to the clone information. Required if add_clone_info is True.
    Returns:
    - adata_RNA (AnnData): Annotated data object containing the loaded RNA data.
    """

    # Load RNA data
    adata_RNA = ad.read_h5ad(data_path)
    adata_RNA.obs.index = adata_RNA.obs.index.str.replace('_', '-')
    
    # Restore raw counts if necessary
    if return_raw:
        adata_RNA.X = adata_RNA.raw.X.copy() # Copy raw counts to the expression matrix
     
    # Add batch information
    adata_RNA.obs['batch_no'] = adata_RNA.obs.index.to_series().apply(lambda idx: 1 if 'r1' in idx else (2 if 'r2' in idx else 0))
    
    # Add clone information
    if clone_info:
        if clone_path is None:
            raise ValueError("clone_path must be provided if add_clone_info is True.")
        else:
            adata_RNA = add_clone_info(adata_RNA, clone_path)
    
    # Remove unwanted columns
    columns_to_remove = ['orig.ident', 'old_ident', 'cc_score_diff', 'snn_res_0_8', 
                        'seurat_clusters', 
                        'predicted__cca_co_id', 'prediction_score_fib_1', 'prediction_score_fib_0', 
                        'prediction_score_fib_2', 
                        'prediction_score_early_0', 'prediction_score_transition_0', 
                        'prediction_score_transition_1', 
                        'prediction_score_early_1', 'prediction_score_early_2', 'prediction_score_iep_1', 
                        'prediction_score_transition_2', 'prediction_score_iep_2', 'prediction_score_dead_end_1', 
                        'prediction_score_dead_end_0', 'prediction_score_iep_0', 'prediction_score_dead_end_2', 
                        'prediction_score_max', 'snn_res_0_2', 'cellranger_ident', 'metadata_fate_coarse_rev1', 
                        'md_fate_rev1', 'md_fate_coarse_rev1', 'metadata_fate_rev1', 'day3_day21', 'sample_id',
                        'replicate_id', 'cell_type', 'assay']
    intersection = set(columns_to_remove).intersection(adata_RNA.obs.columns)
    if intersection:
        adata_RNA.obs.drop(intersection, axis=1, inplace=True)

    # Rename columns
    columns_to_rename = {'S.Score': 'S_score', 
                         'G2M.Score': 'G2M_score',
                         'nCount_RNA': 'total_counts',
                         'nFeature_RNA': 'n_genes_by_counts',
                         'Phase': 'phase',
                         'percent.mt': 'pct_counts_mt',
                         }
    intersection = set(columns_to_rename.keys()).intersection(adata_RNA.obs.columns)
    if intersection:
        adata_RNA.obs.rename(columns=columns_to_rename, inplace=True)

    return adata_RNA

def load_atac(data_path, clone_info=False, clone_path=None):
    """
    Load ATAC data from a given file path.
    Parameters:
    - data_path (str): The file path to the ATAC data.
    - clone_info (bool): Whether to add clone information or not. Default is False.
    - clone_path (str): The file path to the clone information. Required if add_clone_info is True.
    Returns:
    - adata_atac (AnnData): Annotated data object containing the loaded ATAC data.
    """
    adata_atac = ad.read_h5ad(data_path)
    adata_atac = adata_atac[:,adata_atac.var['name'] != "Crebzf_122"]
    adata_atac.obs.index = adata_atac.obs.index.str.replace('_', '-')

    adata_atac = adata_atac.copy()
    adata_atac.obs['batch_no'] = adata_atac.obs.index.to_series().apply(lambda idx: 1 if 'r1' in idx else (2 if 'r2' in idx else 0))

    columns_to_remove = ['BlacklistRatio', 'CellNames', 'DoubletEnrichment', 
                     'DoubletScore', 'NucleosomeRatio', 'PassQC', 'PromoterRatio', 
                     'ReadsInBlacklist', 'ReadsInPromoter', 'ReadsInTSS', 'TSSEnrichment',
                     'nDiFrags', 'nFrags', 'nMonoFrags', 'nMultiFrags', 
                     'origin']

    intersection = set(columns_to_remove).intersection(adata_atac.obs.columns)
    if intersection:
        adata_atac.obs.drop(intersection, axis=1, inplace=True)

    if clone_info:
        if clone_path is None:
            raise ValueError("clone_path must be provided if add_clone_info is True.")
        else:
            adata_atac_labelled, adata_atac_unlabelled = add_clone_info(adata_atac, clone_path, split=True)
            return adata_atac_labelled, adata_atac_unlabelled
    else:
        # warning that without clone info, the data will be returned as a single object
        print("Warning: Clone information not provided. Returning a single object.")

    return adata_atac

def concat_fluxes(directory, prefix):
    df_list = []
    for filename in os.listdir(directory):
        if filename.startswith(prefix) and filename.endswith('.csv'):
            file_path = os.path.join(directory, filename)
            df = pd.read_csv(file_path, index_col=0)
            df_list.append(df)
    
    if df_list:
        concatenated_df = pd.concat(df_list, axis=0)
    else:
        concatenated_df = pd.DataFrame() 

    return concatenated_df

def load_flux(data_path, prefix='flux_un', clone_info=False, clone_path=None, scale=True, flux_metadata_path=None):
    """
    Load Flux data from a given file path.
    Parameters:
    - data_path (str): The file path to the Flux data.
    - prefix (str): The prefix of the Flux files. Default is 'flux_un'.
    - clone_info (bool): Whether to add clone information or not. Default is False.
    - clone_path (str): The file path to the clone information. Required if add_clone_info is True.
    Returns:
    - adata_Flux_labelled (pd.DataFrame): Annotated data object containing the labelled Flux data.
    - adata_Flux_unlabelled (pd.DataFrame): Annotated data object containing the unlabelled Flux data.
    - bi_labelled (list): List of binary labels for the labelled Flux data.
    - bi_unlabelled (list): List of binary labels for the unlabelled Flux data.
    - labels (list): List of labels for the labelled Flux data.
    """

    adata_Flux_labelled = pd.read_csv(data_path, index_col=0)
    directory = os.path.dirname(data_path)
    adata_Flux_unlabelled = concat_fluxes(directory, prefix)

    adata_Flux_labelled.index = adata_Flux_labelled.index.str.replace('_', '-')
    if not adata_Flux_unlabelled.empty:
        adata_Flux_unlabelled.index = adata_Flux_unlabelled.index.str.replace('_', '-')
    else:
        # Keep schema consistent when unlabeled files are not shipped.
        adata_Flux_unlabelled = pd.DataFrame(columns=adata_Flux_labelled.columns)

    if scale:
        std_sc = StandardScaler()
        if not adata_Flux_unlabelled.empty:
            scaled_unl = std_sc.fit_transform(adata_Flux_unlabelled.values)
            scaled_unl += abs(scaled_unl.min())
            adata_Flux_unlabelled = pd.DataFrame(
                scaled_unl,
                index=adata_Flux_unlabelled.index,
                columns=adata_Flux_unlabelled.columns,
            )
            scaled_la = std_sc.transform(adata_Flux_labelled.values)
            scaled_la += abs(scaled_la.min())
        else:
            # Fallback for minimal/portable app packages: scale from labelled only.
            scaled_la = std_sc.fit_transform(adata_Flux_labelled.values)
            scaled_la += abs(scaled_la.min())

        adata_Flux_labelled = pd.DataFrame(
            scaled_la,
            index=adata_Flux_labelled.index,
            columns=adata_Flux_labelled.columns,
        )
    if flux_metadata_path is not None:
        md = pd.read_csv(flux_metadata_path)[['X', 'rxnName']]
    else:
        md = pd.read_csv("data/datasets/flux/metabolic_model_metadata.csv")[['X', 'rxnName']]
    dict_rename = {}
    for col in adata_Flux_labelled.columns:
        reaction = md[md['X'] == col]['rxnName'].str.replace(" -> ", "→").values
        dict_rename[col] = reaction[0]
    adata_Flux_labelled = adata_Flux_labelled.rename(columns=dict_rename)
    adata_Flux_unlabelled = adata_Flux_unlabelled.rename(columns=dict_rename)    


    if clone_info:
        if clone_path is None:
            raise ValueError("clone_path must be provided if add_clone_info is True.")
        else:
            df_clone = load_clones(clone_path)
            filtered_obs = adata_Flux_labelled.join(df_clone, how='inner')
            labels = filtered_obs['label']
            pcts = filtered_obs['pct']
            bi_labelled = adata_Flux_labelled.index.map(lambda x: 2 if 'r2' in x else 1 if 'r1' in x else 0)
            bi_unlabelled = adata_Flux_unlabelled.index.map(lambda x: 2 if 'r2' in x else 1 if 'r1' in x else 0)   
            adata_Flux_labelled = adata_Flux_labelled.loc[filtered_obs.index]
            return  adata_Flux_labelled, adata_Flux_unlabelled, bi_labelled, bi_unlabelled, labels, pcts
    else:
        print("Warning: Clone information not provided. Returning raw data.")
        return adata_Flux_labelled, adata_Flux_unlabelled


def load_processed_rna(verbose=True, return_raw=True, return_all_features=False):

    if verbose:
         print('Loading RNA data...')
    # Load RNA data labelled
    adata_RNA_labelled = load_rna("data/datasets/rna/all_rna_d3_labelled.h5ad",
                                    return_raw=True,
                                    clone_info=True,
                                    clone_path="data/datasets/clone/clones.csv")
    # Load RNA data unlabelled
    adata_RNA_unlabelled = load_rna("data/datasets/rna/all_rna_d3_unlabelled.h5ad", 
                                    return_raw=True, 
                                    clone_info=False)
   
    if verbose:
        print('Filtering RNA data...')
    adata_RNA_labelled = preprocess_data.filter_rna_cells_genes(adata_RNA_labelled.copy())
    adata_RNA_unlabelled = preprocess_data.filter_rna_cells_genes(adata_RNA_unlabelled.copy())

    if verbose:
        print('Feature Selection by DEGs...')
    deg_list = preprocess_data.get_degs(adata_RNA_labelled, method='t-test')
 
    if verbose:
        print('Filtering Genes...')
    genes_intersection = set(adata_RNA_labelled.var_names).intersection(set(adata_RNA_unlabelled.var_names)).intersection(set(deg_list.gene))
    adata_RNA_labelled_all = adata_RNA_labelled.copy()
    adata_RNA_labelled = adata_RNA_labelled[:, list(genes_intersection)]
    adata_RNA_unlabelled = adata_RNA_unlabelled[:, list(genes_intersection)]
    

    if return_raw:
        gene_indices = [adata_RNA_labelled.raw.var_names.get_loc(gene) for gene in adata_RNA_labelled.var_names]
        adata_RNA_labelled.X = adata_RNA_labelled.raw.X[:, gene_indices].toarray().copy()
        adata_RNA_unlabelled.X = adata_RNA_unlabelled.raw.X[:, gene_indices].copy()

    if return_all_features:
        return adata_RNA_labelled, adata_RNA_unlabelled, deg_list, adata_RNA_labelled_all
    return adata_RNA_labelled, adata_RNA_unlabelled, deg_list

if __name__ == '__main__':
    adata_ATAC_labelled, adata_ATAC_unlabelled = load_atac("data/datasets/atac/all_atac_d3_motif.h5ad", 
                                                          clone_info=True, 
                                                          clone_path="data/datasets/clone/clones.csv")
    print(adata_ATAC_labelled.obs.columns, adata_ATAC_labelled.obs.shape, adata_ATAC_labelled.obs.index[:10])
    print(adata_ATAC_unlabelled.obs.columns, adata_ATAC_unlabelled.obs.shape, adata_ATAC_unlabelled.obs.index[:10])
    print("Data loaded successfully!")