Source code for scenicplus.TF_to_gene

"""Link transcription factors (TFs) to genes based on co-expression of TF and target genes.

Both linear methods (spearman or pearson correlation) and non-linear methods (random forrest or gradient boosting) are used to link TF to genes.

The correlation methods are used to seperate TFs which are infered to have a positive influence on gene expression (i.e. positive correlation) 
and TFs which are infered to have a negative influence on gene expression (i.e. negative correlation).

"""


import pandas as pd
import numpy as np
import ray
import logging
import time
import sys

from tqdm import tqdm

from .scenicplus_class import SCENICPLUS
from .utils import _create_idx_pairs, masked_rho4pairs

import scipy.sparse

COLUMN_NAME_TARGET = "target"
COLUMN_NAME_WEIGHT = "importance"
COLUMN_NAME_REGULATION = "regulation"
COLUMN_NAME_CORRELATION = "rho"
COLUMN_NAME_TF = "TF"
COLUMN_NAME_SCORE_1 = "importance_x_rho"
COLUMN_NAME_SCORE_2 = "importance_x_abs_rho"
RHO_THRESHOLD = 0.03

# Create logger
level = logging.INFO
format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
handlers = [logging.StreamHandler(stream=sys.stdout)]
logging.basicConfig(level=level, format=format, handlers=handlers)
log = logging.getLogger('TF2G')

def _inject_TF_as_its_own_target(
    scplus_obj: SCENICPLUS = None,
    TF2G_adj: pd.DataFrame = None, 
    ex_mtx: pd.DataFrame = None,
    rho_threshold = RHO_THRESHOLD, 
    TF2G_key = 'TF2G_adj', 
    out_key = 'TF2G_adj',
    inplace = True,
    increase_importance_by = 0.00001) -> pd.DataFrame:
    if scplus_obj is None and TF2G_adj is None:
        raise ValueError('Either provide a SCENIC+ object of a pd.DataFrame with TF to gene adjecencies!')
    if scplus_obj is not None and TF2G_adj is not None:
        raise ValueError('Either provide a SCENIC+ object of a pd.DataFrame with TF to gene adjecencies! Not both!')

    log.info(f"Warning: adding TFs as their own target to adjecencies matrix. Importance values will be max + {increase_importance_by}")
    
    origin_TF2G_adj = scplus_obj.uns[TF2G_key] if scplus_obj is not None else TF2G_adj
    ex_mtx = scplus_obj.to_df(layer='EXP') if scplus_obj is not None else ex_mtx

    origin_TF2G_adj = origin_TF2G_adj.sort_values('TF')
    max_importances = origin_TF2G_adj.groupby('TF').max()['importance']

    TFs_in_adj = list(set(origin_TF2G_adj['TF'].to_list()))
    TF_to_TF_adj = pd.DataFrame(
                    data = {"TF": TFs_in_adj,
                            "target": TFs_in_adj,
                            "importance": max_importances.loc[TFs_in_adj] + increase_importance_by})
    TF_to_TF_adj = _add_correlation(
            adjacencies=TF_to_TF_adj,
            ex_mtx = ex_mtx,
            rho_threshold=rho_threshold)

    new_TF2G_adj = pd.concat([origin_TF2G_adj, TF_to_TF_adj]).reset_index(drop = True)
    if inplace:
        scplus_obj.uns[out_key] = new_TF2G_adj
        return None
    else:
        return new_TF2G_adj


[docs]def load_TF2G_adj_from_file(SCENICPLUS_obj: SCENICPLUS, f_adj: str, inplace=True, key='TF2G_adj', rho_threshold=RHO_THRESHOLD): """ Function to load TF2G adjacencies from file Parameters ---------- SCENICPLUS_obj An instance of :class:`~scenicplus.scenicplus_class.SCENICPLUS` f_adj File path to TF2G adjacencies matrix inplace Boolean specifying wether or not to store adjacencies matrix in `SCENICPLUS_obj` under slot .uns[key]. Default: True key_added String specifying where in the .uns slot to store the adjacencies matrix in `SCENICPLUS_obj` Default: "TF2G_adj" rho_threshold A floating point number specifying from which absolute value to consider a correlation coefficient positive or negative. Default: 0.03 """ log.info(f'Reading file: {f_adj}') df_TF_gene_adj = pd.read_csv(f_adj, sep='\t') # only keep relevant entries idx_to_keep = np.logical_and(np.array([tf in SCENICPLUS_obj.gene_names for tf in df_TF_gene_adj['TF']]), np.array([gene in SCENICPLUS_obj.gene_names for gene in df_TF_gene_adj['target']])) df_TF_gene_adj_subset = df_TF_gene_adj.loc[idx_to_keep] if not COLUMN_NAME_CORRELATION in df_TF_gene_adj_subset.columns: log.info(f'Adding correlation coefficients to adjacencies.') df_TF_gene_adj_subset = _add_correlation( adjacencies=df_TF_gene_adj_subset, ex_mtx=SCENICPLUS_obj.to_df(layer='EXP'), rho_threshold=rho_threshold) df_TF_gene_adj_subset = _inject_TF_as_its_own_target( TF2G_adj=df_TF_gene_adj_subset, inplace = False, ex_mtx = SCENICPLUS_obj.to_df(layer='EXP')) if not COLUMN_NAME_SCORE_1 in df_TF_gene_adj_subset.columns: log.info(f'Adding importance x rho scores to adjacencies.') df_TF_gene_adj_subset[COLUMN_NAME_SCORE_1] = df_TF_gene_adj_subset[COLUMN_NAME_CORRELATION] * \ df_TF_gene_adj_subset[COLUMN_NAME_WEIGHT] if not COLUMN_NAME_SCORE_2 in df_TF_gene_adj_subset.columns: log.info(f'Adding importance x |rho| scores to adjacencies.') df_TF_gene_adj_subset[COLUMN_NAME_SCORE_2] = abs( df_TF_gene_adj_subset[COLUMN_NAME_CORRELATION]) * abs(df_TF_gene_adj_subset[COLUMN_NAME_WEIGHT]) if inplace: log.info(f'Storing adjacencies in .uns["{key}"].') SCENICPLUS_obj.uns[key] = df_TF_gene_adj_subset else: return df_TF_gene_adj_subset
def _add_correlation( adjacencies: pd.DataFrame, ex_mtx: pd.DataFrame, rho_threshold=RHO_THRESHOLD, mask_dropouts=False): """ Add correlation in expression levels between target and factor. Parameters ---------- adjacencies: pd.DataFrame The dataframe with the TF-target links. ex_mtx: pd.DataFrame The expression matrix (n_cells x n_genes). rho_threshold: float The threshold on the correlation to decide if a target gene is activated (rho > `rho_threshold`) or repressed (rho < -`rho_threshold`). mask_dropouts: boolean Do not use cells in which either the expression of the TF or the target gene is 0 when calculating the correlation between a TF-target pair. Returns ------- The adjacencies dataframe with an extra column. """ assert rho_threshold > 0, "rho_threshold should be greater than 0." # Calculate Pearson correlation to infer repression or activation. if mask_dropouts: ex_mtx = ex_mtx.sort_index(axis=1) col_idx_pairs = _create_idx_pairs(adjacencies, ex_mtx) rhos = masked_rho4pairs(ex_mtx.values, col_idx_pairs, 0.0) else: genes = list(set(adjacencies[COLUMN_NAME_TF]).union( set(adjacencies[COLUMN_NAME_TARGET]))) ex_mtx = ex_mtx[ex_mtx.columns[ex_mtx.columns.isin(genes)]] corr_mtx = pd.DataFrame( index=ex_mtx.columns, columns=ex_mtx.columns, data=np.corrcoef(ex_mtx.values.T)) rhos = np.array([corr_mtx[s2][s1] for s1, s2 in zip(adjacencies.TF, adjacencies.target)]) regulations = (rhos > rho_threshold).astype( int) - (rhos < -rho_threshold).astype(int) return pd.DataFrame( data={ COLUMN_NAME_TF: adjacencies[COLUMN_NAME_TF].values, COLUMN_NAME_TARGET: adjacencies[COLUMN_NAME_TARGET].values, COLUMN_NAME_WEIGHT: adjacencies[COLUMN_NAME_WEIGHT].values, COLUMN_NAME_REGULATION: regulations, COLUMN_NAME_CORRELATION: rhos, } ) def _run_infer_partial_network(target_gene_name, gene_names, ex_matrix, method_params, tf_matrix, tf_matrix_gene_names): """ A function to call arboreto """ from arboreto import core as arboreto_core target_gene_name_index = _get_position_index([target_gene_name], gene_names) target_gene_expression = ex_matrix[:, target_gene_name_index].ravel() n = arboreto_core.infer_partial_network( regressor_type=method_params[0], regressor_kwargs=method_params[1], tf_matrix=tf_matrix, tf_matrix_gene_names=tf_matrix_gene_names, target_gene_name=target_gene_name, target_gene_expression=target_gene_expression, include_meta=False, early_stop_window_length=arboreto_core.EARLY_STOP_WINDOW_LENGTH, seed=666) return(n) @ray.remote def _run_infer_partial_network_ray(target_gene_name, gene_names, ex_matrix, method_params, tf_matrix, tf_matrix_gene_names): """ A function to call arboreto with ray """ return _run_infer_partial_network(target_gene_name, gene_names, ex_matrix, method_params, tf_matrix, tf_matrix_gene_names)
[docs]def calculate_TFs_to_genes_relationships(scplus_obj: SCENICPLUS, tf_file: str, method: str = 'GBM', ray_n_cpu: int = 1, key: str = 'TF2G_adj', **kwargs): """ A function to calculate TF to gene relationships using arboreto and correlation Parameters ---------- scplus_obj An instance of :class:`~scenicplus.scenicplus_class.SCENICPLUS` tf_file Path to a file specifying with genes are TFs method Whether to use Gradient Boosting Machines (GBM) or random forest (RF) ray_n_cpu Number of cpus to use key String specifying where in the .uns slot to store the adjacencies matrix in :param:`SCENICPLUS_obj` default: "TF2G_adj" **kwargs Parameters to pass to ray.init """ from arboreto.utils import load_tf_names from arboreto.algo import _prepare_input from arboreto.core import SGBM_KWARGS, RF_KWARGS from arboreto.core import to_tf_matrix if(method == 'GBM'): method_params = [ 'GBM', # regressor_type SGBM_KWARGS # regressor_kwargs ] elif(method == 'RF'): method_params = [ 'RF', # regressor_type RF_KWARGS # regressor_kwargs ] gene_names = scplus_obj.gene_names cell_names = scplus_obj.cell_names ex_matrix = scplus_obj.X_EXP tf_names = load_tf_names(tf_file) ex_matrix, gene_names, tf_names = _prepare_input( ex_matrix, gene_names, tf_names) tf_matrix, tf_matrix_gene_names = to_tf_matrix( ex_matrix, gene_names, tf_names) #convert ex_matrix, tf_matrix to np.array if necessary if isinstance(ex_matrix, np.matrix): ex_matrix = np.array(ex_matrix) elif scipy.sparse.issparse(ex_matrix): ex_matrix = ex_matrix.toarray() if isinstance(tf_matrix, np.matrix): tf_matrix = np.array(tf_matrix) elif scipy.sparse.issparse(tf_matrix): tf_matrix = tf_matrix.toarray() ray.init(num_cpus=ray_n_cpu, **kwargs) log.info(f'Calculating TF to gene correlation, using {method} method') start_time = time.time() try: jobs = [] for gene in tqdm(gene_names, total=len(gene_names), desc='initializing'): jobs.append(_run_infer_partial_network_ray.remote(gene, gene_names, ex_matrix, method_params, tf_matrix, tf_matrix_gene_names)) # add progress bar, adapted from: https://github.com/ray-project/ray/issues/8164 def to_iterator(obj_ids): while obj_ids: finished_ids, obj_ids = ray.wait(obj_ids) for finished_id in finished_ids: yield ray.get(finished_id) tfs_to_genes = [] for adj in tqdm(to_iterator(jobs), total=len(jobs), desc=f'Running using {ray_n_cpu} cores', smoothing=0.1): tfs_to_genes.append(adj) except Exception as e: print(e) finally: ray.shutdown() log.info('Took {} seconds'.format(time.time() - start_time)) start_time = time.time() log.info(f'Adding correlation coefficients to adjacencies.') adj = pd.concat(tfs_to_genes).sort_values(by='importance', ascending=False) ex_matrix = pd.DataFrame( scplus_obj.X_EXP, index=scplus_obj.cell_names, columns=scplus_obj.gene_names) adj = _add_correlation(adj, ex_matrix) adj = _inject_TF_as_its_own_target( TF2G_adj=adj, inplace = False, ex_mtx = scplus_obj.to_df(layer='EXP')) log.info(f'Adding importance x rho scores to adjacencies.') adj[COLUMN_NAME_SCORE_1] = adj[COLUMN_NAME_CORRELATION] * \ adj[COLUMN_NAME_WEIGHT] adj[COLUMN_NAME_SCORE_2] = abs( adj[COLUMN_NAME_CORRELATION]) * abs(adj[COLUMN_NAME_WEIGHT]) log.info('Took {} seconds'.format(time.time() - start_time)) scplus_obj.uns[key] = adj
def _get_position_index(query_list, target_list): """ Helper function to grep an instance in a list """ d = {k: v for v, k in enumerate(target_list)} index = (d[k] for k in query_list) return list(index)