Source code for scenicplus.enhancer_to_gene

"""Link enhancers to genes based on co-occurence of chromatin accessbility of the enhancer and gene expression.

Both linear methods (spearman or pearson correlation) and non-linear methods (random forrest or gradient boosting) are used to link enhancers to genes.

The correlation methods are used to seperate regions which are infered to have a positive influence on gene expression (i.e. positive correlation) 
and regions which are infered to have a negative influence on gene expression (i.e. negative correlation).

"""

import pandas as pd
import numpy as np
import ray
import logging
import time
import sys
import os
import subprocess
import pyranges as pr
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from scipy.stats import pearsonr, spearmanr
from tqdm import tqdm
from matplotlib import cm
from matplotlib.colors import Normalize
from typing import List

from .utils import extend_pyranges, extend_pyranges_with_limits, reduce_pyranges_with_limits_b
from .utils import calculate_distance_with_limits_join, reduce_pyranges_b, calculate_distance_join
from .utils import coord_to_region_names, region_names_to_coordinates, ASM_SYNONYMS, Groupby, flatten_list
from .scenicplus_class import SCENICPLUS

RANDOM_SEED = 666

SKLEARN_REGRESSOR_FACTORY = {
    'RF': RandomForestRegressor,
    'ET': ExtraTreesRegressor,
    'GBM': GradientBoostingRegressor
}

SCIPY_CORRELATION_FACTORY = {
    'PR': pearsonr,
    'SR': spearmanr
}

# Parameters from arboreto
# scikit-learn random forest regressor
RF_KWARGS = {
    'n_jobs': 1,
    'n_estimators': 1000,
    'max_features': 'sqrt'
}

# scikit-learn extra-trees regressor
ET_KWARGS = {
    'n_jobs': 1,
    'n_estimators': 1000,
    'max_features': 'sqrt'
}

# scikit-learn gradient boosting regressor
GBM_KWARGS = {
    'learning_rate': 0.01,
    'n_estimators': 500,
    'max_features': 0.1
}

# scikit-learn stochastic gradient boosting regressor
SGBM_KWARGS = {
    'learning_rate': 0.01,
    'n_estimators': 5000,  # can be arbitrarily large
    'max_features': 0.1,
    'subsample': 0.9
}

# Interact auto sql definition
INTERACT_AS = """table interact
"Interaction between two regions"
    (
    string chrom;      "Chromosome (or contig, scaffold, etc.). For interchromosomal, use 2 records"
    uint chromStart;   "Start position of lower region. For interchromosomal, set to chromStart of this region"
    uint chromEnd;     "End position of upper region. For interchromosomal, set to chromEnd of this region"
    string name;       "Name of item, for display.  Usually 'sourceName/targetName' or empty"
    uint score;        "Score from 0-1000."
    double value;      "Strength of interaction or other data value. Typically basis for score"
    string exp;        "Experiment name (metadata for filtering). Use . if not applicable"
    string color;      "Item color.  Specified as r,g,b or hexadecimal #RRGGBB or html color name, as in //www.w3.org/TR/css3-color/#html4."
    string sourceChrom;  "Chromosome of source region (directional) or lower region. For non-directional interchromosomal, chrom of this region."
    uint sourceStart;  "Start position source/lower/this region"
    uint sourceEnd;    "End position in chromosome of source/lower/this region"
    string sourceName;  "Identifier of source/lower/this region"
    string sourceStrand; "Orientation of source/lower/this region: + or -.  Use . if not applicable"
    string targetChrom; "Chromosome of target region (directional) or upper region. For non-directional interchromosomal, chrom of other region"
    uint targetStart;  "Start position in chromosome of target/upper/this region"
    uint targetEnd;    "End position in chromosome of target/upper/this region"
    string targetName; "Identifier of target/upper/this region"
    string targetStrand; "Orientation of target/upper/this region: + or -.  Use . if not applicable"
    )
"""


[docs]def get_search_space(SCENICPLUS_obj: SCENICPLUS,
                     species=None,
                     assembly=None,
                     pr_annot=None,
                     pr_chromsizes=None,
                     predefined_boundaries=None,
                     use_gene_boundaries=False,
                     upstream=[1000, 150000],
                     downstream=[1000, 150000],
                     extend_tss=[10, 10],
                     remove_promoters=False,
                     biomart_host='http://www.ensembl.org',
                     inplace=True,
                     key_added='search_space',
                     implode_entries=True):
    """
    Get search space surrounding genes to calculate enhancer to gene links

    Parameters
    ----------
    SCENICPLUS_obj: SCENICPLUS
        a :class:`pr.SCENICPLUS`.
    species: string, optional
        Name of the species (e.g. hsapiens) on whose reference genome the search space should be calculated. This will be used to retrieve gene annotations from biomart. 
        Annotations can also be manually provided using the parameter [pr_annot]. Default: None
    assembly: string, optional
        Name of the assembly (e.g. hg38) of the reference genome on which the search space should be calculated. 
        This will be used to retrieve chromosome sizes from the UCSC genome browser.
        Chromosome sizes can also be manually provided using the parameter [pr_chromsizes]. Default: None  
    pr_annot: pr.PyRanges, optional
        A :class:`pr.PyRanges` containing gene annotation, including Chromosome, Start, End, Strand (as '+' and '-'), Gene name
        and Transcription Start Site. Default: None
    pr_chromsizes: pr.PyRanges, optional
        A :class:`pr.PyRanges` containing size of each chromosome, containing 'Chromosome', 'Start' and 'End' columns. Default: None
    predefined_boundaries: pr.PyRanges, optional
        A :class:`pr.PyRanges` containing predefined genomic domain boundaries (e.g. TAD boundaries) to use as boundaries. If 
        given, use_gene_boundaries will be ignored.
    use_gene_boundaries: bool, optional
        Whether to use the whole search space or stop when encountering another gene. Default: False
    upstream: List, optional
        Search space upstream. The minimum (first position) means that even if there is a gene right next to it these
        bp will be taken. The second position indicates the maximum distance. Default: [1000, 100000]
    downstream: List, optional
        Search space downstream. The minimum (first position) means that even if there is a gene right next to it these
        bp will be taken. The second position indicates the maximum distance. Default: [1000, 100000]
    extend_tss: list, optional
        Space around the TSS consider as promoter. Default: [10,10]
    remove_promoters: bool, optional
        Whether to remove promoters from the search space or not. Default: False
    biomart_host: str, optional
        Biomart host to use to download TSS annotation. Please make sure this host matches the expression data (i.e. matching gene names) otherwise a lot of genes are potentially lost.
    inplace: bool, optional
        If set to True, store results into scplus_obj, otherwise return results.
    key_added: str, optional
        Key under which to add the results under scplus.uns.
    implode_entries: bool, optional
        When a gene has multiple start/end sites it has multiple distances and gene width. 
        If this parameter is set to True these multiple entries per region and gene will be put in a list, generating a single entry.
        If this parameter is set to False these multiple entries will be kept.
    Return
    ------
    pd.DataFrame
        A data frame containing regions in the search space for each gene
    """

    # Create logger
    level = logging.INFO
    format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=format, handlers=handlers)
    log = logging.getLogger('R2G')

    # get regions
    pr_regions = pr.PyRanges(
        region_names_to_coordinates(SCENICPLUS_obj.region_names))

    # set region names
    pr_regions.Name = coord_to_region_names(pr_regions)

    # GET GENE ANNOTATION AND CHROMSIZES
    if species is not None and assembly is not None:
        # Download gene annotation and chromsizes
        # 1. Download gene annotation from biomart
        import pybiomart as pbm
        dataset_name = '{}_gene_ensembl'.format(species)
        server = pbm.Server(host=biomart_host, use_cache=False)
        mart = server['ENSEMBL_MART_ENSEMBL']
        # check if biomart host is correct
        dataset_display_name = getattr(
            mart.datasets[dataset_name], 'display_name')
        if not (ASM_SYNONYMS[assembly] in dataset_display_name or assembly in dataset_display_name):
            print(
                f'\u001b[31m!! The provided assembly {assembly} does not match the biomart host ({dataset_display_name}).\n Please check biomart host parameter\u001b[0m\nFor more info see: https://m.ensembl.org/info/website/archives/assembly.html')
        # check wether dataset can be accessed.
        if dataset_name not in mart.list_datasets()['name'].to_numpy():
            raise Exception(
                '{} could not be found as a dataset in biomart. Check species name or consider manually providing gene annotations!')
        else:
            log.info(
                "Downloading gene annotation from biomart dataset: {}".format(dataset_name))
            dataset = mart[dataset_name]
            if 'external_gene_name' not in dataset.attributes.keys():
                external_gene_name_query = 'hgnc_symbol'
            else:
                external_gene_name_query = 'external_gene_name'
            if 'transcription_start_site' not in dataset.attributes.keys():
                transcription_start_site_query = 'transcript_start'
            else:
                transcription_start_site_query = 'transcription_start_site'
            annot = dataset.query(attributes=['chromosome_name', 'start_position', 'end_position',
                                  'strand', external_gene_name_query, transcription_start_site_query, 'transcript_biotype'])
            annot.columns = ['Chromosome', 'Start', 'End', 'Strand',
                             'Gene', 'Transcription_Start_Site', 'Transcript_type']
            annot['Chromosome'] = 'chr' + annot['Chromosome'].astype(str)
            annot = annot[annot.Transcript_type == 'protein_coding']
            annot.Strand[annot.Strand == 1] = '+'
            annot.Strand[annot.Strand == -1] = '-'
            annot = pr.PyRanges(annot.dropna(axis=0))
            if not any(['chr' in c for c in SCENICPLUS_obj.region_names]):
                annot.Chromosome = annot.Chromosome.str.replace('chr', '')
            

        # 2. Download chromosome sizes from UCSC genome browser
        import requests
        target_url = 'http://hgdownload.cse.ucsc.edu/goldenPath/{asm}/bigZips/{asm}.chrom.sizes'.format(
            asm=assembly)
        # check wether url exists
        request = requests.get(target_url)
        if request.status_code == 200:
            log.info("Downloading chromosome sizes from: {}".format(target_url))
            chromsizes = pd.read_csv(target_url, sep='\t', header=None)
            chromsizes.columns = ['Chromosome', 'End']
            chromsizes['Start'] = [0]*chromsizes.shape[0]
            chromsizes = chromsizes.loc[:, ['Chromosome', 'Start', 'End']]
            if not any(['chr' in c for c in SCENICPLUS_obj.region_names]):
                annot.Chromosome = annot.Chromosome.str.replace('chr', '')
            chromsizes = pr.PyRanges(chromsizes)
        else:
            raise Exception(
                'The assembly {} could not be found in http://hgdownload.cse.ucsc.edu/goldenPath/. Check assembly name or consider manually providing chromosome sizes!'.format(assembly))
    else:
        # Manually provided gene annotation and chromsizes
        annot = pr_annot
        chromsizes = pr_chromsizes

    # Add gene width
    if annot.df['Gene'].isnull().to_numpy().any():
        annot = pr.PyRanges(annot.df.fillna(value={'Gene': 'na'}))
    annot.Gene_width = abs(annot.End - annot.Start).astype(np.int32)

    # Prepare promoter annotation
    pd_promoters = annot.df.loc[:, ['Chromosome',
                                    'Transcription_Start_Site', 'Strand', 'Gene']]
    pd_promoters['Transcription_Start_Site'] = (
        pd_promoters.loc[:, 'Transcription_Start_Site']
    ).astype(np.int32)
    pd_promoters['End'] = (
        pd_promoters.loc[:, 'Transcription_Start_Site']).astype(np.int32)
    pd_promoters.columns = ['Chromosome', 'Start', 'Strand', 'Gene', 'End']
    pd_promoters = pd_promoters.loc[:, [
        'Chromosome', 'Start', 'End', 'Strand', 'Gene']]
    pr_promoters = pr.PyRanges(pd_promoters)
    log.info('Extending promoter annotation to {} bp upstream and {} downstream'.format(
        str(extend_tss[0]), str(extend_tss[1])))
    pr_promoters = extend_pyranges(pr_promoters, extend_tss[0], extend_tss[1])

    if use_gene_boundaries or predefined_boundaries:
        if predefined_boundaries:
            predefined_boundaries_pos = predefined_boundaries.df.copy()
            predefined_boundaries_neg = predefined_boundaries.df.copy()
            predefined_boundaries_pos['Strand'] = '+'
            predefined_boundaries_neg['Strand'] = '-'
            predefined_boundaries = pr.PyRanges(
                pd.concat(
                    [
                        predefined_boundaries_pos,
                        predefined_boundaries_neg
                    ]
                )
            )
            space = predefined_boundaries
            log.info('Using predefined domains')
            use_gene_boundaries = False
        if use_gene_boundaries:
            space = pr_promoters
            log.info('Calculating gene boundaries')
        # add chromosome limits
        chromsizes_begin_pos = chromsizes.df.copy()
        chromsizes_begin_pos['End'] = 1
        chromsizes_begin_pos['Strand'] = '+'
        chromsizes_begin_pos['Gene'] = 'Chrom_Begin'
        chromsizes_begin_neg = chromsizes_begin_pos.copy()
        chromsizes_begin_neg['Strand'] = '-'
        chromsizes_end_pos = chromsizes.df.copy()
        chromsizes_end_pos['Start'] = chromsizes_end_pos['End'] - 1
        chromsizes_end_pos['Strand'] = '+'
        chromsizes_end_pos['Gene'] = 'Chrom_End'
        chromsizes_end_neg = chromsizes_end_pos.copy()
        chromsizes_end_neg['Strand'] = '-'
        gene_bound = pr.PyRanges(
            pd.concat(
                [
                    pr_promoters.df,
                    chromsizes_begin_pos,
                    chromsizes_begin_neg,
                    chromsizes_end_pos,
                    chromsizes_end_neg
                ]
            )
        )

        # Get distance to nearest promoter (of a differrent gene)
        annot_nodup = annot[['Chromosome',
                             'Start',
                             'End',
                             'Strand',
                             'Gene',
                             'Gene_width',
                             'Gene_size_weight']].drop_duplicate_positions().copy()
        annot_nodup = pr.PyRanges(
            annot_nodup.df.drop_duplicates(subset="Gene", keep="first"))

        closest_promoter_upstream = annot_nodup.nearest(
            gene_bound, overlap=False, how='upstream')
        closest_promoter_upstream = closest_promoter_upstream[[
            'Chromosome', 'Start', 'End', 'Strand', 'Gene', 'Distance']]

        closest_promoter_downstream = annot_nodup.nearest(
            gene_bound, overlap=False, how='downstream')
        closest_promoter_downstream = closest_promoter_downstream[[
            'Chromosome', 'Start', 'End', 'Strand', 'Gene', 'Distance']]

        # Add distance information and limit if above/below thresholds
        annot_df = annot_nodup.df
        annot_df = annot_df.set_index('Gene')
        closest_promoter_upstream_df = closest_promoter_upstream.df.set_index(
            'Gene').Distance
        closest_promoter_upstream_df.name = 'Distance_upstream'
        annot_df = pd.concat(
            [annot_df, closest_promoter_upstream_df], axis=1, sort=False)

        closest_promoter_downstream_df = closest_promoter_downstream.df.set_index(
            'Gene').Distance
        closest_promoter_downstream_df.name = 'Distance_downstream'
        annot_df = pd.concat(
            [annot_df, closest_promoter_downstream_df], axis=1, sort=False).reset_index()

        annot_df.loc[annot_df.Distance_upstream <
                     upstream[0], 'Distance_upstream'] = upstream[0]
        annot_df.loc[annot_df.Distance_upstream >
                     upstream[1], 'Distance_upstream'] = upstream[1]
        annot_df.loc[annot_df.Distance_downstream <
                     downstream[0], 'Distance_downstream'] = downstream[0]
        annot_df.loc[annot_df.Distance_downstream >
                     downstream[1], 'Distance_downstream'] = downstream[1]

        annot_nodup = pr.PyRanges(annot_df.dropna(axis=0))

        # Extend to search space
        log.info(
            """Extending search space to: 
            \t\t\t\t\t\tA minimum of {} bp downstream of the end of the gene.
            \t\t\t\t\t\tA minimum of {} bp upstream of the start of the gene.
            \t\t\t\t\t\tA maximum of {} bp downstream of the end of the gene or the promoter of the nearest downstream gene.
            \t\t\t\t\t\tA maximum of {} bp upstream of the start of the gene or the promoter of the nearest upstream gene""".format(str(downstream[0]), str(upstream[0]), str(downstream[1]), str(upstream[1])))

        extended_annot = extend_pyranges_with_limits(annot_nodup)
        extended_annot = extended_annot[['Chromosome',
                                         'Start',
                                         'End',
                                         'Strand',
                                         'Gene',
                                         'Gene_width',
                                         'Distance_upstream',
                                         'Distance_downstream']]
    else:
        log.info(
            """Extending search space to:
            \t\t\t\t\t\t{} bp downstream of the end of the gene.
            \t\t\t\t\t\t{} bp upstream of the start of the gene.""".format(str(downstream[1]), str(upstream[1])))
        annot_nodup = annot[['Chromosome',
                             'Start',
                             'End',
                             'Strand',
                             'Gene',
                             'Gene_width',
                             'Gene_size_weight']].drop_duplicate_positions().copy()
        annot_nodup = pr.PyRanges(
            annot_nodup.df.drop_duplicates(subset="Gene", keep="first"))
        extended_annot = extend_pyranges(annot, upstream[1], downstream[1])
        extended_annot = extended_annot[[
            'Chromosome', 'Start', 'End', 'Strand', 'Gene', 'Gene_width', 'Gene_size_weight']]

    # Format search space
    extended_annot = extended_annot.drop_duplicate_positions()

    log.info('Intersecting with regions.')
    regions_per_gene = pr_regions.join(extended_annot)
    regions_per_gene.Width = abs(
        regions_per_gene.End - regions_per_gene.Start).astype(np.int32)
    regions_per_gene.Start = round(
        regions_per_gene.Start + regions_per_gene.Width / 2).astype(np.int32)
    regions_per_gene.End = (regions_per_gene.Start + 1).astype(np.int32)
    # Calculate distance
    log.info('Calculating distances from region to gene')
    if use_gene_boundaries or predefined_boundaries:
        regions_per_gene = reduce_pyranges_with_limits_b(regions_per_gene)
        regions_per_gene = calculate_distance_with_limits_join(
            regions_per_gene)
    else:
        regions_per_gene = reduce_pyranges_b(
            regions_per_gene, upstream[1], downstream[1])
        regions_per_gene = calculate_distance_join(regions_per_gene)

    # Remove DISTAL regions overlapping with promoters
    if remove_promoters:
        log.info('Removing DISTAL regions overlapping promoters')
        regions_per_gene_overlapping_genes = regions_per_gene[regions_per_gene.Distance == 0]
        regions_per_gene_distal = regions_per_gene[regions_per_gene.Distance != 0]
        regions_per_gene_distal_wo_promoters = regions_per_gene_distal.overlap(
            pr_promoters, invert=True)
        regions_per_gene = pr.PyRanges(pd.concat(
            [regions_per_gene_overlapping_genes.df, regions_per_gene_distal_wo_promoters.df]))

    regions_per_gene = pr.PyRanges(regions_per_gene.df.drop_duplicates())

    if implode_entries:
        log.info('Imploding multiple entries per region and gene')
        df = regions_per_gene.df
        default_columns = ['Chromosome', 'Start',
                           'End', 'Name', 'Strand', 'Gene']
        agg_dict_func1 = {column: lambda x: x.tolist()[0]
                          for column in default_columns}
        agg_dict_func2 = {column: lambda x: x.tolist()
                          for column in set(list(df.columns)) - set(default_columns)}
        agg_dict_func = {**agg_dict_func1, **agg_dict_func2}
        df = df.groupby(['Gene', 'Name'], as_index=False).agg(agg_dict_func)
        regions_per_gene = pr.PyRanges(df)
    else:
        Warning(
            'Not imploding might cause error when calculating region to gene links.')

    log.info('Done!')
    if inplace:
        SCENICPLUS_obj.uns[key_added] = regions_per_gene.df[[
            'Name', 'Gene', 'Distance']]
    else:
        return regions_per_gene.df[['Name', 'Gene', 'Distance']]


@ray.remote
def _score_regions_to_single_gene_ray(X, y, gene_name, region_names, regressor_type, regressor_kwargs) -> list:
    return _score_regions_to_single_gene(X, y, gene_name, region_names, regressor_type, regressor_kwargs)


def _score_regions_to_single_gene(X, y, gene_name, region_names, regressor_type, regressor_kwargs) -> list:
    """
    Calculates region to gene importances or region to gene correlations for a single gene
    :param X: numpy array containing matrix of accessibility of regions in search space
    :param y: numpy array containing expression vector
    :param regressor_type: type of regression/correlation analysis. 
           Available regression analysis are: 'RF' (Random Forrest regression), 'ET' (Extra Trees regression), 'GBM' (Gradient Boostin regression).
           Available correlation analysis are: 'PR' (pearson correlation), 'SR' (spearman correlation).
    :param regressor_kwargs: arguments to pass to regression function.
    :returns feature_importance for regression methods and correlation_coef for correlation methods
    """
    if regressor_type in SKLEARN_REGRESSOR_FACTORY.keys():
        from arboreto import core as arboreto_core
        # fit model
        fitted_model = arboreto_core.fit_model(regressor_type=regressor_type,
                                               regressor_kwargs=regressor_kwargs,
                                               tf_matrix=X,
                                               target_gene_expression=y)
        # get importance scores for each feature
        feature_importance = arboreto_core.to_feature_importances(regressor_type=regressor_type,
                                                                  regressor_kwargs=regressor_kwargs,
                                                                  trained_regressor=fitted_model)
        return pd.Series(feature_importance, index=region_names), gene_name

    if regressor_type in SCIPY_CORRELATION_FACTORY.keys():
        # define correlation method
        correlator = SCIPY_CORRELATION_FACTORY[regressor_type]

        # do correlation and get correlation coef and p value
        correlation_result = np.array([correlator(x, y) for x in X.T])
        correlation_coef = correlation_result[:, 0]

        # , correlation_adj_pval
        return pd.Series(correlation_coef, index=region_names), gene_name


def _score_regions_to_genes(SCENICPLUS_obj: SCENICPLUS,
                           search_space: pd.DataFrame,
                           mask_expr_dropout=False,
                           genes=None,
                           regressor_type='GBM',
                           ray_n_cpu=None,
                           regressor_kwargs=GBM_KWARGS,
                           **kwargs) -> dict:
    """
    Wrapper function for score_regions_to_single_gene and score_regions_to_single_gene_ray.
    Calculates region to gene importances or region to gene correlations for multiple genes
    :param SCENICPLUS_obj: instance of SCENICPLUS class containing expression data and chromatin accessbility data
    :param genes: list of genes for which to calculate region gene scores. Uses all genes if set to None
    :param regressor_type: type of regression/correlation analysis. 
           Available regression analysis are: 'RF' (Random Forrest regression), 'ET' (Extra Trees regression), 'GBM' (Gradient Boostin regression).
           Available correlation analysis are: 'PR' (pearson correlation), 'SR' (spearman correlation).
    :param regressor_kwargs: arguments to pass to regression function.
    :param **kwargs: additional parameters to pass to ray.init.
    :returns dictionary with genes as keys and importance score or correlation coefficient 
             as values for resp. regression based and correlation based calculations.
    """
    # Check overlaps with search space (Issue #1)
    search_space = search_space[search_space['Name'].isin(
        SCENICPLUS_obj.region_names)]
    if genes is None:
        genes_to_use = list(set.intersection(
            set(search_space['Gene']), set(SCENICPLUS_obj.gene_names)))
    elif not all(np.isin(genes, list(search_space['Gene']))):
        genes_to_use = list(set.intersection(
            set(search_space['Gene']), set(genes)))
    else:
        genes_to_use = genes
    # get expression and chromatin accessibility dataframes only once
    EXP_df = SCENICPLUS_obj.to_df(layer='EXP')
    ACC_df = SCENICPLUS_obj.to_df(layer='ACC')
    if ray_n_cpu != None:
        ray.init(num_cpus=ray_n_cpu, **kwargs)
        try:
            jobs = []
            for gene in tqdm(genes_to_use, total=len(genes_to_use), desc='initializing'):
                regions_in_search_space = search_space.loc[search_space['Gene']
                                                           == gene, 'Name'].values
                if mask_expr_dropout:
                    expr = EXP_df[gene]
                    cell_non_zero = expr.index[expr != 0]
                    expr = expr.loc[cell_non_zero].to_numpy()
                    acc = ACC_df.loc[regions_in_search_space,
                                     cell_non_zero].T.to_numpy()
                else:
                    expr = EXP_df[gene].to_numpy()
                    acc = ACC_df.loc[regions_in_search_space].T.to_numpy()
                # Check-up for genes with 1 region only, related to issue 2
                if acc.ndim == 1:
                    acc = acc.reshape(-1, 1)
                jobs.append(_score_regions_to_single_gene_ray.remote(X=acc,
                                                                    y=expr,
                                                                    gene_name=gene,
                                                                    region_names=regions_in_search_space,
                                                                    regressor_type=regressor_type,
                                                                    regressor_kwargs=regressor_kwargs))

            # add progress bar, adapted from: https://github.com/ray-project/ray/issues/8164
            def to_iterator(obj_ids):
                while obj_ids:
                    finished_ids, obj_ids = ray.wait(obj_ids)
                    for finished_id in finished_ids:
                        yield ray.get(finished_id)
            regions_to_genes = {}
            for importance, gene_name in tqdm(to_iterator(jobs),
                                              total=len(jobs),
                                              desc=f'Running using {ray_n_cpu} cores',
                                              smoothing=0.1):
                regions_to_genes[gene_name] = importance
        except Exception as e:
            print(e)
        finally:
            ray.shutdown()
    else:
        regions_to_genes = {}
        for gene in tqdm(genes_to_use, total=len(genes_to_use), desc=f'Running using a single core'):
            regions_in_search_space = search_space.loc[search_space['Gene']
                                                       == gene, 'Name'].values
            if mask_expr_dropout:
                expr = EXP_df[gene]
                cell_non_zero = expr.index[expr != 0]
                expr = expr.loc[cell_non_zero].to_numpy()
                acc = ACC_df.loc[regions_in_search_space,
                                 cell_non_zero].T.to_numpy()
            else:
                expr = EXP_df[gene].to_numpy()
                acc = ACC_df.loc[regions_in_search_space].T.to_numpy()
            # Check-up for genes with 1 region only, related to issue 2
            if acc.ndim == 1:
                acc = acc.reshape(-1, 1)
            regions_to_genes[gene], _ = _score_regions_to_single_gene(X=acc,
                                                                     y=expr,
                                                                     gene_name=gene,
                                                                     region_names=regions_in_search_space,
                                                                     regressor_type=regressor_type,
                                                                     regressor_kwargs=regressor_kwargs)

    return regions_to_genes


[docs]def calculate_regions_to_genes_relationships(SCENICPLUS_obj: SCENICPLUS,
                                             search_space_key: str = 'search_space',
                                             mask_expr_dropout: bool = False,
                                             genes: List[str] = None,
                                             importance_scoring_method: str = 'GBM',
                                             importance_scoring_kwargs: dict = GBM_KWARGS,
                                             correlation_scoring_method: str = 'SR',
                                             ray_n_cpu: int = None,
                                             add_distance: bool = True,
                                             key_added: str = 'region_to_gene',
                                             inplace: bool = True,
                                             **kwargs):
    """
    Calculates region to gene relationships using non-linear regression methods and correlation

    Parameters
    ----------
    SCENICPLUS_obj: SCENICPLUS
        instance of SCENICPLUS class containing expression data and chromatin accessbility data
    search_space_key: str = 'search_space'
        a key in SCENICPLUS_obj.uns.keys pointing to a dataframe containing the search space surounding each gene.
    mask_expr_dropout: bool = False
        Wether or not to exclude cells which have zero counts for a gene from the calculations
    genes: List[str] None
        list of genes for which to calculate region gene scores. Default is None, i.e. all genes
    importance_scoring_method: str = GBM
        method used to score region to gene importances. Available regression analysis are: 'RF' (Random Forrest regression), 'ET' (Extra Trees regression), 'GBM' (Gradient Boostin regression).
    importance_scoring_kwargs: dict = GBM_KWARGS
        arguments to pass to the importance scoring function
    correlation_scoring_method: str = SR
        method used to calculate region to gene correlations. Available correlation analysis are: 'PR' (pearson correlation), 'SR' (spearman correlation).
    ray_n_cpu: int = None
        number of cores to use for ray multi-processing. Does not use ray when set to None
    add_distance: bool = True
        Wether or not to return region to gene distances
    key_added: str = region_to_gene
        Key in SCENICPLUS_obj.uns under which to store region to gene links, only stores when inplace = True
    inplace: bool = True
        Wether or not store the region to gene links in the SCENICPLUS_obj, if False a pd.DataFrame will be returned.
    """
    # Create logger
    level = logging.INFO
    format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=format, handlers=handlers)
    log = logging.getLogger('R2G')
    if search_space_key not in SCENICPLUS_obj.uns.keys():
        raise Exception(
            f'key {search_space_key} not found in SCENICPLUS_obj.uns, first get search space using function: "get_search_space"')

    search_space = SCENICPLUS_obj.uns[search_space_key]
    # Check overlaps with search space (Issue #1)
    search_space = search_space[search_space['Name'].isin(
        SCENICPLUS_obj.region_names)]

    # calulcate region to gene importance
    log.info(
        f'Calculating region to gene importances, using {importance_scoring_method} method')
    start_time = time.time()
    region_to_gene_importances = _score_regions_to_genes(SCENICPLUS_obj,
                                                        search_space=search_space,
                                                        mask_expr_dropout=mask_expr_dropout,
                                                        genes=genes,
                                                        regressor_type=importance_scoring_method,
                                                        regressor_kwargs=importance_scoring_kwargs,
                                                        ray_n_cpu=ray_n_cpu,
                                                        **kwargs)
    log.info('Took {} seconds'.format(time.time() - start_time))

    # calculate region to gene correlation
    log.info(
        f'Calculating region to gene correlation, using {correlation_scoring_method} method')
    start_time = time.time()
    region_to_gene_correlation = _score_regions_to_genes(SCENICPLUS_obj,
                                                        search_space=search_space,
                                                        mask_expr_dropout=mask_expr_dropout,
                                                        genes=genes,
                                                        regressor_type=correlation_scoring_method,
                                                        ray_n_cpu=ray_n_cpu,
                                                        **kwargs)
    log.info('Took {} seconds'.format(time.time() - start_time))

    # transform dictionaries to pandas dataframe
    try:
        result_df = pd.concat([pd.DataFrame(data={'target': gene,
                                                  'region': region_to_gene_importances[gene].index.to_list(),
                                                  'importance': region_to_gene_importances[gene].to_list(),
                                                  'rho': region_to_gene_correlation[gene].loc[
                                                      region_to_gene_importances[gene].index.to_list()].to_list()})
                               for gene in region_to_gene_importances.keys()
                               ]
                              )
        result_df = result_df.reset_index()
        result_df = result_df.drop('index', axis=1)
        result_df['importance_x_rho'] = result_df['rho'] * \
            result_df['importance']
        result_df['importance_x_abs_rho'] = abs(
            result_df['rho']) * result_df['importance']

    except:
        print('An error occured!')
        return region_to_gene_importances, region_to_gene_correlation

    if add_distance:
        # TODO: use consistent column names
        search_space_rn = search_space.rename(
            {'Name': 'region', 'Gene': 'target'}, axis=1).copy()
        result_df = result_df.merge(search_space_rn, on=['region', 'target'])
        #result_df['Distance'] = result_df['Distance'].map(lambda x: x[0])
    log.info('Done!')

    if inplace:
        SCENICPLUS_obj.uns[key_added] = result_df
    else:
        return result_df


[docs]def export_to_UCSC_interact(SCENICPLUS_obj: SCENICPLUS,
                            species: str,
                            outfile: str,
                            region_to_gene_key: str =' region_to_gene',
                            pbm_host:str = 'http://www.ensembl.org',
                            bigbed_outfile:str = None,
                            path_bedToBigBed: str= None,
                            assembly: str = None,
                            ucsc_track_name: str = 'region_to_gene',
                            ucsc_description: str = 'interaction file for region to gene',
                            cmap_neg: str = 'Reds',
                            cmap_pos: str = 'Greens',
                            key_for_color: str = 'importance',
                            vmin: int = 0,
                            vmax: int = 1,
                            scale_by_gene: bool = True,
                            subset_for_eRegulons_regions: bool = True,
                            eRegulons_key: str = 'eRegulons') -> pd.DataFrame:
    """
    Exports interaction dataframe to UCSC interaction file and (optionally) UCSC bigInteract file.

    Parameters
    ----------
    SCENICPLUS_obj: SCENICPLUS
        An instance of class scenicplus_class.SCENICPLUS containing region to gene links in .uns.
    species: str
        Species corresponding to your datassets (e.g. hsapiens)
    outfile: str
        Path to output file 
    region_to_gene_key: str =' region_to_gene'
        Key in SCENICPLUS_obj.uns.keys() under which to find region to gene links.
    pbm_host:str = 'http://www.ensembl.org'
        Url of biomart host relevant for your assembly.
    bigbed_outfile:str = None
        Path to which to write the bigbed output.
    path_bedToBigBed: str= None
        Path to bedToBigBed program, used to convert bed file to bigbed format. Can be downloaded from http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bedToBigBed
    assembly: str = None
        String identifying the assembly of your dataset (e.g. hg39).
    ucsc_track_name: str = 'region_to_gene'
        Name of the exported UCSC track
    ucsc_description: str = 'interaction file for region to gene'
        Description of the exported UCSC track
    cmap_neg: str = 'Reds'
        Matplotlib colormap used to color negative region to gene links.
    cmap_pos: str = 'Greens'
        Matplotlib colormap used to color positive region to gene links.
    key_for_color: str = 'importance'
        Key pointing to column in region to gene links used to map cmap colors to.
    vmin: int = 0  
        vmin of region to gene link colors.
    vmax: int = 1
        vmax of region to gene link colors.
    scale_by_gene: bool = True
        Boolean specifying wether to scale importance scores of regions linking to the same gene from 0 to 1
    subset_for_eRegulons_regions: bool = True
        Boolean specifying wether or not to subset region to gene links for regions and genes in eRegulons.
    eRegulons_key: str = 'eRegulons'
        key in SCENICPLUS_obj.uns.keys() under which to find eRegulons.
    
    Returns
    -------
    pd.DataFrame with region to gene links formatted in the UCSC interaction format.
    """
    # Create logger
    level = logging.INFO
    format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    handlers = [logging.StreamHandler(stream=sys.stdout)]
    logging.basicConfig(level=level, format=format, handlers=handlers)
    log = logging.getLogger('R2G')

    if region_to_gene_key not in SCENICPLUS_obj.uns.keys():
        raise Exception(
            f'key {region_to_gene_key} not found in SCENICPLUS_obj.uns, first calculate region to gene relationships using function: "calculate_regions_to_genes_relationships"')

    region_to_gene_df = SCENICPLUS_obj.uns[region_to_gene_key].copy()

    if subset_for_eRegulons_regions:
        if eRegulons_key not in SCENICPLUS_obj.uns.keys():
            raise ValueError(
                f'key {eRegulons_key} not found in SCENICPLUS_obj.uns.keys()')
        eRegulon_regions = list(set(flatten_list(
            [ereg.target_regions for ereg in SCENICPLUS_obj.uns[eRegulons_key]])))
        region_to_gene_df.index = region_to_gene_df['region']
        region_to_gene_df = region_to_gene_df.loc[eRegulon_regions].reset_index(
            drop=True)

    # Rename columns to be in line with biomart annotation
    region_to_gene_df.rename(columns={'target': 'Gene'}, inplace=True)

    # Get TSS annotation (end-point for links)
    log.info('Downloading gene annotation from biomart, using dataset: {}'.format(
        species+'_gene_ensembl'))
    import pybiomart as pbm
    dataset = pbm.Dataset(name=species+'_gene_ensembl',  host=pbm_host)
    annot = dataset.query(attributes=['chromosome_name', 'start_position', 'end_position',
                          'strand', 'external_gene_name', 'transcription_start_site', 'transcript_biotype'])
    annot.columns = ['Chromosome', 'Start', 'End', 'Strand',
                     'Gene', 'Transcription_Start_Site', 'Transcript_type']
    annot['Chromosome'] = 'chr' + \
        annot['Chromosome'].astype(str)
    annot = annot[annot.Transcript_type == 'protein_coding']
    annot.Strand[annot.Strand == 1] = '+'
    annot.Strand[annot.Strand == -1] = '-'
    if not any(['chr' in c for c in SCENICPLUS_obj.region_names]):
        annot.Chromosome = annot.Chromosome.str.replace('chr', '')

    log.info('Formatting data ...')
    # get gene to tss mapping, take the one equal to the gene start/end location if possible otherwise take the first one
    annot['TSSeqStartEnd'] = np.logical_or(
        annot['Transcription_Start_Site'] == annot['Start'], annot['Transcription_Start_Site'] == annot['End'])
    gene_to_tss = annot[['Gene', 'Transcription_Start_Site']].groupby(
        'Gene').agg(lambda x: list(map(str, x)))
    startEndEq = annot[['Gene', 'TSSeqStartEnd']
                       ].groupby('Gene').agg(lambda x: list(x))
    gene_to_tss['Transcription_Start_Site'] = [np.array(tss[0])[eq[0]][0] if sum(
        eq[0]) >= 1 else tss[0][0] for eq, tss in zip(startEndEq.values, gene_to_tss.values)]
    gene_to_tss.columns = ['TSS_Gene']

    # get gene to strand mapping
    gene_to_strand = annot[['Gene', 'Strand']].groupby(
        'Gene').agg(lambda x: list(map(str, x))[0])

    # get gene to chromosome mapping (should be the same as the regions mapped to the gene)
    gene_to_chrom = annot[['Gene', 'Chromosome']].groupby(
        'Gene').agg(lambda x: list(map(str, x))[0])

    # add TSS for each gene to region_to_gene_df
    region_to_gene_df = region_to_gene_df.join(gene_to_tss, on='Gene')

    # add strand for each gene to region_to_gene_df
    region_to_gene_df = region_to_gene_df.join(gene_to_strand, on='Gene')

    # add chromosome for each gene to region_to_gene_df
    region_to_gene_df = region_to_gene_df.join(gene_to_chrom, on='Gene')

    # get chrom, chromStart, chromEnd
    region_to_gene_df.dropna(axis=0, how='any', inplace=True)
    arr = region_names_to_coordinates(region_to_gene_df['region']).to_numpy()
    chrom, chromStart, chromEnd = np.split(arr, 3, 1)
    chrom = chrom[:, 0]
    chromStart = chromStart[:, 0]
    chromEnd = chromEnd[:, 0]

    # get source chrom, chromStart, chromEnd (i.e. middle of regions)
    sourceChrom = chrom
    sourceStart = np.array(
        list(map(int, chromStart + (chromEnd - chromStart)/2 - 1)))
    sourceEnd = np.array(
        list(map(int, chromStart + (chromEnd - chromStart)/2)))

    # get target chrom, chromStart, chromEnd (i.e. TSS)
    targetChrom = region_to_gene_df['Chromosome']
    targetStart = region_to_gene_df['TSS_Gene'].values
    targetEnd = list(map(str, np.array(list(map(int, targetStart))) + np.array(
        [1 if strand == '+' else -1 for strand in region_to_gene_df['Strand'].values])))

    # get color
    norm = Normalize(vmin=vmin, vmax=vmax)
    if scale_by_gene:
        grouper = Groupby(
            region_to_gene_df.loc[region_to_gene_df['rho'] >= 0, 'Gene'].to_numpy())
        scores = region_to_gene_df.loc[region_to_gene_df['rho']
                                       >= 0, key_for_color].to_numpy()
        mapper = cm.ScalarMappable(norm=norm, cmap=cmap_pos)

        def _value_to_color(scores):
            S = (scores - scores.min()) / (scores.max() - scores.min())
            return [','.join([str(x) for x in mapper.to_rgba(s, bytes=True)][0:3]) for s in S]

        colors_pos = np.zeros(len(scores), dtype='object')
        for idx in grouper.indices:
            colors_pos[idx] = _value_to_color(scores[idx])

        grouper = Groupby(
            region_to_gene_df.loc[region_to_gene_df['rho'] < 0, 'Gene'].to_numpy())
        scores = region_to_gene_df.loc[region_to_gene_df['rho']
                                       < 0, key_for_color].to_numpy()
        mapper = cm.ScalarMappable(norm=norm, cmap=cmap_neg)

        def _value_to_color(scores):
            S = (scores - scores.min()) / (scores.max() - scores.min())
            return [','.join([str(x) for x in mapper.to_rgba(s, bytes=True)][0:3]) for s in S]

        colors_neg = np.zeros(len(scores), dtype='object')
        for idx in grouper.indices:
            colors_neg[idx] = _value_to_color(scores[idx])

    else:
        scores = region_to_gene_df.loc[region_to_gene_df['rho']
                                       >= 0, key_for_color].to_numpy()
        mapper = cm.ScalarMappable(norm=norm, cmap=cmap_pos)
        colors_pos = [
            ','.join([str(x) for x in mapper.to_rgba(s, bytes=True)][0:3]) for s in scores]

        scores = region_to_gene_df.loc[region_to_gene_df['rho']
                                       < 0, key_for_color].to_numpy()
        mapper = cm.ScalarMappable(norm=norm, cmap=cmap_neg)
        colors_neg = [
            ','.join([str(x) for x in mapper.to_rgba(s, bytes=True)][0:3]) for s in scores]

    region_to_gene_df.loc[region_to_gene_df['rho'] >= 0, 'color'] = colors_pos
    region_to_gene_df.loc[region_to_gene_df['rho'] < 0,  'color'] = colors_neg
    region_to_gene_df['color'] = region_to_gene_df['color'].fillna('55,55,55')

    # get name for regions (add incremental number to gene in range of regions linked to gene)
    counter = 1
    previous_gene = region_to_gene_df['Gene'].values[0]
    names = []
    for gene in region_to_gene_df['Gene'].values:
        if gene != previous_gene:
            counter = 1
        else:
            counter += 1
        names.append(gene + '_' + str(counter))
        previous_gene = gene

    # format final interact dataframe
    df_interact = pd.DataFrame(
        data={
            'chrom':        chrom,
            'chromStart':   chromStart,
            'chromEnd':     chromEnd,
            'name':         names,
            'score':        (1000*(region_to_gene_df['importance'].values - np.min(region_to_gene_df['importance'].values))/np.ptp(region_to_gene_df['importance'].values)).astype(int) ,
            'value':        region_to_gene_df['importance'].values,
            'exp':          np.repeat('.', len(region_to_gene_df)),
            'color':        region_to_gene_df['color'].values,
            'sourceChrom':  sourceChrom,
            'sourceStart':  sourceStart,
            'sourceEnd':    sourceEnd,
            'sourceName':   names,
            'sourceStrand': np.repeat('.', len(region_to_gene_df)),
            'targetChrom':  targetChrom,
            'targetStart':  targetStart,
            'targetEnd':    targetEnd,
            'targetName':   region_to_gene_df['Gene'].values,
            'targetStrand': region_to_gene_df['Strand'].values
        }
    )
    # sort dataframe
    df_interact = df_interact.sort_values(by=['chrom', 'chromStart'])
    # Write interact file
    log.info('Writing data to: {}'.format(outfile))
    with open(outfile, 'w') as f:
        f.write('track type=interact name="{}" description="{}" useScore=0 maxHeightPixels=200:100:50 visibility=full\n'.format(
            ucsc_track_name, ucsc_description))
        df_interact.to_csv(f, header=False, index=False, sep='\t')

    # write bigInteract file
    if bigbed_outfile != None:
        log.info('Writing data to: {}'.format(bigbed_outfile))
        outfolder = bigbed_outfile.rsplit('/', 1)[0]
        # write bed file without header to tmp file
        df_interact.to_csv(os.path.join(
            outfolder, 'interact.bed.tmp'), header=False, index=False, sep='\t')

        # check if auto sql definition for interaction file exists in outfolder, otherwise create it
        if not os.path.exists(os.path.join(outfolder, 'interact.as')):
            with open(os.path.join(outfolder, 'interact.as'), 'w') as f:
                f.write(INTERACT_AS)
        # convert interact.bed.tmp to bigBed format
        # bedToBigBed -as=interact.as -type=bed5+13 region_to_gene_no_head.interact https://genome.ucsc.edu/goldenPath/help/hg38.chrom.sizes region_to_gene.inter.bb
        cmds = [
            os.path.join(path_bedToBigBed, 'bedToBigBed'),
            '-as={}'.format(os.path.join(os.path.join(outfolder, 'interact.as'))),
            '-type=bed5+13',
            os.path.join(outfolder, 'interact.bed.tmp'),
            'https://hgdownload.cse.ucsc.edu/goldenpath/' + assembly + '/bigZips/' + assembly + '.chrom.sizes',
            bigbed_outfile
        ]
        p = subprocess.Popen(cmds, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
        if p.returncode:
            raise ValueError(
                "cmds: %s\nstderr:%s\nstdout:%s" % (
                    " ".join(cmds), stderr, stdout)
            )
    return df_interact