Source code for scenicplus.plotting.correlation_plot

"""Plot correlation and overlap of eRegulons

"""

from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import squareform
import seaborn as sns
import numpy as np
import pandas as pd
import plotly.express as px
from typing import List, Tuple
from typing import Optional, Union
import sklearn
import matplotlib.pyplot as plt
from itertools import combinations

from ..scenicplus_class import SCENICPLUS
from ..utils import p_adjust_bh, flatten_list



[docs]
def correlation_heatmap(scplus_obj: SCENICPLUS,
                        auc_key: Optional[str] = 'eRegulon_AUC',
                        signature_keys: Optional[List[str]] = ['Gene_based', 'Region_based'],
                        scale: Optional[bool] = False,
                        linkage_method: Optional[str] = 'average',
                        fcluster_threshold: Optional[float] = 0.1,
                        selected_regulons: Optional[List[int]] = None,
                        cmap: Optional[str] = 'viridis',
                        plotly_height: Optional[int] = 1000,
                        fontsize: Optional[int] = 3,
                        save: Optional[str] = None,
                        use_plotly: Optional[int] = True,
                        figsize: Optional[Tuple[int, int]] = (20, 20)
                        ):
    """
    Plot correlation between eRegulons enrichment,

    Parameters
    ---------
    scplus_obj: `class::SCENICPLUS`
        A SCENICPLUS object with eRegulons AUC computed.
    auc_key: str, optional
        Key to extract AUC values from. Default: 'eRegulon_AUC'
    signature_keys: List, optional
        Keys to extract AUC values from. Default: ['Gene_based', 'Region_based']
    scale: bool, optional
        Whether to scale the enrichments prior to the dimensionality reduction. Default: False
    linkage_method: str, optional
        Linkage method to use for clustering. See `scipy.cluster.hierarchy.linkage`.
    fcluster_threshold: float, optional
        Threshold to use to divide hierarchical clustering into clusters. See `scipy.cluster.hierarchy.fcluster`.
    selected_regulons: list, optional
        A list with selected regulons to be used for clustering. Default: None (use all regulons)
    cmap: str or 'matplotlib.cm', optional
        For continuous variables, color map to use for the legend color bar. Default: cm.viridis
    plotly_height: int, optional
        Height of the plotly plot. Width will be adjusted accordingly
    fontsize: int, optional
        Labels fontsize
    save: str, optional
        Path to save heatmap as file
    use_plotly: bool, optional
        Use plotly or seaborn to generate the image
    figsize: tupe, optional
        Matplotlib figsize, used only if use_plotly == False
    """
    if scale:
        data_mat = pd.concat([pd.DataFrame(sklearn.preprocessing.StandardScaler().fit_transform(
            scplus_obj.uns[auc_key][x].T), index=scplus_obj.uns[auc_key][x].T.index.to_list(), columns=scplus_obj.uns[auc_key][x].T.columns) for x in signature_keys]).T
    else:
        data_mat = pd.concat([scplus_obj.uns[auc_key][x]
                             for x in signature_keys], axis=1)
    if selected_regulons is not None:
        subset = [x for x in selected_regulons if x in data_mat.columns]
        data_mat = data_mat[subset]
    # check if some eRegulon AUC are all 0 and remove them
    all_zero_eregs = data_mat.columns[np.where(data_mat.sum(0) == 0)]
    if len(all_zero_eregs) > 0:
        print(
            f"Following eregulons have an AUC value of all zero and will be removed: {', '.join(all_zero_eregs)}")
        data_mat.drop(all_zero_eregs, axis=1, inplace=True)
    correlations = data_mat.corr()
    similarity = 1 - correlations
    # use np.clip: due to floating point impercisions some very small values become negative, clip them to 0
    Z = linkage(np.clip(squareform(similarity), 0,
                similarity.to_numpy().max()), linkage_method)
    # Clusterize the data
    labels = fcluster(Z, fcluster_threshold)
    # Keep the indices to sort labels
    labels_order = np.argsort(labels)
    # Build a new dataframe with the sorted columns
    for idx, i in enumerate(data_mat.columns[labels_order]):
        if idx == 0:
            clustered = pd.DataFrame(data_mat[i])
        else:
            df_to_append = pd.DataFrame(data_mat[i])
            clustered = pd.concat([clustered, df_to_append], axis=1)
    correlations = clustered.corr()
    if use_plotly:
        fig = px.imshow(clustered.corr(), color_continuous_scale=cmap)
        fig.update_layout(
            height=plotly_height,  # Added parameter
            legend={'itemsizing': 'trace'},
            plot_bgcolor='rgba(0,0,0,0)',
            yaxis=dict(tickfont=dict(size=fontsize)),
            xaxis=dict(tickfont=dict(size=fontsize)),
        )
        if save is not None:
            fig.write_image(save)
        fig.show()
    else:
        fig, ax = plt.subplots(figsize=figsize)
        sns.heatmap(
            data=correlations,
            cmap=cmap,
            square=True,
            ax=ax,
            robust=True,
            cbar_kws={"shrink": .50, 'label': 'Correlation'},
            xticklabels=False)
        if save is not None:
            fig.tight_layout()
            fig.savefig(save)
        plt.show()
        plt.close(fig)



def _jaccard(signature1, signature2):
    s_signature1 = set(signature1)
    s_signature2 = set(signature2)
    intersect = len(s_signature1 & s_signature2)
    union = len(s_signature1) + len(s_signature2) - intersect
    return intersect / union

def _intersect_norm_by_one(signature1, signature2):
    s_signature1 = set(signature1)
    s_signature2 = set(signature2)
    intersect = len(s_signature1 & s_signature2)
    return intersect / len(s_signature1)

from scipy.stats import fisher_exact
def _fisher_exact_sign(signature1, signature2, total):
    overlap = len(signature1 & signature2)
    contingency_table = np.array(
        [
            [total - len(signature1) - len(signature2), len(signature1) - overlap],
            [len(signature2) - overlap,                 overlap]
        ])
    return fisher_exact(contingency_table, alternative = 'greater')



[docs]
def jaccard_heatmap(scplus_obj: SCENICPLUS,
                    method: str = 'jaccard',
                    gene_or_region_based: str = 'Gene_based',
                    signature_key: Optional[str] = 'eRegulon_signatures',
                    selected_regulons: Optional[List[int]] = None,
                    linkage_method: Optional[str] = 'average',
                    fcluster_threshold: Optional[float] = 0.1,
                    cmap: Optional[str] = 'viridis',
                    plotly_height: Optional[int] = 1000,
                    fontsize: Optional[int] = 3,
                    save: Optional[str] = None,
                    use_plotly: Optional[int] = True,
                    figsize: Optional[Tuple[int, int]] = (20, 20),
                    vmin=None,
                    vmax=None,
                    return_data = False
                    ):
    """
    Plot jaccard index of regions/genes

    Parameters
    ---------
    scplus_obj: `class::SCENICPLUS`
        A SCENICPLUS object with eRegulon signatures.
    method: str
        Wether to use Jaccard (jaccard) or normalized intersection (intersect) as metric
    gene_or_region_based: str
        Gene_based or Region_based eRegulon signatures to use.
    signature_key: List, optional
        Key to extract eRegulon signatures from
    selected_regulons: list, optional
        A list with selected regulons to be used for clustering. Default: None (use all regulons)
    linkage_method: str, optional
        Linkage method to use for clustering. See `scipy.cluster.hierarchy.linkage`.
    fcluster_threshold: float, optional
        Threshold to use to divide hierarchical clustering into clusters. See `scipy.cluster.hierarchy.fcluster`.
    cmap: str or 'matplotlib.cm', optional
        For continuous variables, color map to use for the legend color bar. Default: cm.viridis
    plotly_height: int, optional
        Height of the plotly plot. Width will be adjusted accordingly
    fontsize: int, optional
        Labels fontsize
    save: str, optional
        Path to save heatmap as file
    use_plotly: bool, optional
        Use plotly or seaborn to generate the image
    figsize: tupe, optional
        Matplotlib figsize, used only if use_plotly == False
    return_data: boolean, optional
        Return data
    plot_dendrogram: boolean, optional
    """
    signatures = scplus_obj.uns[signature_key][gene_or_region_based]
    if selected_regulons is not None:
        signatures = {k: signatures[k]
                      for k in signatures.keys() if k in selected_regulons}
    signatures_names = list(signatures.keys())
    sign_combinations = list(combinations(signatures_names, 2))
    n_signatures = len(signatures_names)
    jaccards = np.zeros((n_signatures, n_signatures))

    for signature_1, signature_2 in sign_combinations:
        idx_1 = signatures_names.index(signature_1)
        idx_2 = signatures_names.index(signature_2)
        if method == 'jaccard':
            jaccards[idx_1, idx_2] = _jaccard(
                signatures[signature_1], signatures[signature_2])
            jaccards[idx_2, idx_1] = _jaccard(
                signatures[signature_2], signatures[signature_1])
        elif method == 'intersect':
            jaccards[idx_1, idx_2] = _intersect_norm_by_one(
                signatures[signature_1], signatures[signature_2])
            jaccards[idx_2, idx_1] = _intersect_norm_by_one(
                signatures[signature_2], signatures[signature_1])
    np.fill_diagonal(jaccards, 1)
    similarity = 1 - jaccards
    Z = linkage(similarity, linkage_method)
    # Clusterize the data
    labels = fcluster(Z, fcluster_threshold)
    # Keep the indices to sort labels
    labels_order = np.argsort(labels)
    clustered_jaccard_df = pd.DataFrame(
        jaccards, index=signatures_names, columns=signatures_names).iloc[labels_order, labels_order]
    if use_plotly:
        fig = px.imshow(clustered_jaccard_df, color_continuous_scale=cmap)
        fig.update_layout(
            height=plotly_height,  # Added parameter
            legend={'itemsizing': 'trace'},
            plot_bgcolor='rgba(0,0,0,0)',
            yaxis=dict(tickfont=dict(size=fontsize)),
            xaxis=dict(tickfont=dict(size=fontsize)),
        )
        if save is not None:
            fig.write_image(save)
        fig.show()
    else:
        fig, ax = plt.subplots(figsize=figsize)
        sns.heatmap(
            data=clustered_jaccard_df,
            cmap=cmap,
            square=False,
            ax=ax,
            robust=True,
            cbar_kws={"shrink": .50, 'label': 'Jaccard'},
            xticklabels=False,
            vmin=vmin,
            vmax=vmax)
        if save is not None:
            fig.tight_layout()
            fig.savefig(save)
        plt.show()
        plt.close(fig)
    if return_data:
        return clustered_jaccard_df, Z



from scipy.stats import fisher_exact
def _fisher_exact_sign(signature1, signature2, total):
    overlap = len(signature1 & signature2)
    contingency_table = np.array(
        [
            [total - len(signature1) - len(signature2), len(signature1) - overlap],
            [len(signature2) - overlap,                 overlap]
        ])
    return fisher_exact(contingency_table, alternative = 'greater')



[docs]
def fisher_exact_test_heatmap(
    scplus_obj,
    gene_or_region_based: str = 'Gene_based',
    signature_key: Optional[str] = 'eRegulon_signatures',
    selected_regulons: Optional[List[int]] = None,
    linkage_method: Optional[str] = 'average',
    fcluster_threshold: Optional[float] = 0.1,
    cmap: Optional[str] = 'viridis',
    plotly_height: Optional[int] = 1000,
    fontsize: Optional[int] = 3,
    save: Optional[str] = None,
    use_plotly: Optional[int] = True,
    figsize: Optional[Tuple[int, int]] = (20, 20),
    vmin=None,
    vmax=None,
    return_data = False):
    """
    Plot jaccard index of regions/genes

    Parameters
    ---------
    scplus_obj: `class::SCENICPLUS`
        A SCENICPLUS object with eRegulon signatures.
    method: str
        Wether to use Jaccard (jaccard) or normalized intersection (intersect) as metric
    gene_or_region_based: str
        Gene_based or Region_based eRegulon signatures to use.
    signature_key: List, optional
        Key to extract eRegulon signatures from
    selected_regulons: list, optional
        A list with selected regulons to be used for clustering. Default: None (use all regulons)
    linkage_method: str, optional
        Linkage method to use for clustering. See `scipy.cluster.hierarchy.linkage`.
    fcluster_threshold: float, optional
        Threshold to use to divide hierarchical clustering into clusters. See `scipy.cluster.hierarchy.fcluster`.
    cmap: str or 'matplotlib.cm', optional
        For continuous variables, color map to use for the legend color bar. Default: cm.viridis
    plotly_height: int, optional
        Height of the plotly plot. Width will be adjusted accordingly
    fontsize: int, optional
        Labels fontsize
    save: str, optional
        Path to save heatmap as file
    use_plotly: bool, optional
        Use plotly or seaborn to generate the image
    figsize: tupe, optional
        Matplotlib figsize, used only if use_plotly == False
    return_data: boolean, optional
        Return data
    plot_dendrogram: boolean, optional
    """
    signatures = scplus_obj.uns[signature_key][gene_or_region_based]
    if selected_regulons is not None:
        signatures = {k: signatures[k]
                      for k in signatures.keys() if k in selected_regulons}
    signatures_names = list(signatures.keys())
    sign_combinations = list(combinations(signatures_names, 2))
    n_signatures = len(signatures_names)
    fisher_exact_values = np.zeros((n_signatures, n_signatures))
    fisher_exact_pvalues =  np.ones((n_signatures, n_signatures))


    total_elements = len(set(flatten_list(scplus_obj.uns[signature_key][gene_or_region_based].values())))
    
    for signature_1, signature_2 in sign_combinations:
        idx_1 = signatures_names.index(signature_1)
        idx_2 = signatures_names.index(signature_2)
        test_result = _fisher_exact_sign(set(signatures[signature_1]), set(signatures[signature_2]), total_elements)
        fisher_exact_values[idx_1, idx_2] = test_result[0]
        fisher_exact_values[idx_2, idx_1] = test_result[0]
        fisher_exact_pvalues[idx_1, idx_2] = test_result[1]
        fisher_exact_pvalues[idx_2, idx_1] = test_result[1]
    
    pvals = fisher_exact_pvalues[np.triu_indices(fisher_exact_pvalues.shape[0], k = 1)]
    p_adj = p_adjust_bh(pvals)
    p_adj_mat = np.zeros((fisher_exact_pvalues.shape[0],fisher_exact_pvalues.shape[0]))
    p_adj_mat[np.triu_indices(p_adj_mat.shape[0], k = 1)] = p_adj
    p_adj_mat = p_adj_mat + p_adj_mat.T
    
    similarity = 1 - (fisher_exact_values - fisher_exact_values.min()) / (fisher_exact_values.max() - fisher_exact_values.min())
    Z = linkage(similarity, linkage_method)
    # Clusterize the data
    labels = fcluster(Z, fcluster_threshold)
    # Keep the indices to sort labels
    labels_order = np.argsort(labels)
    clustered_jaccard_df = pd.DataFrame(
        fisher_exact_values, index=signatures_names, columns=signatures_names).iloc[labels_order, labels_order]
    p_adj_df = pd.DataFrame(
        p_adj_mat, index = signatures_names, columns=signatures_names).iloc[labels_order, labels_order]
    if use_plotly:
        fig = px.imshow(clustered_jaccard_df, color_continuous_scale=cmap)
        fig.update_layout(
            height=plotly_height,  # Added parameter
            legend={'itemsizing': 'trace'},
            plot_bgcolor='rgba(0,0,0,0)',
            yaxis=dict(tickfont=dict(size=fontsize)),
            xaxis=dict(tickfont=dict(size=fontsize)),
        )
        if save is not None:
            fig.write_image(save)
        fig.show()
    else:
        fig, ax = plt.subplots(figsize=figsize)
        sns.heatmap(
            data=clustered_jaccard_df,
            cmap=cmap,
            square=False,
            ax=ax,
            robust=True,
            cbar_kws={"shrink": .50, 'label': 'Jaccard'},
            xticklabels=False,
            vmin=vmin,
            vmax=vmax)
        if save is not None:
            fig.tight_layout()
            fig.savefig(save)
        plt.show()
        plt.close(fig)
    if return_data:
        return clustered_jaccard_df,p_adj_df, Z