Source code for scenicplus.dimensionality_reduction

"""Dimmensionality reduction and clustering based on target genes and target regions AUC.

"""

import pandas as pd
import umap
import sklearn
import matplotlib.backends.backend_pdf
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import matplotlib.patches as mpatches
import matplotlib.patheffects as PathEffects
import matplotlib.pyplot as plt
import random
import numpy as np
from adjustText import adjust_text
import igraph as ig
import leidenalg as la
from sklearn.neighbors import kneighbors_graph
from typing import Dict, List, Tuple
from typing import Optional, Union
from sklearn.decomposition import PCA
import harmonypy as hm

from .scenicplus_class import SCENICPLUS

[docs]def harmony(scplus_obj: SCENICPLUS,
            variable: str,
            auc_key: str = 'eRegulon_AUC',
            out_key: str = 'eRegulon_AUC_harmony',
            signature_keys : List[str] = ['Gene_based', 'Region_based'],
            scale: Optional[bool] = True,
            random_state: Optional[int] = 555,
            **kwargs):
    """
    Apply harmony batch effect correction (Korsunsky et al, 2019) over eRegulon AUC values
    Parameters
    ---------
    scplus_obj: `class::SCENICPLUS`
        A SCENICPLUS object with eRegulon AUC values calculated.
    variable: str
        Variable in scplus.metadata_cell to correct by.
    auc_key: str, optional
        Key where AUC values are stored.
    out_key: str, optional
        Key where corrected eRegulon values will be output
    signature_keys: list, optional
        Whether to scale probability matrix prior to correction. Default: True
    scale: bool, optional
        Whether to scale the AUC values
    random_state: int, optional
        Random seed used to use with harmony. Default: 555
    References
    ---------
    Korsunsky, I., Millard, N., Fan, J., Slowikowski, K., Zhang, F., Wei, K., ... & Raychaudhuri, S. (2019). Fast, sensitive and accurate integration of
    single-cell data with Harmony. Nature methods, 16(12), 1289-1296.
    """

    for key in signature_keys:
        cell_data = scplus_obj.metadata_cell.copy()
        data = scplus_obj.uns[auc_key][key].copy().T
        if scale:
            data = pd.DataFrame(sklearn.preprocessing.StandardScaler().fit_transform(
                data), index=data.index.to_list(), columns=data.columns)
        data_np = data.transpose().to_numpy()
        ho = hm.run_harmony(
            data_np,
            cell_data,
            variable,
            random_state=random_state,
            **kwargs)
        data_harmony = pd.DataFrame(
            ho.Z_corr,
            index=data.index.to_list(),
            columns=data.columns).T
        
        if out_key not in scplus_obj.uns.keys():
            scplus_obj.uns[out_key]={}
        scplus_obj.uns[out_key][key] = data_harmony


[docs]def find_clusters(scplus_obj: SCENICPLUS,
                  auc_key: Optional[str] = 'eRegulon_AUC',
                  signature_keys: Optional[List[str]] = ['Gene_based', 'Region_based'],
                  k: Optional[int] = 10,
                  res: Optional[List[float]] = [0.6],
                  seed: Optional[int] = 555,
                  scale: Optional[bool] = True,
                  prefix: Optional[str] = '',
                  selected_regulons: Optional[List[int]] = None,
                  selected_cells: Optional[List[str]] = None,
                  **kwargs):
    """
    Performing leiden cell or region clustering and add results to SCENICPLUS object's metadata.

    Parameters
    ---------
    scplus_obj: `class::SCENICPLUS`
            A SCENICPLUS object with eRegulons AUC computed.
    auc_key: str, optional
            Key to extract AUC values from. Default: 'eRegulon_AUC'
    signature_keys: List, optional
            Keys to extract AUC values from. Default: ['Gene_based', 'Region_based']
    k: int, optional
            Number of neighbours in the k-neighbours graph. Default: 10
    res: float, optional
            Resolution parameter for the leiden algorithm step. Default: 0.6
    seed: int, optional
            Seed parameter for the leiden algorithm step. Default: 555
    scale: bool, optional
            Whether to scale the enrichment prior to the clustering. Default: False
    prefix: str, optional
            Prefix to add to the clustering name when adding it to the correspondent metadata attribute. Default: ''
    selected_regulons: list, optional
            A list with selected regulons to be used for clustering. Default: None (use all regulons)
    selected_cells: list, optional
            A list with selected features cells to cluster. Default: None (use all cells)
    """

    if scale:
        data_mat = pd.concat([pd.DataFrame(sklearn.preprocessing.StandardScaler().fit_transform(
            scplus_obj.uns[auc_key][x].T), index=scplus_obj.uns[auc_key][x].T.index.to_list(), columns=scplus_obj.uns[auc_key][x].T.columns) for x in signature_keys])
    else:
        data_mat = pd.concat([scplus_obj.uns[auc_key][x]
                             for x in signature_keys]).T
    data_names = data_mat.columns.tolist()

    if selected_regulons is not None:
        selected_regulons = [
            x for x in selected_regulons if x in data_mat.index]
        data_mat = data_mat.loc[selected_regulons]
    if selected_cells is not None:
        data_mat = data_mat[selected_cells]
        data_names = selected_cells

    data_mat = data_mat.T.fillna(0)

    A = kneighbors_graph(data_mat, k)
    sources, targets = A.nonzero()
    G = ig.Graph(directed=True)
    G.add_vertices(A.shape[0])
    edges = list(zip(sources, targets))
    G.add_edges(edges)
    for C in res:
        partition = la.find_partition(
            G,
            la.RBConfigurationVertexPartition,
            resolution_parameter=C,
            seed=seed)
        cluster = pd.DataFrame(
            partition.membership,
            index=data_names,
            columns=[
                prefix +
                'leiden_' +
                str(k) +
                '_' +
                str(C)]).astype(str)

        scplus_obj.add_cell_data(cluster)


[docs]def run_eRegulons_tsne(scplus_obj: SCENICPLUS,
                       scale: Optional[bool] = True,
                       auc_key: Optional[str] = 'eRegulon_AUC',
                       signature_keys: Optional[List[str]] = ['Gene_based', 'Region_based'],
                       reduction_name: Optional[str] = 'eRegulons_tSNE',
                       random_state: Optional[int] = 555,
                       selected_regulons: Optional[List[int]] = None,
                       selected_cells: Optional[List[str]] = None,
                       **kwargs):
    """
    Run TSNE and add it to the dimensionality reduction dictionary.

    Parameters
    ---------
    scplus_obj: `class::SCENICPLUS`
            A SCENICPLUS object with eRegulons AUC computed.
    scale: bool, optional
            Whether to scale the enrichments prior to the dimensionality reduction. Default: False
    auc_key: str, optional
            Key to extract AUC values from. Default: 'eRegulon_AUC'
    signature_keys: List, optional
            Keys to extract AUC values from. Default: ['Gene_based', 'Region_based']
    reduction_name: str, optional
            Key used to store dimensionality reduction in scplud_obj.dr_cell. Default: eRegulon_AUC.
    random_state: int, optional
            Seed parameter for running UMAP. Default: 555
    selected_regulons: list, optional
            A list with selected regulons to be used for clustering. Default: None (use all regulons)
    selected_cells: list, optional
            A list with selected features cells to cluster. Default: None (use all cells)
    **kwargs
            Parameters to pass to the tSNE functions.
    """

    if scale:
        data_mat = pd.concat([pd.DataFrame(sklearn.preprocessing.StandardScaler().fit_transform(
            scplus_obj.uns[auc_key][x].T), index=scplus_obj.uns[auc_key][x].T.index.to_list(), columns=scplus_obj.uns[auc_key][x].T.columns) for x in signature_keys])
    else:
        data_mat = pd.concat([scplus_obj.uns[auc_key][x]
                             for x in signature_keys]).T
    data_names = data_mat.columns.tolist()

    if selected_regulons is not None:
        selected_regulons = [
            x for x in selected_regulons if x in data_mat.index]
        data_mat = data_mat.loc[selected_regulons]
    if selected_cells is not None:
        data_mat = data_mat[selected_cells]
        data_names = selected_cells

    data_mat = data_mat.T.fillna(0)

    try:
        import fitsne
        embedding = fitsne.FItSNE(
            np.ascontiguousarray(
                data_mat.to_numpy()),
            rand_seed=random_state,
            perplexity=perplexity, **kwargs)
    except BaseException:
        embedding = sklearn.manifold.TSNE(
            n_components=2, random_state=random_state).fit_transform(
            data_mat.to_numpy(), **kwargs)
    dr = pd.DataFrame(
        embedding,
        index=data_names,
        columns=[
            'tSNE_1',
            'tSNE_2'])
    if not hasattr(scplus_obj, 'dr_cell'):
        scplus_obj.dr_cell = {}
    scplus_obj.dr_cell[reduction_name] = dr


[docs]def run_eRegulons_umap(scplus_obj: SCENICPLUS,
                       scale: Optional[bool] = True,
                       auc_key: Optional[str] = 'eRegulon_AUC',
                       signature_keys: Optional[List[str]] = ['Gene_based', 'Region_based'],
                       reduction_name: Optional[str] = 'eRegulons_UMAP',
                       random_state: Optional[int] = 555,
                       selected_regulons: Optional[List[int]] = None,
                       selected_cells: Optional[List[str]] = None,
                       **kwargs):
    """
    Run UMAP and add it to the dimensionality reduction dictionary.

    Parameters
    ---------
    scplus_obj: `class::SCENICPLUS`
            A SCENICPLUS object with eRegulons AUC computed.
    scale: bool, optional
            Whether to scale the cell-topic or topic-regions contributions prior to the dimensionality reduction. Default: False
    auc_key: str, optional
            Key to extract AUC values from. Default: 'eRegulon_AUC'
    signature_keys: List, optional
            Keys to extract AUC values from. Default: ['Gene_based', 'Region_based']
    reduction_name: str, optional
            Key used to store dimensionality reduction in scplud_obj.dr_cell. Default: eRegulon_AUC.
    random_state: int, optional
            Seed parameter for running UMAP. Default: 555
    selected_regulons: list, optional
            A list with selected regulons to be used for clustering. Default: None (use all regulons)
    selected_cells: list, optional
            A list with selected features cells to cluster. Default: None (use all cells)
    **kwargs
            Parameters to pass to umap.UMAP.
    """

    if scale:
        data_mat = pd.concat([pd.DataFrame(sklearn.preprocessing.StandardScaler().fit_transform(
            scplus_obj.uns[auc_key][x].T), index=scplus_obj.uns[auc_key][x].T.index.to_list(), columns=scplus_obj.uns[auc_key][x].T.columns) for x in signature_keys])
    else:
        data_mat = pd.concat([scplus_obj.uns[auc_key][x]
                             for x in signature_keys]).T
    data_names = data_mat.columns.tolist()

    if selected_regulons is not None:
        selected_regulons = [
            x for x in selected_regulons if x in data_mat.index]
        data_mat = data_mat.loc[selected_regulons]
    if selected_cells is not None:
        data_mat = data_mat[selected_cells]
        data_names = selected_cells

    if scale:
        data_mat = pd.DataFrame(sklearn.preprocessing.StandardScaler().fit_transform(
            data_mat), index=data_mat.index.to_list(), columns=data_mat.columns)

    data_mat = data_mat.T.fillna(0)

    reducer = umap.UMAP(random_state=random_state, **kwargs)
    embedding = reducer.fit_transform(data_mat)
    dr = pd.DataFrame(
        embedding,
        index=data_names,
        columns=[
            'UMAP_1',
            'UMAP_2'])
    if not hasattr(scplus_obj, 'dr_cell'):
        scplus_obj.dr_cell = {}
    scplus_obj.dr_cell[reduction_name] = dr


[docs]def run_eRegulons_pca(scplus_obj: SCENICPLUS,
                      scale: Optional[bool] = True,
                      auc_key: Optional[str] = 'eRegulon_AUC',
                      signature_keys: Optional[List[str]] = ['Gene_based', 'Region_based'],
                      reduction_name: Optional[str] = 'eRegulons_PCA',
                      random_state: Optional[int] = 555,
                      selected_regulons: Optional[List[int]] = None,
                      selected_cells: Optional[List[str]] = None,
                      n_pcs: Optional[int] = 50,
                      **kwargs):
    """
    Run UMAP and add it to the dimensionality reduction dictionary.

    Parameters
    ---------
    scplus_obj: `class::SCENICPLUS`
            A SCENICPLUS object with eRegulons AUC computed.
    scale: bool, optional
            Whether to scale the cell-topic or topic-regions contributions prior to the dimensionality reduction. Default: False
    auc_key: str, optional
            Key to extract AUC values from. Default: 'eRegulon_AUC'
    signature_keys: List, optional
            Keys to extract AUC values from. Default: ['Gene_based', 'Region_based']
    reduction_name: str, optional
            Key used to store dimensionality reduction in scplud_obj.dr_cell. Default: eRegulon_AUC.
    random_state: int, optional
            Seed parameter for running UMAP. Default: 555
    selected_regulons: list, optional
            A list with selected regulons to be used for clustering. Default: None (use all regulons)
    selected_cells: list, optional
            A list with selected features cells to cluster. Default: None (use all cells)
    n_pcs: int, optional
            Number of principle components to calculate. Default: 50
    **kwargs
            Parameters to pass to umap.UMAP.
    """

    if scale:
        data_mat = pd.concat([pd.DataFrame(sklearn.preprocessing.StandardScaler().fit_transform(
            scplus_obj.uns[auc_key][x].T), index=scplus_obj.uns[auc_key][x].T.index.to_list(), columns=scplus_obj.uns[auc_key][x].T.columns) for x in signature_keys])
    else:
        data_mat = pd.concat([scplus_obj.uns[auc_key][x]
                             for x in signature_keys]).T
    data_names = data_mat.columns.tolist()

    if selected_regulons is not None:
        selected_regulons = [
            x for x in selected_regulons if x in data_mat.index]
        data_mat = data_mat.loc[selected_regulons]
    if selected_cells is not None:
        data_mat = data_mat[selected_cells]
        data_names = selected_cells

    if scale:
        data_mat = pd.DataFrame(sklearn.preprocessing.StandardScaler().fit_transform(
            data_mat), index=data_mat.index.to_list(), columns=data_mat.columns)

    data_mat = data_mat.T.fillna(0)

    reducer = PCA(n_components=n_pcs, random_state=random_state)
    embedding = reducer.fit_transform(data_mat)

    dr = pd.DataFrame(
        embedding,
        index=data_names,
        columns=[f'PC_{i}' for i in range(n_pcs)])[['PC_0', 'PC_1']]
    if not hasattr(scplus_obj, 'dr_cell'):
        scplus_obj.dr_cell = {}
    scplus_obj.dr_cell[reduction_name] = dr

[docs]def plot_metadata_given_ax(scplus_obj,
                  ax: matplotlib.axes,
                  reduction_name: str,
                  variable: str,
                  remove_nan: Optional[bool] = True,
                  show_label: Optional[bool] = True,
                  show_legend: Optional[bool] = False,
                  cmap: Optional[Union[str, 'matplotlib.cm']] = cm.viridis,
                  dot_size: Optional[int] = 10,
                  text_size: Optional[int] = 10,
                  alpha: Optional[Union[float, int]] = 1,
                  seed: Optional[int] = 555,
                  color_dictionary: Optional[Dict[str, str]] = {},
                  selected_cells: Optional[List[str]] = None):
    """
    Plot categorical and continuous metadata into dimensionality reduction.

    Parameters
    ---------
    scplus_obj: `class::SCENICPLUS`
            A SCENICPLUS object with dimensionality reductions.
    ax: matplotlib.axes
            Axes to which to plot metadata.
    reduction_name: str
            Name of the dimensionality reduction to use
    variable: Str
            Variable to plot. It should be included in `class::SCENICPLUS.metadata_cell`.
    remove_nan: bool, optional
            Whether to remove data points for which the variable value is 'nan'. Default: True
    show_label: bool, optional
            For categorical variables, whether to show the label in the plot. Default: True
    show_legend: bool, optional
            For categorical variables, whether to show the legend next to the plot. Default: False
    cmap: str or 'matplotlib.cm', optional
            For continuous variables, color map to use for the legend color bar. Default: cm.viridis
    dot_size: int, optional
            Dot size in the plot. Default: 10
    text_size: int, optional
            For categorical variables and if show_label is True, size of the labels in the plot. Default: 10
    alpha: float, optional
            Transparency value for the dots in the plot. Default: 1
    seed: int, optional
            Random seed used to select random colors. Default: 555
    color_dictionary: dict, optional
            A dictionary containing an entry per variable, whose values are dictionaries with variable levels as keys and corresponding colors as values.
            Default: None
    figsize: tuple, optional
            Size of the figure. If num_columns is 1, this is the size for each figure; if num_columns is above 1, this is the overall size of the figure (if keeping
            default, it will be the size of each subplot in the figure). Default: (6.4, 4.8)
    num_columns: int, optional
            For multiplot figures, indicates the number of columns (the number of rows will be automatically determined based on the number of plots). Default: 1
    selected_cells: list, optional
            A list with selected cells to plot.
    save: str, optional
            Path to save plot. Default: None.
    """
    embedding = scplus_obj.dr_cell[reduction_name]
    data_mat = scplus_obj.metadata_cell.copy()

    if selected_cells is not None:
        data_mat = data_mat.loc[selected_cells]
        embedding = embedding.loc[selected_cells]

    data_mat = data_mat.loc[embedding.index.to_list()]

    var_data = data_mat.copy().loc[:, variable].dropna().to_list()
    if isinstance(var_data[0], str):
        if (remove_nan) & (data_mat[variable].isnull().sum() > 0):
            var_data = data_mat.copy().loc[:, variable].dropna().to_list()
            emb_nan = embedding.loc[data_mat.copy(
            ).loc[:, var].dropna().index.tolist()]
            label_pd = pd.concat(
                [emb_nan, data_mat.loc[:, [variable]].dropna()], axis=1, sort=False)
        else:
            var_data = data_mat.copy().astype(
                str).fillna('NA').loc[:, variable].to_list()
            label_pd = pd.concat([embedding, data_mat.astype(
                str).fillna('NA').loc[:, [variable]]], axis=1, sort=False)

        categories = set(var_data)
        try:
            color_dict = color_dictionary[variable]
        except BaseException:
            random.seed(seed)
            color = list(map(
                lambda i: "#" +
                "%06x" % random.randint(
                    0, 0xFFFFFF), range(len(categories))
            ))
            color_dict = dict(zip(categories, color))

        if (remove_nan) & (data_mat[variable].isnull().sum() > 0):
            ax.scatter(emb_nan.iloc[:, 0], emb_nan.iloc[:, 1], c=data_mat.loc[:, variable].dropna(
            ).apply(lambda x: color_dict[x]), s=dot_size, alpha=alpha)
            ax.set_xlabel(emb_nan.columns[0])
            ax.set_ylabel(emb_nan.columns[1])
        else:
            ax.scatter(embedding.iloc[:, 0], embedding.iloc[:, 1], c=data_mat.astype(str).fillna(
                'NA').loc[:, variable].apply(lambda x: color_dict[x]), s=dot_size, alpha=alpha)
            ax.set_xlabel(embedding.columns[0])
            ax.set_ylabel(embedding.columns[1])

        if show_label:
            label_pos = label_pd.groupby(variable).agg(
                {label_pd.columns[0]: np.mean, label_pd.columns[1]: np.mean})
            texts = []
            for label in label_pos.index.tolist():
                texts.append(
                    ax.text(
                        label_pos.loc[label][0],
                        label_pos.loc[label][1],
                        label,
                        horizontalalignment='center',
                        verticalalignment='center',
                        size=text_size,
                        weight='bold',
                        color=color_dict[label],
                        path_effects=[
                            PathEffects.withStroke(
                                linewidth=3,
                                foreground='w')]))
            adjust_text(texts)

        ax.set_title(variable)
        patchList = []
        for key in color_dict:
            data_key = mpatches.Patch(color=color_dict[key], label=key)
            patchList.append(data_key)
        if show_legend:
            ax.legend(
                handles=patchList, bbox_to_anchor=(
                    1.04, 1), loc="upper left")
        return ax
    else:
        var_data = data_mat.copy().loc[:, variable].to_list()
        o = np.argsort(var_data)
        ax.scatter(embedding.iloc[o, 0], embedding.iloc[o, 1], c=subset_list(
            var_data, o), cmap=cmap, s=dot_size, alpha=alpha)
        ax.set_xlabel(embedding.columns[0])
        ax.set_ylabel(embedding.columns[1])
        ax.set_title(variable)
        # setup the colorbar
        normalize = mcolors.Normalize(
            vmin=np.array(var_data).min(),
            vmax=np.array(var_data).max())
        scalarmappaple = cm.ScalarMappable(norm=normalize, cmap=cmap)
        scalarmappaple.set_array(var_data)
        plt.colorbar(scalarmappaple, ax = ax)
        return ax

[docs]def plot_metadata(scplus_obj: SCENICPLUS,
                  reduction_name: str,
                  variables: List[str],
                  remove_nan: Optional[bool] = True,
                  show_label: Optional[bool] = True,
                  show_legend: Optional[bool] = False,
                  cmap: Optional[Union[str, 'matplotlib.cm']] = cm.viridis,
                  dot_size: Optional[int] = 10,
                  text_size: Optional[int] = 10,
                  alpha: Optional[Union[float, int]] = 1,
                  seed: Optional[int] = 555,
                  color_dictionary: Optional[Dict[str, str]] = {},
                  figsize: Optional[Tuple[float, float]] = (6.4, 4.8),
                  num_columns: Optional[int] = 1,
                  selected_cells: Optional[List[str]] = None,
                  save: Optional[str] = None):
    """
    Plot categorical and continuous metadata into dimensionality reduction.

    Parameters
    ---------
    scplus_obj: `class::SCENICPLUS`
            A SCENICPLUS object with dimensionality reductions.
    reduction_name: str
            Name of the dimensionality reduction to use
    variables: list
            List of variables to plot. They should be included in `class::SCENICPLUS.metadata_cell`.
    remove_nan: bool, optional
            Whether to remove data points for which the variable value is 'nan'. Default: True
    show_label: bool, optional
            For categorical variables, whether to show the label in the plot. Default: True
    show_legend: bool, optional
            For categorical variables, whether to show the legend next to the plot. Default: False
    cmap: str or 'matplotlib.cm', optional
            For continuous variables, color map to use for the legend color bar. Default: cm.viridis
    dot_size: int, optional
            Dot size in the plot. Default: 10
    text_size: int, optional
            For categorical variables and if show_label is True, size of the labels in the plot. Default: 10
    alpha: float, optional
            Transparency value for the dots in the plot. Default: 1
    seed: int, optional
            Random seed used to select random colors. Default: 555
    color_dictionary: dict, optional
            A dictionary containing an entry per variable, whose values are dictionaries with variable levels as keys and corresponding colors as values.
            Default: None
    figsize: tuple, optional
            Size of the figure. If num_columns is 1, this is the size for each figure; if num_columns is above 1, this is the overall size of the figure (if keeping
            default, it will be the size of each subplot in the figure). Default: (6.4, 4.8)
    num_columns: int, optional
            For multiplot figures, indicates the number of columns (the number of rows will be automatically determined based on the number of plots). Default: 1
    selected_cells: list, optional
            A list with selected cells to plot.
    save: str, optional
            Path to save plot. Default: None.
    """

    embedding = scplus_obj.dr_cell[reduction_name]
    data_mat = scplus_obj.metadata_cell.copy()

    if selected_cells is not None:
        data_mat = data_mat.loc[selected_cells]
        embedding = embedding.loc[selected_cells]

    data_mat = data_mat.loc[embedding.index.to_list()]

    pdf = None
    if (save is not None) & (num_columns == 1):
        pdf = matplotlib.backends.backend_pdf.PdfPages(save)

    if num_columns > 1:
        num_rows = int(np.ceil(len(variables) / num_columns))
        if figsize == (6.4, 4.8):
            figsize = (6.4 * num_columns, 4.8 * num_rows)
        i = 1

    fig = plt.figure(figsize=figsize)

    for var in variables:
        var_data = data_mat.copy().loc[:, var].dropna().to_list()
        if isinstance(var_data[0], str):
            if (remove_nan) & (data_mat[var].isnull().sum() > 0):
                var_data = data_mat.copy().loc[:, var].dropna().to_list()
                emb_nan = embedding.loc[data_mat.copy(
                ).loc[:, var].dropna().index.tolist()]
                label_pd = pd.concat(
                    [emb_nan, data_mat.loc[:, [var]].dropna()], axis=1, sort=False)
            else:
                var_data = data_mat.copy().astype(
                    str).fillna('NA').loc[:, var].to_list()
                label_pd = pd.concat([embedding, data_mat.astype(
                    str).fillna('NA').loc[:, [var]]], axis=1, sort=False)

            categories = set(var_data)
            try:
                color_dict = color_dictionary[var]
            except BaseException:
                random.seed(seed)
                color = list(map(
                    lambda i: "#" +
                    "%06x" % random.randint(
                        0, 0xFFFFFF), range(len(categories))
                ))
                color_dict = dict(zip(categories, color))

            if num_columns > 1:
                plt.subplot(num_rows, num_columns, i)
                i = i + 1

            if (remove_nan) & (data_mat[var].isnull().sum() > 0):
                plt.scatter(emb_nan.iloc[:, 0], emb_nan.iloc[:, 1], c=data_mat.loc[:, var].dropna(
                ).apply(lambda x: color_dict[x]), s=dot_size, alpha=alpha)
                plt.xlabel(emb_nan.columns[0])
                plt.ylabel(emb_nan.columns[1])
            else:
                plt.scatter(embedding.iloc[:, 0], embedding.iloc[:, 1], c=data_mat.astype(str).fillna(
                    'NA').loc[:, var].apply(lambda x: color_dict[x]), s=dot_size, alpha=alpha)
                plt.xlabel(embedding.columns[0])
                plt.ylabel(embedding.columns[1])

            if show_label:
                label_pos = label_pd.groupby(var).agg(
                    {label_pd.columns[0]: np.mean, label_pd.columns[1]: np.mean})
                texts = []
                for label in label_pos.index.tolist():
                    texts.append(
                        plt.text(
                            label_pos.loc[label][0],
                            label_pos.loc[label][1],
                            label,
                            horizontalalignment='center',
                            verticalalignment='center',
                            size=text_size,
                            weight='bold',
                            color=color_dict[label],
                            path_effects=[
                                PathEffects.withStroke(
                                    linewidth=3,
                                    foreground='w')]))
                adjust_text(texts)

            plt.title(var)
            patchList = []
            for key in color_dict:
                data_key = mpatches.Patch(color=color_dict[key], label=key)
                patchList.append(data_key)
            if show_legend:
                plt.legend(
                    handles=patchList, bbox_to_anchor=(
                        1.04, 1), loc="upper left")

            if num_columns == 1:
                if save is not None:
                    pdf.savefig(fig, bbox_inches='tight')
                plt.show()
        else:
            var_data = data_mat.copy().loc[:, var].to_list()
            o = np.argsort(var_data)
            if num_columns > 1:
                plt.subplot(num_rows, num_columns, i)
                i = i + 1
            plt.scatter(embedding.iloc[o, 0], embedding.iloc[o, 1], c=subset_list(
                var_data, o), cmap=cmap, s=dot_size, alpha=alpha)
            plt.xlabel(embedding.columns[0])
            plt.ylabel(embedding.columns[1])
            plt.title(var)
            # setup the colorbar
            normalize = mcolors.Normalize(
                vmin=np.array(var_data).min(),
                vmax=np.array(var_data).max())
            scalarmappaple = cm.ScalarMappable(norm=normalize, cmap=cmap)
            scalarmappaple.set_array(var_data)
            plt.colorbar(scalarmappaple)
            if num_columns == 1:
                if save is not None:
                    pdf.savefig(fig, bbox_inches='tight')
                plt.show()

    if num_columns > 1:
        plt.tight_layout()
        if save is not None:
            fig.savefig(save, bbox_inches='tight')
        plt.show()
    if (save is not None) & (num_columns == 1):
        pdf = pdf.close()


def subset_list(target_list, index_list):
    X = list(map(target_list.__getitem__, index_list))
    return X

[docs]def plot_AUC_given_ax(
    scplus_obj: SCENICPLUS,
    reduction_name: str,
    feature: str,
    ax: matplotlib.axes,
    auc_key: str = 'eRegulon_AUC',
    signature_key: str = 'Gene_based',
    cmap: matplotlib.cm = cm.viridis,
    dot_size: int = 10,
    alpha: float = 1,
    scale: bool = False,
    selected_cells: List = None):
    """
    Plot eRegulon AUC values on dimmensionality reduction

    Parameters
    ----------
    scplus_obj: `class::SCENICPLUS`
        A SCENICPLUS object with dimensionality reductions.
    reduction_name: str
        Name of the dimensionality reduction to use.
    feature: str
        eRegulon to plot, should be included in scplus_obj.uns[auc_key][signature_key] matrix.
    ax: matplotlib.axes
        matplotlib axes to which to plot.
    auc_key: str, optional
        key in scplus_obj.uns under which the AUC values are stored
    signature_key: str, optional
        key in scplus_obj.uns[auc_key] to plot (usually Gene_based or Region_based)
    cmap: matplotlib.cm, optional
        color map to use for plotting.
    dot_size: int, optional
        Dot size in the plot. Default: 10
    alpha: float, optional
        Transparency value for the dots in the plot. Default: 1
    scale: bool, optional
        Wether to scale AUC values before plotting
    selected_cells: List, optional
        A list with selected cells to plot.
    """

    embedding = scplus_obj.dr_cell[reduction_name].copy()

    if scale:
        data_mat = pd.DataFrame(
                    sklearn.preprocessing.StandardScaler().fit_transform(
                        scplus_obj.uns[auc_key][signature_key].T), 
                        index=scplus_obj.uns[auc_key][signature_key].T.index.to_list(), 
                        columns=scplus_obj.uns[auc_key][signature_key].T.columns).T
    else:
        data_mat = scplus_obj.uns[auc_key][signature_key]

    if selected_cells is not None:
        data_mat = data_mat[selected_cells]

    data_mat = data_mat.fillna(0)
    feature_data = data_mat[feature].squeeze()
    feature_data = feature_data.sort_values()
    embedding_plot = embedding.loc[feature_data.index.tolist()]
    o = np.argsort(feature_data)
    if not scale:
        ax.scatter(
            embedding_plot.iloc[o, 0], 
            embedding_plot.iloc[o, 1], 
            c=subset_list(feature_data, o), 
            cmap=cmap, 
            s=dot_size, 
            alpha=alpha,
            vmin=0, 
            vmax=max(feature_data))
        normalize = mcolors.Normalize(
            vmin=0, vmax=np.array(feature_data).max())
    else:
        ax.scatter(
            embedding_plot.iloc[o, 0], 
            embedding_plot.iloc[o, 1], 
            c=subset_list(feature_data, o), cmap=cmap, s=dot_size, alpha=alpha)
        normalize = mcolors.Normalize(
            vmin=np.array(feature_data).min(),
            vmax=np.array(feature_data).max())
    ax.set_xlabel(embedding_plot.columns[0])
    ax.set_ylabel(embedding_plot.columns[1])
    ax.set_title(feature)
    # setup the colorbar
    scalarmappaple = cm.ScalarMappable(norm=normalize, cmap=cmap)
    scalarmappaple.set_array(feature_data)
    #plt.colorbar(scalarmappaple)
    return ax


[docs]def plot_eRegulon(scplus_obj: SCENICPLUS,
                  reduction_name: str,
                  auc_key: Optional[str] = 'eRegulon_AUC',
                  signature_keys: Optional[List[str]] = ['Gene_based', 'Region_based'],
                  normalize_tf_expression: Optional[bool] = True,
                  cmap: Optional[Union[str, 'matplotlib.cm']] = cm.viridis,
                  dot_size: Optional[int] = 10,
                  alpha: Optional[Union[float, int]] = 1,
                  scale: Optional[bool] = False,
                  selected_regulons: Optional[List[int]] = None,
                  selected_cells: Optional[List[str]] = None,
                  figsize: Optional[Tuple[float, float]] = (6.4, 4.8),
                  num_columns: Optional[int] = 3,
                  save: Optional[str] = None):
    """
    Plot TF expression and eRegulon AUC (gene and region based) into dimensionality reduction.

    Parameters
    ---------
    scplus_obj: `class::SCENICPLUS`
            A cisTopic object with dimensionality reductions in `class::CistopicObject.projections`.
    reduction_name: str
            Name of the dimensionality reduction to use
    auc_key: str, optional
            Key to extract AUC values from. Default: 'eRegulon_AUC'
    signature_keys: List, optional
            Keys to extract AUC values from. Default: ['Gene_based', 'Region_based'] 
    normalize_tf_expression: bool, optional
            Whether logCPM normalize TF expression. Default: True    
    cmap: str or 'matplotlib.cm', optional
            For continuous variables, color map to use for the legend color bar. Default: cm.viridis
    dot_size: int, optional
            Dot size in the plot. Default: 10
    alpha: float, optional
            Transparency value for the dots in the plot. Default: 1
    scale: bool, optional
            Whether to scale the cell-topic or topic-regions contributions prior to plotting. Default: False
    selected_regulons: list, optional
            A list with selected regulons to be used for clustering. Default: None (use all regulons)
    selected_cells: list, optional
            A list with selected features cells to cluster. Default: None (use all cells)
    figsize: tuple, optional
            Size of the figure. If num_columns is 1, this is the size for each figure; if num_columns is above 1, this is the overall size of the figure (if keeping
            default, it will be the size of each subplot in the figure). Default: (6.4, 4.8)
    num_columns: int, optional
            For multiplot figures, indicates the number of columns (the number of rows will be automatically determined based on the number of plots). Default: 1
    save: str, optional
            Path to save plot. Default: None.
    """

    embedding = scplus_obj.dr_cell[reduction_name].copy()

    if scale:
        data_mat = pd.concat([pd.DataFrame(sklearn.preprocessing.StandardScaler().fit_transform(
            scplus_obj.uns[auc_key][x].T), index=scplus_obj.uns[auc_key][x].T.index.to_list(), columns=scplus_obj.uns[auc_key][x].T.columns) for x in signature_keys])
    else:
        data_mat = pd.concat([scplus_obj.uns[auc_key][x]
                             for x in signature_keys]).T
    data_names = data_mat.columns.tolist()

    if selected_cells is not None:
        data_mat = data_mat[selected_cells]
        data_names = selected_cells

    if selected_regulons is None:
        selected_regulons = [x.rsplit('_', 1)[0] for x in data_mat.index]

    if (save is not None) & (num_columns == 1):
        pdf = matplotlib.backends.backend_pdf.PdfPages(save)

    if num_columns > 1:
        num_rows = int(np.ceil(len(selected_regulons)))
        if figsize == (6.4, 4.8):
            figsize = (6.4 * num_columns, 4.8 * num_rows)
        i = 1

    fig = plt.figure(figsize=figsize)

    dgem = pd.DataFrame(scplus_obj.X_EXP, index=scplus_obj.cell_names,
                        columns=scplus_obj.gene_names).copy()

    if normalize_tf_expression:
        dgem = dgem.T / dgem.T.sum(0) * 10**6
        dgem = np.log1p(dgem).T
    dgem = dgem.T
    data_mat = data_mat.T.fillna(0)
    for regulon in selected_regulons:
        tf_name = regulon.split('_')[0]
        regulon_names = data_mat.columns[data_mat.columns.str.contains(
            regulon + '_(', regex=False)]
        gene_based_name = regulon_names[regulon_names.str.contains(
            'g)', regex=False)]
        region_based_name = regulon_names[regulon_names.str.contains(
            'r)', regex=False)]
        if tf_name in dgem.index:
            # TF expression
            tf_expr = dgem.loc[tf_name]
            tf_expr = tf_expr.sort_values()
            embedding_plot = embedding.loc[tf_expr.index.tolist()]
            o = np.argsort(tf_expr)
            if num_columns > 1:
                plt.subplot(num_rows, num_columns, i)
                i = i + 1
            if not scale:
                plt.scatter(embedding_plot.iloc[o, 0], embedding_plot.iloc[o, 1], c=subset_list(
                    tf_expr, o), cmap=cmap, s=dot_size, alpha=alpha, vmin=0, vmax=max(tf_expr))
                normalize = mcolors.Normalize(
                    vmin=0, vmax=np.array(tf_expr).max())
            else:
                plt.scatter(embedding_plot.iloc[o, 0], embedding_plot.iloc[o, 1], c=subset_list(
                    tf_expr, o), cmap=cmap, s=dot_size, alpha=alpha)
                normalize = mcolors.Normalize(
                    vmin=np.array(tf_expr).min(),
                    vmax=np.array(tf_expr).max())
            plt.xlabel(embedding_plot.columns[0])
            plt.ylabel(embedding_plot.columns[1])
            plt.title(tf_name+'_expression')
            # setup the colorbar
            scalarmappaple = cm.ScalarMappable(norm=normalize, cmap=cmap)
            scalarmappaple.set_array(tf_expr)
            plt.colorbar(scalarmappaple)
            if num_columns == 1:
                if save is not None:
                    pdf.savefig(fig, bbox_inches='tight')
                plt.show()
            # Gene data
            if num_columns > 1:
                plt.subplot(num_rows, num_columns, i)
                i = i + 1
            if 'Gene_based' in signature_keys:
                gene_data = data_mat[gene_based_name].squeeze()
                gene_data = gene_data.sort_values()
                embedding_plot = embedding.loc[gene_data.index.tolist()]
                o = np.argsort(gene_data)
                if not scale:
                    plt.scatter(embedding_plot.iloc[o, 0], embedding_plot.iloc[o, 1], c=subset_list(
                        gene_data, o), cmap=cmap, s=dot_size, alpha=alpha, vmin=0, vmax=max(gene_data))
                    normalize = mcolors.Normalize(
                        vmin=0, vmax=np.array(gene_data).max())
                else:
                    plt.scatter(embedding_plot.iloc[o, 0], embedding_plot.iloc[o, 1], c=subset_list(
                        gene_data, o), cmap=cmap, s=dot_size, alpha=alpha)
                    normalize = mcolors.Normalize(
                        vmin=np.array(gene_data).min(),
                        vmax=np.array(gene_data).max())
                plt.xlabel(embedding_plot.columns[0])
                plt.ylabel(embedding_plot.columns[1])
                plt.title(gene_based_name[0])
                # setup the colorbar
                scalarmappaple = cm.ScalarMappable(norm=normalize, cmap=cmap)
                scalarmappaple.set_array(gene_data)
                plt.colorbar(scalarmappaple)
                if num_columns == 1:
                    if save is not None:
                        pdf.savefig(fig, bbox_inches='tight')
                    plt.show()
            else:
                plt.figure()
            # Region data
            if num_columns > 1:
                plt.subplot(num_rows, num_columns, i)
                i = i + 1
            if 'Region_based' in signature_keys:
                region_data = data_mat[region_based_name].squeeze()
                region_data = region_data.sort_values()
                embedding_plot = embedding.loc[region_data.index.tolist()]
                o = np.argsort(region_data)
                if not scale:
                    plt.scatter(embedding_plot.iloc[o, 0], embedding_plot.iloc[o, 1], c=subset_list(
                        region_data, o), cmap=cmap, s=dot_size, alpha=alpha, vmin=0, vmax=max(region_data))
                    normalize = mcolors.Normalize(
                        vmin=0, vmax=np.array(region_data).max())
                else:
                    plt.scatter(embedding_plot.iloc[o, 0], embedding_plot.iloc[o, 1], c=subset_list(
                        region_data, o), cmap=cmap, s=dot_size, alpha=alpha)
                    normalize = mcolors.Normalize(
                        vmin=np.array(region_data).min(),
                        vmax=np.array(region_data).max())
                plt.xlabel(embedding_plot.columns[0])
                plt.ylabel(embedding_plot.columns[1])
                plt.title(region_based_name[0])
                # setup the colorbar
                scalarmappaple = cm.ScalarMappable(norm=normalize, cmap=cmap)
                scalarmappaple.set_array(region_data)
                plt.colorbar(scalarmappaple)
                if num_columns == 1:
                    if save is not None:
                        pdf.savefig(fig, bbox_inches='tight')
                    plt.show()
            else:
                plt.figure()

    if num_columns > 1:
        plt.tight_layout()
        if save is not None:
            fig.savefig(save, bbox_inches='tight')
        plt.show()

    if (save is not None) & (num_columns == 1):
        pdf.close()