Source code for scenicplus.loom

"""export SCENIC+ object to target genes and target regions loom file.

These files can be visualized in the SCope single cell viewer.

"""

import json
import logging
import numpy as np
import pandas as pd
import scipy.sparse as sparse
import sys
from loomxpy.loomxpy import SCopeLoom
from loomxpy.utils import compress_encode
from ctxcore.genesig import Regulon
from typing import Dict, List, Mapping, Optional, Sequence
from multiprocessing import cpu_count
from collections import OrderedDict
import os
from sklearn.feature_extraction.text import CountVectorizer
from itertools import repeat, chain, islice
import loompy as lp
import re

from .scenicplus_class import SCENICPLUS


[docs] def export_to_loom(scplus_obj: SCENICPLUS, signature_key: str, out_fname: str, eRegulon_metadata_key: Optional[str] = 'eRegulon_metadata', auc_key: Optional[str] = 'eRegulon_AUC', auc_thr_key: Optional[str] = 'eRegulon_AUC_thresholds', keep_direct_and_extended_if_not_direct: Optional[bool] = False, selected_features: Optional[List[str]] = None, selected_cells: Optional[List[str]] = None, cluster_annotation: List[str] = None, tree_structure: Sequence[str] = (), title: str = None, nomenclature: str = "Unknown"): """ Create SCope [Davie et al, 2018] compatible loom files Parameters --------- scplus_obj: class::SCENICPLUS A SCENIC+ object with eRegulons, AUCs, and AUC thresholds computed signature_key: str Whether a 'Gene_based' or a 'Region_based' file should be produced. Possible values: 'Gene_based' or 'Region_based' out_fname: str Path to output file. eRegulon_metadata_key: str, optional Slot where the eRegulon metadata is stored. auc_key: str, optional Slot where the eRegulon AUC are stored auc_thr_key: str, optional Slot where AUC thresholds are stored keep_direct_and_extended_if_not_direct: bool, optional Keep only direct eregulons and add extended ones only if there is not a direct one. selected_features: List, optional A list with selected genes/region to use. selected_cells: List, optional A list with selected cells to use. cluster_annotations: List, optional A list with variables to use as cluster annotations. By default, those included in the DEGs/DARs dictionary will be used (plus those specificed here) tree_structure: sequence, optional A sequence of strings that defines the category tree structure. Needs to be a sequence of strings with three elements. Default: () title: str, optional The title for this loom file. If None than the basename of the filename is used as the title. Default: None nomenclature: str, optional The name of the genome. Default: 'Unknown' References ----------- Davie, K., Janssens, J., Koldere, D., De Waegeneer, M., Pech, U., Kreft, Ł., ... & Aerts, S. (2018). A single-cell transcriptome atlas of the aging Drosophila brain. Cell, 174(4), 982-998. Van de Sande, B., Flerin, C., Davie, K., De Waegeneer, M., Hulselmans, G., Aibar, S., ... & Aerts, S. (2020). A scalable SCENIC workflow for single-cell gene regulatory network analysis. Nature Protocols, 15(7), 2247-2276. """ # Create logger level = logging.INFO log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s' handlers = [logging.StreamHandler(stream=sys.stdout)] logging.basicConfig(level=level, format=log_format, handlers=handlers) log = logging.getLogger('SCENIC+') log.info('Formatting data') # Check-up if auc_key not in scplus_obj.uns.keys(): log.error('Compute ' + auc_key + 'first!') if auc_thr_key not in scplus_obj.uns.keys(): log.error('Compute ' + auc_thr_key + 'first!') if eRegulon_metadata_key not in scplus_obj.uns.keys(): log.error('Compute ' + eRegulon_metadata_key + 'first!') # Set keys and subset if needed if signature_key == 'Gene_based': markers_key = 'DEGs' if selected_features is not None or selected_cells is not None: scplus_obj = scplus_obj.subset( scplus_obj, cells=selected_cells, genes=selected_features, return_copy=True) ex_mtx = scplus_obj.X_EXP.T feature_names = scplus_obj.gene_names cell_names = scplus_obj.cell_names else: markers_key = 'DARs' if selected_features is not None or selected_cells is not None: scplus_obj = scplus_obj.subset( scplus_obj, cells=selected_cells, regions=selected_features, return_copy=True) ex_mtx = scplus_obj.X_ACC feature_names = scplus_obj.region_names cell_names = scplus_obj.cell_names if markers_key in scplus_obj.uns.keys(): cluster_markers = scplus_obj.uns[markers_key] if cluster_annotation is None: cluster_annotation = list(cluster_markers.keys()) else: cluster_annotation = list( set(sum([cluster_annotation, list(cluster_markers.keys())], []))) else: cluster_markers = None # Extract cell data cell_data = scplus_obj.metadata_cell # Check ups if cluster_annotation is not None: for annotation in cluster_annotation: if annotation not in cell_data: log.error( 'The cluster annotation', annotation, ' is not included in scplus_obj.metadata_cell') # eGRN AUC values auc_mtx = scplus_obj.uns[auc_key][signature_key].loc[cell_names] auc_mtx.columns = [re.sub('_\(.*\)', '', x) for x in auc_mtx.columns] auc_thresholds = scplus_obj.uns[auc_thr_key][signature_key] auc_thresholds.index = [re.sub('_\(.*\)', '', x) for x in auc_thresholds.index] if auc_mtx.shape[1] > 900 and keep_direct_and_extended_if_not_direct is False: log.info('The number of regulons is more than > 900. keep_direct_and_extended_if_not_direct is set to True') keep_direct_and_extended_if_not_direct = True if keep_direct_and_extended_if_not_direct is True: direct_eRegulons = [x for x in auc_mtx.columns if not 'extended' in x] extended_eRegulons = [x for x in auc_mtx.columns if 'extended' in x and not x.replace('_extended', '') in direct_eRegulons] selected_eRegulons = direct_eRegulons + extended_eRegulons auc_mtx = auc_mtx[selected_eRegulons] auc_thresholds = auc_thresholds.loc[selected_eRegulons] # Add TF expression in Region_based if signature_key == 'Region_based': tf_names = list(set([x.split('_')[0] for x in auc_mtx.columns])) tf_mat = scplus_obj.to_df('EXP')[tf_names].T ex_mtx = sparse.vstack( [ex_mtx, sparse.csr_matrix(tf_mat.values)], format='csr') feature_names = feature_names.tolist() + tf_names # Format regulons if signature_key == 'Gene_based': regulons = {re.sub('_\(.*\)', '', x): ' '.join(list(set(scplus_obj.uns[eRegulon_metadata_key][scplus_obj.uns[eRegulon_metadata_key].Gene_signature_name == x]['Gene']))) for x in list( set(scplus_obj.uns[eRegulon_metadata_key].Gene_signature_name))} cv = CountVectorizer(lowercase=False) else: regulons = {re.sub('_\(.*\)', '', x): ' '.join(list(set(scplus_obj.uns[eRegulon_metadata_key][scplus_obj.uns[eRegulon_metadata_key].Region_signature_name == x]['Region']))) for x in list( set(scplus_obj.uns[eRegulon_metadata_key].Region_signature_name))} cv = CountVectorizer( lowercase=False, token_pattern=r'(?u)\b\w\w+\b:\b\w\w+\b-\b\w\w+\b') regulon_mat = cv.fit_transform(regulons.values()) regulon_mat = pd.DataFrame(regulon_mat.todense( ), columns=cv.get_feature_names_out(), index=regulons.keys()) regulon_mat = regulon_mat.reindex(columns=feature_names, fill_value=0).T if keep_direct_and_extended_if_not_direct is True: regulon_mat = regulon_mat[selected_eRegulons] # Cell annotations and metrics metrics = [] annotations = [] for var in cell_data: if isinstance(cell_data[var][0], np.bool_): annotations.append(cell_data[var]) else: try: metrics.append(cell_data[var].astype('float64')) cell_data[var] = cell_data[var].astype('float64') except BaseException: if len(set(cell_data[var])) < 255: annotations.append(cell_data[var].astype('str')) cell_data[var] = cell_data[var].astype('str') metrics = pd.concat(metrics, axis=1).fillna(0) if len(metrics) > 0 else [] annotations = pd.concat(annotations, axis=1) if len( annotations) > 0 else [] # Embeddings. Cell embeddings in this case embeddings = scplus_obj.dr_cell #only keep first two dimensions of embedding for key in embeddings.keys(): embeddings[key] = embeddings[key][embeddings[key].columns[0:2]] # Add linked_gene information if signature_key == 'Gene_based': linked_gene = scplus_obj.uns[eRegulon_metadata_key][[ 'Region', 'Gene']].groupby('Gene').agg(lambda x: '; '.join(set(x))) linked_gene = linked_gene.reindex( index=feature_names, fill_value='').loc[feature_names] else: linked_gene = scplus_obj.uns[eRegulon_metadata_key][['Region', 'Gene']].groupby( 'Region').agg(lambda x: '; '.join(set(x))) linked_gene = linked_gene.reindex( index=feature_names, fill_value='').loc[feature_names] linked_gene.columns = ['0'] # Create minimal loom log.info('Creating minimal loom') _export_minimal_loom(ex_mtx=ex_mtx, cell_names=cell_names, feature_names=feature_names, out_fname=out_fname, regulons=regulon_mat, cell_annotations=None, tree_structure=tree_structure, title=title, nomenclature=nomenclature, embeddings=embeddings, auc_mtx=auc_mtx, auc_thresholds=auc_thresholds, linked_gene=linked_gene) # Add annotations log.info('Adding annotations') path_to_loom = out_fname loom = SCopeLoom.read_loom(path_to_loom) if len(metrics): _add_metrics(loom, metrics) if len(annotations): _add_annotation(loom, annotations) # Add clusterings if cluster_annotation is not None: log.info('Adding clusterings') _add_clusterings(loom, pd.DataFrame(cell_data[cluster_annotation])) # Add markers if cluster_markers is not None: log.info('Adding markers') annotation_in_markers = [ x for x in cluster_annotation if x in cluster_markers.keys()] annotation_not_in_markers = [ x for x in cluster_annotation if x not in cluster_markers.keys()] for x in annotation_not_in_markers: log.info(x, 'is not in the cluster markers dictionary') cluster_markers = { k: v for k, v in cluster_markers.items() if k in annotation_in_markers } # Keep genes in data for y in cluster_markers: cluster_markers[y] = { x: cluster_markers[y][x][cluster_markers[y] [x].index.isin(feature_names)] for x in cluster_markers[y].keys() } _add_markers(loom, cluster_markers) log.info('Exporting') loom.export(out_fname)
def _export_minimal_loom( ex_mtx: sparse.csr_matrix, cell_names: List[str], feature_names: List[str], out_fname: str, regulons: pd.DataFrame = None, cell_annotations: Optional[Mapping[str, str]] = None, tree_structure: Sequence[str] = (), title: Optional[str] = None, nomenclature: str = "Unknown", num_workers: int = cpu_count(), embeddings: Mapping[str, pd.DataFrame] = {}, auc_mtx=None, auc_thresholds=None, compress: bool = False, linked_gene=None ): """ An internal function to create a minimal loom file """ # Information on the general loom file format: http://linnarssonlab.org/loompy/format/index.html # Information on the SCope specific alterations: https://github.com/aertslab/SCope/wiki/Data-Format if cell_annotations is None: cell_annotations = dict(zip(cell_names, ['-'] * ex_mtx.shape[1])) # Create an embedding based on tSNE. # Name of columns should be "_X" and "_Y". if len(embeddings) == 0: embeddings = { "tSNE (default)": pd.DataFrame(data=TSNE().fit_transform(auc_mtx), index=cell_names, columns=['_X', '_Y']) } # (n_cells, 2) id2name = OrderedDict() embeddings_X = pd.DataFrame(index=cell_names) embeddings_Y = pd.DataFrame(index=cell_names) for idx, (name, df_embedding) in enumerate(embeddings.items()): if len(df_embedding.columns) != 2: raise Exception('The embedding should have two columns.') # Default embedding must have id == -1 for SCope. embedding_id = idx - 1 id2name[embedding_id] = name embedding = df_embedding.copy() embedding.columns = ['_X', '_Y'] embeddings_X = pd.merge( embeddings_X, embedding['_X'].to_frame().rename( columns={'_X': str(embedding_id)}), left_index=True, right_index=True, ) embeddings_Y = pd.merge( embeddings_Y, embedding['_Y'].to_frame().rename( columns={'_Y': str(embedding_id)}), left_index=True, right_index=True, ) # Encode cell type clusters. # The name of the column should match the identifier of the clustering. name2idx = dict(map(reversed, enumerate( sorted(set(cell_annotations.values()))))) clusterings = ( pd.DataFrame(data=cell_names, index=cell_names, columns=['0']) .replace(cell_annotations) .replace(name2idx) ) # Create meta-data structure. def create_structure_array(df): # Create a numpy structured array return np.array([tuple(row) for row in df.values], dtype=np.dtype(list(zip(df.columns, df.dtypes)))) default_embedding = pd.DataFrame([embeddings_X.iloc[:,0], embeddings_Y.iloc[:,0]], columns=cell_names, index=['_X', '_Y']).T column_attrs = { "CellID": np.array(cell_names), "Embedding": create_structure_array(default_embedding), "RegulonsAUC": create_structure_array(auc_mtx), "Clusterings": create_structure_array(clusterings), "ClusterID": clusterings.values, 'Embeddings_X': create_structure_array(embeddings_X), 'Embeddings_Y': create_structure_array(embeddings_Y), } if linked_gene is None: row_attrs = { "Gene": np.array(feature_names), "Regulons": create_structure_array(regulons), } else: row_attrs = { "Gene": np.array(feature_names), "Regulons": create_structure_array(regulons), "linkedGene": np.array(linked_gene['0']) } def fetch_logo(context): for elem in context: if elem.endswith('.png'): return elem return "" regulon_thresholds = [ { "regulon": name, "defaultThresholdValue": (threshold if isinstance(threshold, float) else threshold[0]), "defaultThresholdName": "gaussian_mixture_split", "allThresholds": {"gaussian_mixture_split": (threshold if isinstance(threshold, float) else threshold[0])}, "motifData": "", } for name, threshold in auc_thresholds.iteritems() ] general_attrs = { "title": os.path.splitext(os.path.basename(out_fname))[0] if title is None else title, "MetaData": json.dumps( { "embeddings": [{'id': identifier, 'name': name} for identifier, name in id2name.items()], "annotations": [{"name": "", "values": []}], "clusterings": [ { "id": 0, "group": "celltype", "name": "Cell Type", "clusters": [{"id": idx, "description": name} for name, idx in name2idx.items()], } ], "regulonThresholds": regulon_thresholds, } ), "Genome": nomenclature, } # Add tree structure. # All three levels need to be supplied assert len(tree_structure) <= 3, "" general_attrs.update( ("SCopeTreeL{}".format(idx + 1), category) for idx, category in enumerate(list(islice(chain(tree_structure, repeat("")), 3))) ) # Compress MetaData global attribute if compress: general_attrs["MetaData"] = compress_encode( value=general_attrs["MetaData"]) # Create loom file for use with the SCope tool. lp.create( filename=out_fname, layers=ex_mtx, row_attrs=row_attrs, col_attrs=column_attrs, file_attrs=general_attrs, ) def _get_metadata(loom): """ A helper function to get metadata """ annot_metadata = loom.get_meta_data()['annotations'] annot_mt_column_names = [annot_metadata[x]['name'] for x in range(len(annot_metadata))] annot_mt = pd.concat([pd.DataFrame(loom.col_attrs[annot_mt_column_names[x]]) for x in range(len(annot_mt_column_names))], axis=1) annot_mt.columns = [annot_mt_column_names[x] for x in range(len(annot_mt_column_names))] annot_mt.index = loom.get_cell_ids().tolist() return annot_mt def _add_metrics(loom, metrics: pd.DataFrame): """ A helper function to add metrics """ md_metrics = [] for metric in metrics: md_metrics.append({"name": metric}) loom.col_attrs[metric] = np.array(metrics[metric]) loom.global_attrs["MetaData"].update({'metrics': md_metrics}) def _add_annotation(loom, annots: pd.DataFrame): """ A helper function to add annotations """ md_annot = [] for annot in annots: vals = list(annots[annot]) uniq_vals = np.unique(vals) md_annot.append({ "name": annot, "values": list(map(lambda x: str(x), uniq_vals.tolist())) }) loom.col_attrs[annot] = np.array(annots[annot]) loom.global_attrs["MetaData"].update({'annotations': md_annot}) def _add_clusterings(loom: SCopeLoom, cluster_data: pd.DataFrame): """ A helper function to add clusters """ col_attrs = loom.col_attrs attrs_metadata = {} attrs_metadata["clusterings"] = [] clusterings = pd.DataFrame(index=cluster_data.index.tolist()) j = 0 for cluster_name in cluster_data.columns: clustering_id = j clustering_algorithm = cluster_name clustering_resolution = cluster_name cluster_marker_method = 'Wilcoxon' num_clusters = len(np.unique(cluster_data[cluster_name])) cluster_2_number = { np.unique(cluster_data[cluster_name])[i]: i for i in range(num_clusters) } # Data clusterings[str(clustering_id)] = [cluster_2_number[x] for x in cluster_data[cluster_name].tolist()] # Metadata attrs_metadata["clusterings"] = attrs_metadata["clusterings"] + [{ "id": clustering_id, "group": clustering_algorithm, "name": clustering_algorithm, "clusters": [], "clusterMarkerMetrics": [ { "accessor": "avg_logFC", "name": "Avg. logFC", "description": f"Average log fold change from {cluster_marker_method.capitalize()} test" }, { "accessor": "pval", "name": "Adjusted P-Value", "description": f"Adjusted P-Value from {cluster_marker_method.capitalize()} test" } ] }] for i in range(0, num_clusters): cluster = {} cluster['id'] = i cluster['description'] = np.unique(cluster_data[cluster_name])[i] attrs_metadata['clusterings'][j]['clusters'].append(cluster) j += 1 # Update column attribute Dict col_attrs_clusterings = { # Pick the first one as default clustering (this is purely # arbitrary) "ClusterID": clusterings["0"].values, "Clusterings": _df_to_named_matrix(clusterings) } col_attrs = {**col_attrs, **col_attrs_clusterings} loom.col_attrs = col_attrs loom.global_attrs["MetaData"].update( {'clusterings': attrs_metadata["clusterings"]} ) def _add_markers(loom: SCopeLoom, markers_dict: Dict[str, Dict[str, pd.DataFrame]]): """ A helper function to add markers to clusterings """ attrs_metadata = loom.global_attrs["MetaData"] row_attrs = loom.row_attrs for cluster_name in markers_dict: idx = [i for i in range(len(attrs_metadata['clusterings'])) if attrs_metadata['clusterings'][i]["name"] == cluster_name][0] clustering_id = attrs_metadata['clusterings'][idx]["id"] num_clusters = len(attrs_metadata['clusterings'][idx]["clusters"]) cluster_description = [ attrs_metadata['clusterings'][idx]["clusters"][x]['description'] for x in range(num_clusters) ] # Initialize cluster_markers = pd.DataFrame( index=loom.get_genes(), columns=[str(x) for x in np.arange(num_clusters)] ).fillna(0, inplace=False) cluster_markers_avg_logfc = pd.DataFrame( index=loom.get_genes(), columns=[str(x) for x in np.arange(num_clusters)] ).fillna(0, inplace=False) cluster_markers_pval = pd.DataFrame( index=loom.get_genes(), columns=[str(x) for x in np.arange(num_clusters)] ).fillna(0, inplace=False) # Populate for i in range(0, num_clusters): try: gene_names = markers_dict[cluster_name][cluster_description[i]].index.tolist( ) pvals_adj = markers_dict[cluster_name][cluster_description[i] ]['Adjusted_pval'] logfoldchanges = markers_dict[cluster_name][cluster_description[i]]['Log2FC'] i = str(i) num_genes = len(gene_names) # Replace cluster_markers.loc[gene_names, i] = np.int(1) cluster_markers_avg_logfc.loc[gene_names, i] = logfoldchanges cluster_markers_pval.loc[gene_names, i] = pvals_adj except BaseException: print('No markers for ', cluster_description[i]) # Update row attribute Dict row_attrs_cluster_markers = { f"ClusterMarkers_{str(idx)}": _df_to_named_matrix( cluster_markers.astype(np.int8)), f"ClusterMarkers_{str(idx)}_avg_logFC": _df_to_named_matrix(cluster_markers_avg_logfc.astype(np.float32)), f"ClusterMarkers_{str(idx)}_pval": _df_to_named_matrix(cluster_markers_pval.astype(np.float32)) } row_attrs = {**row_attrs, **row_attrs_cluster_markers} loom.row_attrs = row_attrs def _get_regulons(loom): """ A helper function to get regulons """ regulon_dict = pd.DataFrame( loom.get_regulons(), index=loom.row_attrs['Gene'] ).to_dict() for t in regulon_dict: regulon_dict[t] = {x: y for x, y in regulon_dict[t].items() if y != 0} motif_data = { x['regulon']: x['motifData'] for x in loom.global_attrs['MetaData']['regulonThresholds'] } regulon_list = [ Regulon( name=x, gene2weight=regulon_dict[x], transcription_factor=x.split('_')[0], gene2occurrence=[], context=frozenset( list( motif_data[x])) ) for x in regulon_dict.keys() ] return regulon_list def _df_to_named_matrix(df: pd.DataFrame): """ A helper function to create metadata structure. """ return np.array([tuple(row) for row in df.values], dtype=np.dtype(list(zip(df.columns, df.dtypes))))