Source code for scenicplus.wrappers.run_scenicplus

"""Wrapper functions to run SCENIC+ analysis

After the SCENIC+ object has been generated the complete SCENIC+ workflow can be run with a single function. 

Following operation will be done:

1. Defining a search space surounding each gene
2. Region to gene linking & TF to gene linking
3. eGRN building
4. eGRN scoring (target gene and region based AUC and RSS)
5. Calculating TF-cistrome correlations
6. Dimensionality reductions
7. export to UCSC tracks and to loom file

In case the process is killed prematurely, the function can be restared and the workflow will resume from the last step that was succesfully completed.

"""


from scenicplus.scenicplus_class import SCENICPLUS, create_SCENICPLUS_object
from scenicplus.preprocessing.filtering import *
from scenicplus.cistromes import *
from scenicplus.enhancer_to_gene import get_search_space, calculate_regions_to_genes_relationships, GBM_KWARGS
from scenicplus.enhancer_to_gene import export_to_UCSC_interact 
from scenicplus.utils import format_egrns, export_eRegulons
from scenicplus.eregulon_enrichment import *
from scenicplus.TF_to_gene import *
from ..grn_builder.gsea_approach import build_grn
from scenicplus.dimensionality_reduction import *
from scenicplus.RSS import *
from scenicplus.diff_features import *
from scenicplus.loom import *
from typing import Dict, List, Mapping, Optional, Sequence
import os
import dill
import time


[docs]def run_scenicplus(scplus_obj: 'SCENICPLUS', variable: List[str], species: str, assembly: str, tf_file: str, save_path: str, biomart_host: Optional[str] = 'http://www.ensembl.org', upstream: Optional[List] = [1000, 150000], downstream: Optional[List] = [1000, 150000], region_ranking: Optional['CisTopicImputedFeatures'] = None, gene_ranking: Optional['CisTopicImputedFeatures'] = None, simplified_eGRN: Optional[bool] = False, calculate_TF_eGRN_correlation: Optional[bool] = True, calculate_DEGs_DARs: Optional[bool] = True, export_to_loom_file: Optional[bool] = True, export_to_UCSC_file: Optional[bool] = True, tree_structure: Sequence[str] = (), path_bedToBigBed: Optional[str] = None, n_cpu: Optional[int] = 1, _temp_dir: Optional[str] = '', save_partial: Optional[bool] = False, **kwargs ): """ Wrapper to run SCENIC+ Parameters --------- scplus_obj: `class::SCENICPLUS` A SCENICPLUS object. variables: List[str] Variables to use for RSS, TF-eGRN correlation and markers. species: str Species from which data comes from. Possible values: 'hsapiens', 'mmusculus', 'dmelanogaster' assembly: str Genome assembly to which the data was mapped. Possible values: 'hg38' tf_file: str Path to file containing genes that are TFs save_path: str Folder in which results will be saved biomart_host: str, optional Path to biomart host. Make sure that the host matches your genome assembly upstream: str, optional Upstream space to use for region to gene relationships downstream: str, optional Upstream space to use for region to gene relationships region_ranking: `class::CisTopicImputedFeatures`, optional Precomputed region ranking gene_ranking: `class::CisTopicImputedFeatures`, optional Precomputed gene ranking simplified_eGRN: bool, optional Whether to output simplified eGRNs (only TF-G sign rather than TF-G_R-G) calculate_TF_eGRN_correlation: bool, optional Whether to calculate the TF-eGRN correlation based on the variables calculate_DEGs_DARs: bool, optional Whether to calculate DARs/DEGs based on the variables export_to_loom_file: bool, optional Whether to export data to loom files (gene based/region based) export_to_UCSC_file: bool, optional Whether to export region-to-gene links and eregulons to bed files tree_structure: sequence, optional Tree structure for loom files path_bedToBigBed: str, optional Path to convert bed files to big bed when exporting to UCSC (required if files are meant to be used in a hub) n_cpu: int, optional Number of cores to use _temp_dir: str, optional Temporary directory for ray """ # Create logger level = logging.INFO log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s' handlers = [logging.StreamHandler(stream=sys.stdout)] logging.basicConfig(level=level, format=log_format, handlers=handlers) log = logging.getLogger('SCENIC+_wrapper') start_time = time.time() check_folder = os.path.isdir(save_path) if not check_folder: os.makedirs(save_path) log.info("Created folder : "+ save_path) else: log.info(save_path + " folder already exists.") if 'Cistromes' not in scplus_obj.uns.keys(): log.info('Merging cistromes') merge_cistromes(scplus_obj) if 'search_space' not in scplus_obj.uns.keys(): log.info('Getting search space') get_search_space(scplus_obj, biomart_host = biomart_host, species = species, assembly = assembly, upstream = upstream, downstream = downstream) if 'region_to_gene' not in scplus_obj.uns.keys(): log.info('Inferring region to gene relationships') calculate_regions_to_genes_relationships(scplus_obj, ray_n_cpu = n_cpu, _temp_dir = _temp_dir, importance_scoring_method = 'GBM', importance_scoring_kwargs = GBM_KWARGS, **kwargs) if save_partial: log.info('Saving partial object') with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f: dill.dump(scplus_obj, f, protocol = -1) if 'TF2G_adj' not in scplus_obj.uns.keys(): log.info('Inferring TF to gene relationships') calculate_TFs_to_genes_relationships(scplus_obj, tf_file = tf_file, ray_n_cpu = n_cpu, method = 'GBM', _temp_dir = _temp_dir, key= 'TF2G_adj', **kwargs) if save_partial: log.info('Saving partial object') with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f: dill.dump(scplus_obj, f, protocol = -1) if 'eRegulons' not in scplus_obj.uns.keys(): log.info('Build eGRN') build_grn(scplus_obj, min_target_genes = 10, adj_pval_thr = 1, min_regions_per_gene = 0, quantiles = (0.85, 0.90, 0.95), top_n_regionTogenes_per_gene = (5, 10, 15), top_n_regionTogenes_per_region = (), binarize_using_basc = True, rho_dichotomize_tf2g = True, rho_dichotomize_r2g = True, rho_dichotomize_eregulon = True, rho_threshold = 0.05, keep_extended_motif_annot = True, merge_eRegulons = True, order_regions_to_genes_by = 'importance', order_TFs_to_genes_by = 'importance', key_added = 'eRegulons', cistromes_key = 'Unfiltered', disable_tqdm = False, ray_n_cpu = n_cpu, _temp_dir = _temp_dir, **kwargs) if 'eRegulon_metadata' not in scplus_obj.uns.keys(): log.info('Formatting eGRNs') format_egrns(scplus_obj, eregulons_key = 'eRegulons', TF2G_key = 'TF2G_adj', key_added = 'eRegulon_metadata') if 'eRegulon_signatures' not in scplus_obj.uns.keys(): log.info('Converting eGRNs to signatures') get_eRegulons_as_signatures(scplus_obj, eRegulon_metadata_key='eRegulon_metadata', key_added='eRegulon_signatures') if simplified_eGRN is True: md = scplus_obj.uns['eRegulon_signatures']['Gene_based'] names = list(set([x.split('_(')[0][:len(x.split('_(')[0]) - 2] for x in md.keys()])) scplus_obj.uns['eRegulon_signatures']['Gene_based'] = {x:list(set(sum([value for key, value in md.items() if key.startswith(x)], []))) for x in names} scplus_obj.uns['eRegulon_signatures']['Gene_based'] = {x+'_('+str(len(scplus_obj.uns['eRegulon_signatures']['Gene_based'][x]))+'g)': scplus_obj.uns['eRegulon_signatures']['Gene_based'][x] for x in scplus_obj.uns['eRegulon_signatures']['Gene_based'].keys()} md = scplus_obj.uns['eRegulon_signatures']['Region_based'] names = list(set([x.split('_(')[0][:len(x.split('_(')[0]) - 2] for x in md.keys()])) scplus_obj.uns['eRegulon_signatures']['Region_based'] = {x:list(set(sum([value for key, value in md.items() if key.startswith(x)], []))) for x in names} scplus_obj.uns['eRegulon_signatures']['Region_based'] = {x+'_('+str(len(scplus_obj.uns['eRegulon_signatures']['Region_based'][x]))+'r)': scplus_obj.uns['eRegulon_signatures']['Region_based'][x] for x in scplus_obj.uns['eRegulon_signatures']['Region_based'].keys()} if 'eRegulon_AUC' not in scplus_obj.uns.keys(): log.info('Calculating eGRNs AUC') if region_ranking is None: log.info('Calculating region ranking') region_ranking = make_rankings(scplus_obj, target='region') with open(os.path.join(save_path,'region_ranking.pkl'), 'wb') as f: dill.dump(region_ranking, f, protocol = -1) log.info('Calculating eGRNs region based AUC') score_eRegulons(scplus_obj, ranking = region_ranking, eRegulon_signatures_key = 'eRegulon_signatures', key_added = 'eRegulon_AUC', enrichment_type= 'region', auc_threshold = 0.05, normalize = False, n_cpu = n_cpu) if gene_ranking is None: log.info('Calculating gene ranking') gene_ranking = make_rankings(scplus_obj, target='gene') with open(os.path.join(save_path,'gene_ranking.pkl'), 'wb') as f: dill.dump(gene_ranking, f, protocol = -1) log.info('Calculating eGRNs gene based AUC') score_eRegulons(scplus_obj, gene_ranking, eRegulon_signatures_key = 'eRegulon_signatures', key_added = 'eRegulon_AUC', enrichment_type = 'gene', auc_threshold = 0.05, normalize= False, n_cpu = n_cpu) if calculate_TF_eGRN_correlation is True: log.info('Calculating TF-eGRNs AUC correlation') for var in variable: generate_pseudobulks(scplus_obj, variable = var, auc_key = 'eRegulon_AUC', signature_key = 'Gene_based', nr_cells = 5, nr_pseudobulks = 100, seed=555) generate_pseudobulks(scplus_obj, variable = var, auc_key = 'eRegulon_AUC', signature_key = 'Region_based', nr_cells = 5, nr_pseudobulks = 100, seed=555) TF_cistrome_correlation(scplus_obj, variable = var, auc_key = 'eRegulon_AUC', signature_key = 'Gene_based', out_key = var+'_eGRN_gene_based') TF_cistrome_correlation(scplus_obj, variable = var, auc_key = 'eRegulon_AUC', signature_key = 'Region_based', out_key = var+'_eGRN_region_based') if 'eRegulon_AUC_thresholds' not in scplus_obj.uns.keys(): log.info('Binarizing eGRNs AUC') binarize_AUC(scplus_obj, auc_key='eRegulon_AUC', out_key='eRegulon_AUC_thresholds', signature_keys=['Gene_based', 'Region_based'], n_cpu=n_cpu) if not hasattr(scplus_obj, 'dr_cell'): scplus_obj.dr_cell = {} if 'eRegulons_UMAP' not in scplus_obj.dr_cell.keys(): log.info('Making eGRNs AUC UMAP') run_eRegulons_umap(scplus_obj, scale=True, signature_keys=['Gene_based', 'Region_based']) if 'eRegulons_tSNE' not in scplus_obj.dr_cell.keys(): log.info('Making eGRNs AUC tSNE') run_eRegulons_tsne(scplus_obj, scale=True, signature_keys=['Gene_based', 'Region_based']) if 'RSS' not in scplus_obj.uns.keys(): log.info('Calculating eRSS') for var in variable: regulon_specificity_scores(scplus_obj, var, signature_keys=['Gene_based'], out_key_suffix='_gene_based', scale=False) regulon_specificity_scores(scplus_obj, var, signature_keys=['Region_based'], out_key_suffix='_region_based', scale=False) if calculate_DEGs_DARs is True: log.info('Calculating DEGs/DARs') for var in variable: get_differential_features(scplus_obj, var, use_hvg = True, contrast_type = ['DEGs', 'DARs']) if save_partial: log.info('Saving partial object') with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f: dill.dump(scplus_obj, f, protocol = -1) if export_to_loom_file is True: log.info('Exporting to loom file') export_to_loom(scplus_obj, signature_key = 'Gene_based', tree_structure = tree_structure, title = 'Gene based eGRN', nomenclature = assembly, out_fname=os.path.join(save_path,'SCENIC+_gene_based.loom')) export_to_loom(scplus_obj, signature_key = 'Region_based', tree_structure = tree_structure, title = 'Region based eGRN', nomenclature = assembly, out_fname=os.path.join(save_path,'SCENIC+_region_based.loom')) if export_to_UCSC_file is True: log.info('Exporting to UCSC') r2g_data = export_to_UCSC_interact(scplus_obj, species, os.path.join(save_path,'r2g.rho.bed'), path_bedToBigBed=path_bedToBigBed, bigbed_outfile=os.path.join(save_path,'r2g.rho.bb'), region_to_gene_key='region_to_gene', pbm_host=biomart_host, assembly=assembly, ucsc_track_name='R2G', ucsc_description='SCENIC+ region to gene links', cmap_neg='Reds', cmap_pos='Greens', key_for_color='rho', scale_by_gene=False, subset_for_eRegulons_regions=True, eRegulons_key='eRegulons') r2g_data = export_to_UCSC_interact(scplus_obj, species, os.path.join(save_path,'r2g.importance.bed'), path_bedToBigBed=path_bedToBigBed, bigbed_outfile=os.path.join(save_path,'r2g.importance.bb'), region_to_gene_key='region_to_gene', pbm_host=biomart_host, assembly=assembly, ucsc_track_name='R2G', ucsc_description='SCENIC+ region to gene links', cmap_neg='Reds', cmap_pos='Greens', key_for_color='importance', scale_by_gene=True, subset_for_eRegulons_regions=True, eRegulons_key='eRegulons') regions = export_eRegulons(scplus_obj, os.path.join(save_path,'eRegulons.bed'), assembly, bigbed_outfile = os.path.join(save_path,'eRegulons.bb'), eRegulon_metadata_key = 'eRegulon_metadata', eRegulon_signature_key = 'eRegulon_signatures', path_bedToBigBed=path_bedToBigBed) log.info('Saving object') with open(os.path.join(save_path,'scplus_obj.pkl'), 'wb') as f: dill.dump(scplus_obj, f, protocol = -1) log.info('Finished! Took {} minutes'.format((time.time() - start_time)/60))