Source code for scenicplus.wrappers.run_pycistarget

"""
Wrapper functions to run motif enrichment analysis using pycistarget

After sets of regions have been defined (e.g. topics or DARs). The complete pycistarget workflo can be run using a single function.

this function will run cistarget based and DEM based motif enrichment analysis with or without promoter regions.
"""

from typing import Dict
import pandas as pd
import dill
import pyranges as pr
from pycistarget.motif_enrichment_cistarget import *
from pycistarget.motif_enrichment_dem import *
from pycistarget.utils import *
import pybiomart as pbm
import time

[docs]def run_pycistarget(region_sets: Dict[str, pr.PyRanges], species: str, save_path: str, custom_annot: pd.DataFrame = None, save_partial: bool = False, ctx_db_path: str = None, dem_db_path: str = None, run_without_promoters: bool = False, biomart_host: str = 'http://www.ensembl.org', promoter_space: int = 500, ctx_auc_threshold: float = 0.005, ctx_nes_threshold: float = 3.0, ctx_rank_threshold: float = 0.05, dem_log2fc_thr: float = 0.5, dem_motif_hit_thr: float = 3.0, dem_max_bg_regions: int = 500, annotation : List[str] = ['Direct_annot', 'Orthology_annot'], motif_similarity_fdr: float = 0.000001, path_to_motif_annotations: str = None, annotation_version: str = 'v9', n_cpu : int = 1, _temp_dir: str = None, exclude_motifs: str = None, exclude_collection: List[str] = None, **kwargs): """ Wrapper function for pycistarget Parameters --------- region_sets: Mapping[str, pr.PyRanges] A dictionary of PyRanges containing region coordinates for the region sets to be analyzed. species: str Species from which genomic coordinates come from, options are: homo_sapiens, mus_musculus, drosophila_melanogaster and gallus_gallus. save_path: str Directory in which to save outputs. custom_annot: pd.DataFrame pandas DataFrame with genome annotation for custom species (i.e. for a species other than homo_sapiens, mus_musculus, drosophila_melanogaster or gallus_gallus). This DataFrame should (minimally) look like the example below, and only contains protein coding genes: >>> custom_annot Chromosome Start Strand Gene Transcript_type 8053 chrY 22490397 1 PRY protein_coding 8153 chrY 12662368 1 USP9Y protein_coding 8155 chrY 12701231 1 USP9Y protein_coding 8158 chrY 12847045 1 USP9Y protein_coding 8328 chrY 22096007 -1 PRY2 protein_coding ... ... ... ... ... ... 246958 chr1 181483738 1 CACNA1E protein_coding 246960 chr1 181732466 1 CACNA1E protein_coding 246962 chr1 181776101 1 CACNA1E protein_coding 246963 chr1 181793668 1 CACNA1E protein_coding 246965 chr1 203305519 1 BTG2 protein_coding [78812 rows x 5 columns] save_partial: bool=False Whether to save the individual analyses as pkl. Useful to run analyses in chunks or add new settings. ctx_db_path: str = None Path to cistarget database containing rankings of motif scores dem_db_path: str = None Path to dem database containing motif scores run_without_promoters: bool = False Boolean specifying wether the analysis should also be run without including promoter regions. biomart_host: str = 'http://www.ensembl.org' url to biomart host, make sure this host matches your assembly promoter_space: int = 500 integer defining space around the TSS to consider as promoter ctx_auc_threshold: float = 0.005 The fraction of the ranked genome to take into account for the calculation of the Area Under the recovery Curve ctx_nes_threshold: float = 3.0 The Normalized Enrichment Score (NES) threshold to select enriched features. ctx_rank_threshold: float = 0.05 The total number of ranked genes to take into account when creating a recovery curve. dem_log2fc_thr: float = 0.5 Log2 Fold-change threshold to consider a motif enriched. dem_motif_hit_thr: float = 3.0 Minimul mean signal in the foreground to consider a motif enriched. dem_max_bg_regions: int = 500 Maximum number of regions to use as background. When set to None, all regions are used annotation : List[str] = ['Direct_annot', 'Orthology_annot'] Annotation to use for forming cistromes. It can be 'Direct_annot' (direct evidence that the motif is linked to that TF), 'Motif_similarity_annot' (based on tomtom motif similarity), 'Orthology_annot' (based on orthology with a TF that is directly linked to that motif) or 'Motif_similarity_and_Orthology_annot'. path_to_motif_annotations: str = None Path to motif annotations. If not provided, they will be downloaded from https://resources.aertslab.org based on the specie name provided (only possible for mus_musculus, homo_sapiens and drosophila_melanogaster). annotation_version: str = 'v9' Motif collection version. n_cpu : int = 1 Number of cores to use. _temp_dir: str = None temp_dir to use for ray. exclude_motifs: str = None Path to csv file containing motif to exclude from the analysis. exclude_collection: List[str] = None List of strings identifying which motif collections to exclude from analysis. """ # Create logger level = logging.INFO log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s' handlers = [logging.StreamHandler(stream=sys.stdout)] logging.basicConfig(level=level, format=log_format, handlers=handlers) log = logging.getLogger('pycisTarget_wrapper') start_time = time.time() check_folder = os.path.isdir(save_path) if not check_folder: os.makedirs(save_path) log.info("Created folder : " + save_path) else: log.info(save_path + " folder already exists.") def get_species_annotation(species: str): dataset = pbm.Dataset(name=species, host=biomart_host) annot = dataset.query(attributes=['chromosome_name', 'transcription_start_site', 'strand', 'external_gene_name', 'transcript_biotype']) annot.columns = ['Chromosome', 'Start', 'Strand', 'Gene', 'Transcript_type'] annot['Chromosome'] = annot['Chromosome'].astype('str') filterf = annot['Chromosome'].str.contains('CHR|GL|JH|MT|KI') annot = annot[~filterf] annot['Chromosome'] = annot['Chromosome'].replace(r'(\b\S)', r'chr\1') annot = annot[annot.Transcript_type == 'protein_coding'] annot = annot.dropna(subset = ['Chromosome', 'Start']) # Check if chromosomes have chr check = region_sets[list(region_sets.keys())[0]] if not any(['chr' in c for c in check[list(check.keys())[0]].df['Chromosome']]): annot.Chromosome = annot.Chromosome.str.replace('chr', '') if not any(['chr' in x for x in annot.Chromosome]): annot.Chromosome = [f'chr{x}' for x in annot.Chromosome] annot_dem=annot.copy() # Define promoter space annot['End'] = annot['Start'].astype(int)+promoter_space annot['Start'] = annot['Start'].astype(int)-promoter_space annot = pr.PyRanges(annot[['Chromosome', 'Start', 'End']]) return annot, annot_dem # Prepare annotation if species == 'homo_sapiens': annot, annot_dem = get_species_annotation('hsapiens_gene_ensembl') elif species == 'mus_musculus': annot, annot_dem = get_species_annotation('mmusculus_gene_ensembl') elif species == 'drosophila_melanogaster': annot, annot_dem = get_species_annotation('dmelanogaster_gene_ensembl') elif species == 'gallus_gallus': annot, annot_dem = get_species_annotation('ggallus_gene_ensembl') elif species == 'custom': annot_dem = custom_annot annot = annot_dem.copy() # Define promoter space annot['End'] = annot['Start'].astype(int)+promoter_space annot['Start'] = annot['Start'].astype(int)-promoter_space annot = pr.PyRanges(annot[['Chromosome', 'Start', 'End']]) else: raise TypeError("Species not recognized") menr = {} for key in region_sets.keys(): if ctx_db_path is not None: log.info('Loading cisTarget database for ' + key) ## CISTARGET regions = region_sets[key] ctx_db = cisTargetDatabase(ctx_db_path, regions) if exclude_motifs is not None: out = pd.read_csv(exclude_motifs, header=None).iloc[:,0].tolist() ctx_db.db_rankings = ctx_db.db_rankings.drop(out) if exclude_collection is not None: for col in exclude_collection: ctx_db.db_rankings = ctx_db.db_rankings[~ctx_db.db_rankings.index.str.contains(col)] ## DEFAULT log.info('Running cisTarget for '+key) menr['CTX_'+key+'_All'] = run_cistarget(ctx_db = ctx_db, region_sets = regions, specie = species, auc_threshold = ctx_auc_threshold, nes_threshold = ctx_nes_threshold, rank_threshold = ctx_rank_threshold, annotation = annotation, motif_similarity_fdr = motif_similarity_fdr, path_to_motif_annotations = path_to_motif_annotations, n_cpu = n_cpu, _temp_dir= _temp_dir, annotation_version = annotation_version, **kwargs) out_folder = os.path.join(save_path,'CTX_'+key+'_All') check_folder = os.path.isdir(out_folder) if not check_folder: os.makedirs(out_folder) log.info("Created folder : " + out_folder) else: log.info(out_folder + " folder already exists.") for x in menr['CTX_'+key+'_All'].keys(): out_file = os.path.join(out_folder, str(x) +'.html') menr['CTX_'+key+'_All'][str(x)].motif_enrichment.to_html(open(out_file, 'w'), escape=False, col_space=80) if(save_partial): with open(os.path.join(save_path,'CTX_'+key+'_All' + '.pkl'), 'wb') as f: dill.dump(menr['CTX_'+key+'_All'], f, protocol=-1) if run_without_promoters is True: ## REMOVE PROMOTERS log.info('Running cisTarget without promoters for '+key) regions_overlaps = {key: regions[key].count_overlaps(annot) for key in regions.keys()} regions_np = {key: regions_overlaps[key][regions_overlaps[key].NumberOverlaps == 0][['Chromosome', 'Start', 'End']] for key in regions.keys()} db_regions = set(pd.concat([ctx_db.regions_to_db[x] for x in ctx_db.regions_to_db.keys()])['Query']) ctx_db.regions_to_db = {x: target_to_query(regions_np[x], list(db_regions), fraction_overlap = 0.4) for x in regions_np.keys()} menr['CTX_'+key+'_No_promoters'] = run_cistarget(ctx_db = ctx_db, region_sets = regions_np, specie = species, auc_threshold = ctx_auc_threshold, nes_threshold = ctx_nes_threshold, rank_threshold = ctx_rank_threshold, annotation = annotation, motif_similarity_fdr = motif_similarity_fdr, path_to_motif_annotations = path_to_motif_annotations, n_cpu = n_cpu, _temp_dir= _temp_dir, annotation_version = annotation_version, **kwargs) out_folder = os.path.join(save_path,'CTX_'+key+'_No_promoters') check_folder = os.path.isdir(out_folder) if not check_folder: os.makedirs(out_folder) log.info("Created folder:" + out_folder) else: log.info(out_folder + " folder already exists.") for x in menr['CTX_'+key+'_No_promoters'].keys(): out_file = os.path.join(out_folder, str(x) +'.html') menr['CTX_'+key+'_No_promoters'][str(x)].motif_enrichment.to_html(open(out_file, 'w'), escape=False, col_space=80) if(save_partial): with open(os.path.join(save_path,'CTX_'+key+'_No_promoters' + '.pkl'), 'wb') as f: dill.dump(menr['CTX_'+key+'_No_promoters'], f, protocol=-1) ## DEM if dem_db_path is not None: log.info('Running DEM for '+key) regions = region_sets[key] dem_db = DEMDatabase(dem_db_path, regions) if exclude_motifs is not None: out = pd.read_csv(exclude_motifs, header=None).iloc[:,0].tolist() dem_db.db_scores = dem_db.db_scores.drop(out) if exclude_collection is not None: for col in exclude_collection: dem_db.db_scores = dem_db.db_scores[~dem_db.db_scores.index.str.contains(col)] menr['DEM_'+key+'_All'] = DEM(dem_db = dem_db, region_sets = regions, log2fc_thr = dem_log2fc_thr, motif_hit_thr = dem_motif_hit_thr, max_bg_regions = dem_max_bg_regions, specie = species, genome_annotation = annot_dem, promoter_space = promoter_space, motif_annotation = annotation, motif_similarity_fdr = motif_similarity_fdr, path_to_motif_annotations = path_to_motif_annotations, n_cpu = n_cpu, annotation_version = annotation_version, tmp_dir = save_path, _temp_dir= _temp_dir, **kwargs) out_folder = os.path.join(save_path,'DEM_'+key+'_All') check_folder = os.path.isdir(out_folder) if not check_folder: os.makedirs(out_folder) log.info("Created folder : "+ out_folder) else: log.info(out_folder + " folder already exists.") for x in menr['DEM_'+key+'_All'].motif_enrichment.keys(): out_file = os.path.join(out_folder, str(x) +'.html') menr['DEM_'+key+'_All'].motif_enrichment[str(x)].to_html(open(out_file, 'w'), escape=False, col_space=80) if(save_partial): with open(os.path.join(save_path, 'DEM_'+key+'_All'+'.pkl'), 'wb') as f: dill.dump(menr['DEM_'+key+'_All'], f, protocol=-1) if run_without_promoters is True: log.info('Running DEM without promoters for '+key) ## REMOVE PROMOTERS regions_overlaps = {key: regions[key].count_overlaps(annot) for key in regions.keys()} regions_np = {key: regions_overlaps[key][regions_overlaps[key].NumberOverlaps == 0][['Chromosome', 'Start', 'End']] for key in regions.keys()} db_regions = set(pd.concat([dem_db.regions_to_db[x] for x in dem_db.regions_to_db.keys()])['Query']) dem_db.regions_to_db = {x: target_to_query(regions_np[x], list(db_regions), fraction_overlap = 0.4) for x in regions_np.keys()} menr['DEM_'+key+'_No_promoters'] = DEM(dem_db = dem_db, region_sets = regions_np, log2fc_thr = dem_log2fc_thr, motif_hit_thr = dem_motif_hit_thr, max_bg_regions = dem_max_bg_regions, specie = species, promoter_space = promoter_space, motif_annotation = annotation, motif_similarity_fdr = motif_similarity_fdr, path_to_motif_annotations = path_to_motif_annotations, n_cpu = n_cpu, annotation_version = annotation_version, tmp_dir = save_path, _temp_dir= _temp_dir, **kwargs) out_folder = os.path.join(save_path,'DEM_'+key+'_No_promoters') check_folder = os.path.isdir(out_folder) if not check_folder: os.makedirs(out_folder) log.info("Created folder : "+ out_folder) else: log.info(out_folder + " folder already exists.") for x in menr['DEM_'+key+'_No_promoters'].motif_enrichment.keys(): out_file = os.path.join(out_folder, str(x) +'.html') menr['DEM_'+key+'_No_promoters'].motif_enrichment[str(x)].to_html(open(out_file, 'w'), escape=False, col_space=80) if(save_partial): with open(os.path.join(save_path, 'DEM_'+key+'_No_promoters'+'.pkl'), 'wb') as f: dill.dump(menr['DEM_'+key+'_All'], f, protocol=-1) log.info('Saving object') with open(os.path.join(save_path,'menr.pkl'), 'wb') as f: dill.dump(menr, f, protocol=-1) log.info('Finished! Took {} minutes'.format((time.time() - start_time)/60))