# Source code for scenicplus.preprocessing.filtering
"""Filter outlier genes and regions.
"""
from ..scenicplus_class import SCENICPLUS
from ..eregulon_enrichment import get_eRegulons_as_signatures
import numpy as np
import logging
import sys
# Module-level logging setup; these statements run when the module is imported.
level = logging.INFO
# NOTE: this module-level name shadows the builtin `format` within this module.
format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
# Log records are written to stdout.
handlers = [logging.StreamHandler(stream=sys.stdout)]
logging.basicConfig(level=level, format=format, handlers=handlers)
# Logger used by the filtering functions in this module.
log = logging.getLogger('Preprocessing')
def filter_genes(SCENICPLUS_obj: SCENICPLUS,
                 min_pct: int = 0,
                 max_pct: int = 100,
                 return_copy=False) -> SCENICPLUS:
    """
    Subset a SCENIC+ object to genes detected in a given share of cells.

    Parameters
    ----------
    SCENICPLUS_obj
        An instance of :class:`~scenicplus.scenicplus_class.SCENICPLUS`.
    min_pct
        Lower bound, as a percentage. A gene is kept only when the fraction of
        cells with a value above zero is strictly greater than
        ``min_pct / 100``.
        default: 0
    max_pct
        Upper bound, as a percentage. A gene is kept only when the fraction of
        cells with a value above zero is strictly smaller than
        ``max_pct / 100``.
        default: 100
    return_copy
        Forwarded to ``SCENICPLUS_obj.subset``. When truthy, the value returned
        by ``subset`` is returned to the caller; otherwise this function
        returns None.
        default: False

    Returns
    -------
    The result of ``SCENICPLUS_obj.subset`` when `return_copy` is truthy,
    otherwise None.
    """
    # Axis 0 is summed and divided by n_cells, which implies X_EXP is laid out
    # as cells x genes -- TODO confirm against the SCENICPLUS class.
    is_expressed = SCENICPLUS_obj.X_EXP > 0
    fraction_expressing = np.array(
        is_expressed.sum(0) / SCENICPLUS_obj.n_cells).flatten()

    lower_bound = min_pct / 100
    upper_bound = max_pct / 100
    within_bounds = np.logical_and(fraction_expressing > lower_bound,
                                   fraction_expressing < upper_bound)
    genes_to_keep = SCENICPLUS_obj.gene_names[within_bounds]

    log.info(
        f'Going from {SCENICPLUS_obj.n_genes} genes to {len(genes_to_keep)} genes.')

    subset_result = SCENICPLUS_obj.subset(
        genes=genes_to_keep, return_copy=return_copy)
    if not return_copy:
        return None
    return subset_result
def filter_regions(SCENICPLUS_obj: SCENICPLUS,
                   min_pct: int = 0,
                   max_pct: int = 100,
                   return_copy=False) -> SCENICPLUS:
    """
    Subset a SCENIC+ object to regions accessible in a given share of cells.

    Parameters
    ----------
    SCENICPLUS_obj
        An instance of :class:`~scenicplus.scenicplus_class.SCENICPLUS`.
    min_pct
        Lower bound, as a percentage. A region is kept only when the fraction
        of cells with a value above zero is strictly greater than
        ``min_pct / 100``.
        default: 0
    max_pct
        Upper bound, as a percentage. A region is kept only when the fraction
        of cells with a value above zero is strictly smaller than
        ``max_pct / 100``.
        default: 100
    return_copy
        Forwarded to ``SCENICPLUS_obj.subset``. When truthy, the value returned
        by ``subset`` is returned to the caller; otherwise this function
        returns None.
        default: False

    Returns
    -------
    The result of ``SCENICPLUS_obj.subset`` when `return_copy` is truthy,
    otherwise None.
    """
    # Axis 1 is summed and divided by n_cells, which implies X_ACC is laid out
    # as regions x cells -- TODO confirm against the SCENICPLUS class.
    is_accessible = SCENICPLUS_obj.X_ACC > 0
    fraction_accessible = np.array(
        is_accessible.sum(1) / SCENICPLUS_obj.n_cells).flatten()

    lower_bound = min_pct / 100
    upper_bound = max_pct / 100
    within_bounds = np.logical_and(fraction_accessible > lower_bound,
                                   fraction_accessible < upper_bound)
    regions_to_keep = SCENICPLUS_obj.region_names[within_bounds]

    log.info(
        f'Going from {SCENICPLUS_obj.n_regions} regions to {len(regions_to_keep)} regions.')

    subset_result = SCENICPLUS_obj.subset(
        regions=regions_to_keep, return_copy=return_copy)
    if not return_copy:
        return None
    return subset_result
def _merge_signatures_by_prefix(signatures, unit):
    """Merge signatures that share a name prefix and relabel them with their size.

    The prefix of a signature name is everything before the first ``'_('``
    with the last two characters removed (e.g. ``'TF_+_+_(10g)'`` ->
    ``'TF_+'``). Members of all signatures sharing a prefix are pooled and
    de-duplicated; first-seen order is preserved so the result is
    deterministic. The new key is ``'<prefix>_(<n><unit>)'``.
    """
    pooled = {}
    for name, members in signatures.items():
        prefix = name.split('_(')[0][:-2]
        # A dict is used as an insertion-ordered set of members.
        pooled.setdefault(prefix, {}).update(dict.fromkeys(members))
    return {
        f'{prefix}_({len(members)}{unit})': list(members)
        for prefix, members in pooled.items()
    }


def simplify_eregulon(scplus_obj, eRegulon_signatures_key):
    """
    Collapse eRegulon signatures that differ only in the trailing sign.

    Both ``scplus_obj.uns[eRegulon_signatures_key]['Gene_based']`` and
    ``['Region_based']`` are replaced, in that order, by dictionaries in which
    signatures sharing the same name prefix are merged into one entry. The
    new keys end in ``_(<n>g)`` for genes and ``_(<n>r)`` for regions, where
    ``<n>`` is the number of unique members after merging.

    Parameters
    ----------
    scplus_obj
        Object exposing a ``uns`` mapping that holds the signatures.
    eRegulon_signatures_key
        Key in ``scplus_obj.uns`` under which the signatures are stored.

    Returns
    -------
    None. ``scplus_obj.uns[eRegulon_signatures_key]`` is modified in place.
    """
    signatures = scplus_obj.uns[eRegulon_signatures_key]
    for kind, unit in (('Gene_based', 'g'), ('Region_based', 'r')):
        signatures[kind] = _merge_signatures_by_prefix(signatures[kind], unit)
def remove_second_sign(x):
    """
    Drop the second sign from an eRegulon signature name.

    The name is expected to end in ``_<first sign>_<second sign>_<size>``,
    e.g. ``'TF_+_+_(10g)'`` or ``'TF_extended_+_-_(10g)'``. Everything before
    those three trailing fields is kept verbatim, so TF names that themselves
    contain underscores are handled.

    Parameters
    ----------
    x
        Signature name.

    Returns
    -------
    The name with the second sign removed, e.g. ``'TF_+_(10g)'``.

    Raises
    ------
    ValueError
        If `x` has fewer than four underscore-separated fields.
    """
    # Split from the right: only the last three fields have a fixed meaning.
    head, first, _second, n = x.rsplit('_', 3)
    return f'{head}_{first}_{n}'
def apply_std_filtering_to_eRegulons(scplus_obj):
    """
    Apply the standard eRegulon filtering and store the results in ``uns``.

    Steps, in order:

    1. Keep only rows of ``scplus_obj.uns['eRegulon_metadata']`` with
       ``R2G_rho > 0``.
    2. Add a ``Consensus_name`` column (``<TF>_<TF2G sign>_<R2G sign>``) and
       drop extended eRegulons for which a direct eRegulon with the same
       consensus name exists. The result is stored in
       ``scplus_obj.uns['eRegulon_metadata_filtered']``.
    3. Call ``get_eRegulons_as_signatures`` on the filtered metadata
       (``key_added='eRegulon_signatures_filtered'``) and simplify the
       resulting signatures with ``simplify_eregulon``.
    4. Remove the second sign from the ``Gene_signature_name`` and
       ``Region_signature_name`` columns of the filtered metadata.

    Parameters
    ----------
    scplus_obj
        Object exposing a ``uns`` mapping that contains an
        ``'eRegulon_metadata'`` DataFrame with at least the columns ``TF``,
        ``TF2G_rho``, ``R2G_rho``, ``is_extended``, ``Gene_signature_name``
        and ``Region_signature_name``.

    Returns
    -------
    None. ``scplus_obj.uns`` is modified in place.
    """
    ## only keep positive R2G
    print("Only keeping positive R2G")
    # Explicit copy: columns are added below, and assigning on the result of
    # a filter operation is the chained-assignment hazard pandas warns about.
    metadata = scplus_obj.uns['eRegulon_metadata'].query('R2G_rho > 0').copy()

    ## only keep extended if no direct
    print("Only keep extended if not direct")
    metadata['Consensus_name'] = [
        f"{tf}_{'+' if tf2g_rho > 0 else '-'}_{'+' if r2g_rho > 0 else '-'}"
        for tf, tf2g_rho, r2g_rho in zip(
            metadata['TF'], metadata['TF2G_rho'], metadata['R2G_rho'])
    ]

    # The flag is compared as text so that both the strings "True"/"False"
    # and real booleans are recognised.
    extended_flag = metadata['is_extended'].astype(str)
    is_direct = extended_flag == "False"
    is_extended = extended_flag == "True"

    eRegulons_direct = set(metadata.loc[is_direct, 'Consensus_name'])
    eRegulons_extended = set(metadata.loc[is_extended, 'Consensus_name'])
    extended_not_direct = list(eRegulons_extended - eRegulons_direct)

    keep = np.logical_or(
        np.logical_and(
            is_extended,
            np.isin(metadata['Consensus_name'], extended_not_direct)),
        is_direct)
    # Copy again so the signature-name columns can be rewritten safely later.
    scplus_obj.uns['eRegulon_metadata_filtered'] = metadata.loc[keep].copy()

    print("Getting signatures...")
    get_eRegulons_as_signatures(scplus_obj,
                                eRegulon_metadata_key='eRegulon_metadata_filtered',
                                key_added='eRegulon_signatures_filtered')
    print("Simplifying eRegulons ...")
    simplify_eregulon(scplus_obj, 'eRegulon_signatures_filtered')

    # Re-read from ``uns`` rather than reusing the local frame, in case the
    # calls above replaced the stored object.
    filtered = scplus_obj.uns['eRegulon_metadata_filtered']
    for column in ('Gene_signature_name', 'Region_signature_name'):
        filtered[column] = [remove_second_sign(name) for name in filtered[column]]