Source code for scenicplus.diff_features
"""Calculate differentially expressed genes (DEGs) and differentially accessible regions (DARs).
"""
import anndata
import scanpy as sc
from typing import Optional, List
import logging
import numpy as np
import pandas as pd
import sys
from .scenicplus_class import SCENICPLUS
pd.options.mode.chained_assignment = None
def _format_df(df, key, adjpval_thr, log2fc_thr):
"""
A helper function to format differential test results
"""
df.index = df['names']
df = df[['logfoldchanges', 'pvals_adj']]
df.columns = ['Log2FC', 'Adjusted_pval']
df['Contrast'] = key
df.index.name = None
df = df.loc[df['Adjusted_pval'] <= adjpval_thr]
df = df.loc[df['Log2FC'] >= log2fc_thr]
df = df.sort_values(
['Log2FC', 'Adjusted_pval'], ascending=[False, True]
)
return df
[docs]def get_differential_features(scplus_obj: SCENICPLUS,
variable,
use_hvg: Optional[bool] = True,
contrast_type: Optional[List] = ['DARs', 'DEGs'],
adjpval_thr: Optional[float] = 0.05,
log2fc_thr: Optional[float] = np.log2(1.5),
min_cells: Optional[int] = 2
):
"""
Get DARs of DEGs given reference variable.
Parameters
---------
scplus_obj: `class::SCENICPLUS`
A SCENICPLUS object.
variable: str
Variable to compute DARs/DEGs by (has to be included in scplus_obj.metadata_cell)
use_hvg: bool, optional
Whether to use only highly variable genes/regions
contrast_type: list, optional
Wheter to compute DARs and/or DEGs per variable
adjpval_thr: float, optional
P-value threshold
log2fc_thr
Log2FC threshold
"""
# Create logger
level = logging.INFO
log_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
handlers = [logging.StreamHandler(stream=sys.stdout)]
logging.basicConfig(level=level, format=log_format, handlers=handlers)
log = logging.getLogger('SCENIC+')
for contrast in contrast_type:
log.info('Calculating ' + contrast + ' for variable ' + variable)
if contrast == 'DEGs':
adata = anndata.AnnData(X=scplus_obj.X_EXP.copy(), obs=pd.DataFrame(
index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.gene_names))
min_disp = 0.5
if contrast == 'DARs':
adata = anndata.AnnData(X=scplus_obj.X_ACC.copy().T, obs=pd.DataFrame(
index=scplus_obj.cell_names), var=pd.DataFrame(index=scplus_obj.region_names))
min_disp = 0.05
adata.obs = scplus_obj.metadata_cell
# remove annotations with less than 'min_cells'
label_count = adata.obs[variable].value_counts()
keeplabels = [label for label, count in zip(label_count.index, label_count.values) if count >= min_cells]
keepcellids = [cellid for cellid in adata.obs.index if adata.obs[variable][cellid] in keeplabels]
adata = adata[keepcellids]
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
if use_hvg:
sc.pp.highly_variable_genes(
adata, min_mean=0.0125, max_mean=3, min_disp=min_disp, max_disp=np.inf)
var_features = adata.var.highly_variable[adata.var.highly_variable].index.tolist(
)
adata = adata[:, var_features]
log.info('There are ' + str(len(var_features)) +
' variable features')
sc.tl.rank_genes_groups(
adata, variable, method='wilcoxon', corr_method='bonferroni')
groups = adata.uns['rank_genes_groups']['names'].dtype.names
diff_dict = {group: _format_df(sc.get.rank_genes_groups_df(
adata, group=group), group, adjpval_thr, log2fc_thr) for group in groups}
if contrast not in scplus_obj.uns.keys():
scplus_obj.uns[contrast] = {}
scplus_obj.uns[contrast][variable] = diff_dict
log.info('Finished calculating ' + contrast +
' for variable ' + variable)