-
Notifications
You must be signed in to change notification settings - Fork 29
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Subset regions from cistromes params and fix dotplot repressors #337
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -366,19 +366,38 @@ def run_motif_enrichment_dem( | |
|
||
def prepare_motif_enrichment_results( | ||
paths_to_motif_enrichment_results: List[str], | ||
|
||
multiome_mudata_fname: pathlib.Path, | ||
out_file_direct_annotation: pathlib.Path, | ||
out_file_extended_annotation: pathlib.Path, | ||
out_file_tf_names: pathlib.Path, | ||
|
||
direct_annotation: List[str], | ||
extended_annotation: List[str]) -> None: | ||
extended_annotation: List[str], | ||
path_to_regions_to_subset: Optional[str] = None) -> None: | ||
from scenicplus.data_wrangling.cistarget_wrangling import get_and_merge_cistromes | ||
from scenicplus.utils import target_to_overlapping_query, coord_to_region_names | ||
from scenicplus.utils import region_names_to_coordinates | ||
log.info("Reading multiome MuData.") | ||
mdata = mudata.read(multiome_mudata_fname.__str__()) | ||
log.info("Getting cistromes.") | ||
regions = set(mdata['scATAC'].var_names) | ||
regions_to_overlap = set(mdata['scATAC'].var_names) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would move this within the if statement; see the next comment as well. |
||
#pr_regions = pr.PyRanges(region_names_to_coordinates(regions)) | ||
if path_to_regions_to_subset is not None: | ||
log.info("Subsetting regions from cistromes.") | ||
if path_to_regions_to_subset.endswith(".bed"): | ||
subset = pr.read_bed(path_to_regions_to_subset, | ||
as_df=False) | ||
pr_regions = pr.PyRanges(region_names_to_coordinates(regions)) | ||
regions_to_overlap = target_to_overlapping_query(pr_regions, subset) #regions_to_overlap will be a pyranges df | ||
# transform regions to overlap to set of regions | ||
regions_to_overlap = set(coord_to_region_names(regions_to_overlap)) | ||
|
||
adata_direct_cistromes, adata_extended_cistromes = get_and_merge_cistromes( | ||
paths_to_motif_enrichment_results=paths_to_motif_enrichment_results, | ||
scplus_regions=set(mdata['scATAC'].var_names), | ||
scplus_regions=regions, | ||
subset_regions = regions_to_overlap, #could be the set of all ATAC regions or a subset if subset is not None | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would move this function call within the if statement block:
if path_to_regions_to_subset is not None:
    [...]
    adata_direct_cistromes, adata_extended_cistromes = get_and_merge_cistromes(
        [...],
        subset_regions = regions_to_overlap
    )
else:
    adata_direct_cistromes, adata_extended_cistromes = get_and_merge_cistromes(
        [...],
    ) |
||
direct_annotation=direct_annotation, | ||
extended_annotation=extended_annotation) | ||
# Get transcription factor names from cistromes | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -117,6 +117,7 @@ def prepare_menr_data(arg): | |
prepare_motif_enrichment_results( | ||
paths_to_motif_enrichment_results=arg.paths_to_motif_enrichment_results, | ||
multiome_mudata_fname=arg.multiome_mudata_fname, | ||
path_to_regions_to_subset=arg.path_to_regions_to_subset, | ||
out_file_direct_annotation=arg.out_file_direct_annotation, | ||
out_file_extended_annotation=arg.out_file_extended_annotation, | ||
out_file_tf_names=arg.out_file_tf_names, | ||
|
@@ -132,6 +133,10 @@ def prepare_menr_data(arg): | |
"--multiome_mudata_fname", dest="multiome_mudata_fname", | ||
action="store", type=pathlib.Path, required=True, | ||
help="Path to multiome MuData object (from scenicplus prepare_GEX_ACC).") | ||
parser.add_argument( | ||
"--path_to_regions_to_subset", dest="path_to_regions_to_subset", | ||
action="store", type=str, required=False,default ="", | ||
help="Path to bed file for regions to subset when merging cistromes (MACS called peaks).") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. They can be any bed file, right? They don't necessarily need to be peaks called by MACS? If this is the case, I would remove "(MACS called peaks)". There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. True, it could be removed, or "for example" could be added. Will fix it. |
||
parser.add_argument( | ||
"--out_file_tf_names", dest="out_file_tf_names", | ||
action="store", type=pathlib.Path, required=True, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,7 +10,7 @@ | |
from pycistarget.utils import get_TF_list, get_motifs_per_TF | ||
from typing import Set, Dict, List, Iterable, Tuple | ||
from dataclasses import dataclass | ||
|
||
from scenicplus.utils import region_names_to_coordinates | ||
|
||
@dataclass | ||
class Cistrome: | ||
|
@@ -61,6 +61,7 @@ def _get_cistromes( | |
motif_enrichment_table: pd.DataFrame, | ||
motif_hits: Dict[str, str], | ||
scplus_regions: Set[str], | ||
subset_regions: Set[str], | ||
direct_annotation: List[str], | ||
extended_annotation: List[str]) -> List[Cistrome]: | ||
""" | ||
|
@@ -74,6 +75,8 @@ def _get_cistromes( | |
dict of motif hits (mapping motifs to regions) | ||
scplus_regions: | ||
set of regions in the scplus_obj | ||
subset_regions: | ||
set of regions to subset cistromes | ||
direct_annotation: | ||
list of annotations to use as 'direct' | ||
extended_annotation: | ||
|
@@ -102,11 +105,12 @@ def _get_cistromes( | |
target_regions_motif_direct.update(motif_hits[motif]) | ||
else: | ||
raise ValueError(f"Motif enrichment table and motif hits don't match for the TF: {tf_name}") | ||
|
||
cistromes.append( | ||
Cistrome( | ||
tf_name = tf_name, | ||
motifs = set(motifs_annotated_to_tf), | ||
target_regions = target_regions_motif_direct & scplus_regions, | ||
target_regions = _overlap_if_necessary(target_regions_motif_direct, scplus_regions, subset_regions), | ||
extended = False)) | ||
for tf_name in tfs_extended: | ||
motifs_annotated_to_tf = get_motifs_per_TF( | ||
|
@@ -120,14 +124,24 @@ def _get_cistromes( | |
target_regions_motif_extended.update(motif_hits[motif]) | ||
else: | ||
raise ValueError(f"Motif enrichment table and motif hits don't match for the TF: {tf_name}") | ||
|
||
cistromes.append( | ||
Cistrome( | ||
tf_name = tf_name, | ||
motifs = set(motifs_annotated_to_tf), | ||
target_regions = target_regions_motif_extended & scplus_regions, | ||
target_regions = _overlap_if_necessary(target_regions_motif_extended, scplus_regions, subset_regions), | ||
extended = True)) | ||
return cistromes | ||
|
||
def _overlap_if_necessary(s_query_regions, test_regions, regions_to_overlap): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If I'm not mistaken, this function returns either a set (else block) or a pyranges object (if block), while the Cistrome class expects a set for the target_regions field. Also, I would not run this function while providing the arguments, to clearly separate these two things (i.e. 1. initializing the class and 2. putting the subset regions in the same coordinate system as SCENIC+). |
||
#s_query_regions = set(coord_to_region_names(d[k])) | ||
#if the signature regions are already in the scplus_obj coordinate system, do nothing, otherwise overlap | ||
if len(s_query_regions & test_regions) != len(s_query_regions): | ||
signature_regions = target_to_overlapping_query(pr.PyRanges(region_names_to_coordinates(regions_to_overlap)), pr.PyRanges(region_names_to_coordinates(s_query_regions))) | ||
else: | ||
signature_regions = s_query_regions | ||
return signature_regions | ||
|
||
def _merge_cistromes(cistromes: List[Cistrome]) -> Iterable[Cistrome]: | ||
a_cistromes = np.array(cistromes, dtype = 'object') | ||
tf_names = np.array([cistrome.tf_name for cistrome in a_cistromes]) | ||
|
@@ -173,6 +187,7 @@ def _cistromes_to_adata(cistromes: List[Cistrome]) -> anndata.AnnData: | |
def get_and_merge_cistromes( | ||
paths_to_motif_enrichment_results: List[str], | ||
scplus_regions: Set[str], | ||
subset_regions: Set[str], | ||
direct_annotation: List[str] = ['Direct_annot'], | ||
extended_annotation: List[str] = ['Orthology_annot'] | ||
) -> Tuple[anndata.AnnData, anndata.AnnData]: | ||
|
@@ -184,6 +199,8 @@ def get_and_merge_cistromes( | |
A list of paths to motif enrichment results generated by pycistarget | ||
scplus_regions: Set[str] | ||
A set of regions to be used in the SCENIC+ analysis | ||
subset_regions: | ||
A set of regions to subset when merging cistromes | ||
direct_annotation: List[str] = ['Direct_annot'] | ||
A list of annotations to use as annotations with direct evidence | ||
extended_annotation: List[str] = ['Orthology_annot'] | ||
|
@@ -204,6 +221,7 @@ def get_and_merge_cistromes( | |
motif_enrichment_table = motif_enrichment_table, | ||
motif_hits = motif_hits, | ||
scplus_regions = scplus_regions, | ||
subset_regions = subset_regions, | ||
direct_annotation = direct_annotation, | ||
extended_annotation = extended_annotation)) | ||
# merge cistromes. Seperatly for direct and extended | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -124,7 +124,7 @@ def heatmap_dotplot( | |
# Plotting | ||
plotnine.options.figure_size = figsize | ||
plotting_df["repressor_activator"] = [ | ||
"activator" if "+" in n.split("_")[2] else "repressor" for n in plotting_df[feature_name_key]] | ||
"repressor" if "-" in n.split("_")[2] else "activator" for n in plotting_df[feature_name_key]] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah yes, that was an obvious mistake on my part. Thanks! |
||
if split_repressor_activator and len(set(plotting_df["repressor_activator"])) == 2: | ||
if orientation == 'vertical': | ||
plot = ( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there any particular reason why you added blank lines between the parameters? Otherwise I would delete them.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, no reason. I will delete them.