aertslab · DanieFD · Mar 25, 2024 · Apr 9, 2024 · SeppeDeWinter · Mar 26, 2024
diff --git a/src/scenicplus/cli/commands.py b/src/scenicplus/cli/commands.py
@@ -366,19 +366,38 @@ def run_motif_enrichment_dem(
 
 def prepare_motif_enrichment_results(
         paths_to_motif_enrichment_results: List[str],
+
         multiome_mudata_fname: pathlib.Path,
         out_file_direct_annotation: pathlib.Path,
         out_file_extended_annotation: pathlib.Path,
         out_file_tf_names: pathlib.Path,
+
         direct_annotation: List[str],
-        extended_annotation: List[str]) -> None:
+        extended_annotation: List[str],
+        path_to_regions_to_subset: Optional[str] = None) -> None:
     from scenicplus.data_wrangling.cistarget_wrangling import get_and_merge_cistromes
+    from scenicplus.utils import target_to_overlapping_query, coord_to_region_names
+    from scenicplus.utils import region_names_to_coordinates
     log.info("Reading multiome MuData.")
     mdata = mudata.read(multiome_mudata_fname.__str__())
     log.info("Getting cistromes.")
+    regions = set(mdata['scATAC'].var_names)
+    regions_to_overlap = set(mdata['scATAC'].var_names)
+    #pr_regions = pr.PyRanges(region_names_to_coordinates(regions))
+    if path_to_regions_to_subset is not None:
+        log.info("Subsetting regions from cistromes.")
+        if path_to_regions_to_subset.endswith(".bed"):
+            subset = pr.read_bed(path_to_regions_to_subset, 
+                                as_df=False)
+            pr_regions = pr.PyRanges(region_names_to_coordinates(regions))
+            regions_to_overlap = target_to_overlapping_query(pr_regions, subset) #regions_to_overlap will be a pyranges df
+            # transform regions to overlap to set of regions
+            regions_to_overlap = set(coord_to_region_names(regions_to_overlap))                   
+
     adata_direct_cistromes, adata_extended_cistromes = get_and_merge_cistromes(
         paths_to_motif_enrichment_results=paths_to_motif_enrichment_results,
-        scplus_regions=set(mdata['scATAC'].var_names),
+        scplus_regions=regions,
+        subset_regions = regions_to_overlap, #could be the set of all ATAC regions or a subset if subset is not None
         direct_annotation=direct_annotation,
         extended_annotation=extended_annotation)
     # Get transcription factor names from cistromes

diff --git a/src/scenicplus/cli/scenicplus.py b/src/scenicplus/cli/scenicplus.py
@@ -117,6 +117,7 @@ def prepare_menr_data(arg):
         prepare_motif_enrichment_results(
             paths_to_motif_enrichment_results=arg.paths_to_motif_enrichment_results,
             multiome_mudata_fname=arg.multiome_mudata_fname,
+            path_to_regions_to_subset=arg.path_to_regions_to_subset,
             out_file_direct_annotation=arg.out_file_direct_annotation,
             out_file_extended_annotation=arg.out_file_extended_annotation,
             out_file_tf_names=arg.out_file_tf_names,
@@ -132,6 +133,10 @@ def prepare_menr_data(arg):
         "--multiome_mudata_fname", dest="multiome_mudata_fname",
         action="store", type=pathlib.Path, required=True,
         help="Path to multiome MuData object (from scenicplus prepare_GEX_ACC).")
+    parser.add_argument(
+        "--path_to_regions_to_subset", dest="path_to_regions_to_subset",
+        action="store", type=str, required=False,default ="",
+        help="Path to bed file for regions to subset when merging cistromes (MACS called peaks).")
     parser.add_argument(
         "--out_file_tf_names", dest="out_file_tf_names",
         action="store", type=pathlib.Path, required=True,

diff --git a/src/scenicplus/data_wrangling/cistarget_wrangling.py b/src/scenicplus/data_wrangling/cistarget_wrangling.py
@@ -10,7 +10,7 @@
 from pycistarget.utils import get_TF_list, get_motifs_per_TF
 from typing import Set, Dict, List, Iterable, Tuple
 from dataclasses import dataclass
-
+from scenicplus.utils import region_names_to_coordinates
 
 @dataclass
 class Cistrome:
@@ -61,6 +61,7 @@ def _get_cistromes(
         motif_enrichment_table: pd.DataFrame,
         motif_hits: Dict[str, str],
         scplus_regions: Set[str],
+        subset_regions: Set[str],
         direct_annotation: List[str],
         extended_annotation: List[str]) -> List[Cistrome]:
     """
@@ -74,6 +75,8 @@ def _get_cistromes(
             dict of motif hits (mapping motifs to regions)
         scplus_regions:
             set of regions in the scplus_obj
+        subset_regions:
+            set of regions to subset cistromes
         direct_annotation: 
             list of annotations to use as 'direct'
         extended_annotation: 
@@ -102,11 +105,12 @@ def _get_cistromes(
                 target_regions_motif_direct.update(motif_hits[motif])
             else:
                 raise ValueError(f"Motif enrichment table and motif hits don't match for the TF: {tf_name}")
+
         cistromes.append(
             Cistrome(
                 tf_name = tf_name,
                 motifs = set(motifs_annotated_to_tf),
-                target_regions = target_regions_motif_direct & scplus_regions,
+                target_regions = _overlap_if_necessary(target_regions_motif_direct,  scplus_regions, subset_regions),
                 extended = False))
     for tf_name in tfs_extended:
         motifs_annotated_to_tf = get_motifs_per_TF(
@@ -120,14 +124,24 @@ def _get_cistromes(
                 target_regions_motif_extended.update(motif_hits[motif])
             else:
                 raise ValueError(f"Motif enrichment table and motif hits don't match for the TF: {tf_name}")
+
         cistromes.append(
             Cistrome(
                 tf_name = tf_name,
                 motifs = set(motifs_annotated_to_tf),
-                target_regions = target_regions_motif_extended & scplus_regions,
+                target_regions =  _overlap_if_necessary(target_regions_motif_extended,  scplus_regions, subset_regions),
                 extended = True))
     return cistromes
 
+def _overlap_if_necessary(s_query_regions, test_regions, regions_to_overlap):
+    """Return the cistrome regions as a set of region names: unchanged when all of them are in `test_regions`, otherwise the result of overlapping them against `regions_to_overlap`."""
+    if s_query_regions <= test_regions:  # every query region is already a known region name, nothing to map
+        return s_query_regions
+    from scenicplus.utils import coord_to_region_names, target_to_overlapping_query  # not among the module-level imports added by this patch
+    overlapping = target_to_overlapping_query(pr.PyRanges(region_names_to_coordinates(regions_to_overlap)), pr.PyRanges(region_names_to_coordinates(s_query_regions)))
+    # the overlap result is a PyRanges (cli/commands.py converts it the same way); turn it back into region names so both branches return a set
+    return set(coord_to_region_names(overlapping))
+
 def _merge_cistromes(cistromes: List[Cistrome]) -> Iterable[Cistrome]:
     a_cistromes = np.array(cistromes, dtype = 'object')
     tf_names = np.array([cistrome.tf_name for cistrome in a_cistromes])
@@ -173,6 +187,7 @@ def _cistromes_to_adata(cistromes: List[Cistrome]) -> anndata.AnnData:
 def get_and_merge_cistromes(
         paths_to_motif_enrichment_results: List[str],
         scplus_regions: Set[str],
+        subset_regions: Set[str],
         direct_annotation: List[str] = ['Direct_annot'],
         extended_annotation: List[str] = ['Orthology_annot']
         ) -> Tuple[anndata.AnnData, anndata.AnnData]:
@@ -184,6 +199,8 @@ def get_and_merge_cistromes(
         A list of paths to motif enrichment results generated by pycistarget
     scplus_regions: Set[str]
         A set of regions to be used in the SCENIC+ analysis
+    subset_regions:
+        A set of regions to subset when merging cistromes
     direct_annotation: List[str] = ['Direct_annot']
         A list of annotations to use as annotations with direct evidence
     extended_annotation: List[str] = ['Orthology_annot']
@@ -204,6 +221,7 @@ def get_and_merge_cistromes(
                 motif_enrichment_table = motif_enrichment_table,
                 motif_hits = motif_hits,
                 scplus_regions = scplus_regions,
+                subset_regions = subset_regions, 
                 direct_annotation = direct_annotation,
                 extended_annotation = extended_annotation))
     # merge cistromes. Seperatly for direct and extended

diff --git a/src/scenicplus/plotting/dotplot.py b/src/scenicplus/plotting/dotplot.py
@@ -124,7 +124,7 @@ def heatmap_dotplot(
     # Plotting
     plotnine.options.figure_size = figsize
     plotting_df["repressor_activator"] = [
-            "activator" if "+" in n.split("_")[2] else "repressor" for n in plotting_df[feature_name_key]]
+            "repressor" if "-" in n.split("_")[2] else "activator" for n in plotting_df[feature_name_key]]
     if split_repressor_activator and len(set(plotting_df["repressor_activator"])) == 2:
         if orientation == 'vertical':
             plot = (

diff --git a/src/scenicplus/snakemake/Snakefile b/src/scenicplus/snakemake/Snakefile
@@ -204,6 +204,7 @@ rule prepare_menr:
         cistromes_direct=config["output_data"]["cistromes_direct"],
         cistromes_extended=config["output_data"]["cistromes_extended"]
     params:
+        path_to_regions_to_subset=lambda wildcards: config["params_data_preparation"].get("path_to_regions_to_subset") or None,  # key lives under params_data_preparation, not top-level config; unset/empty -> None
         direct_annotation=lambda wildcards: config["params_data_preparation"]["direct_annotation"],
         extended_annotation=lambda wildcards: config["params_data_preparation"]["extended_annotation"]
     shell:
@@ -214,6 +215,7 @@ rule prepare_menr:
             --out_file_tf_names {output.tf_names} \
             --out_file_direct_annotation {output.cistromes_direct} \
             --out_file_extended_annotation {output.cistromes_extended} \
+            --path_to_regions_to_subset {params.path_to_regions_to_subset} \
             --direct_annotation {params.direct_annotation} \
             --extended_annotation {params.extended_annotation}
         """

diff --git a/src/scenicplus/snakemake/config.yaml b/src/scenicplus/snakemake/config.yaml
@@ -6,6 +6,7 @@ input_data:
   dem_db_fname: ""
   path_to_motif_annotations: ""
 
+
 output_data:
   # output for prepare_GEX_ACC .h5mu
   combined_GEX_ACC_mudata: "ACC_GEX.h5mu"
@@ -50,6 +51,7 @@ params_data_preparation:
   key_to_group_by: ""
   nr_cells_per_metacells: 10
   # Params for prepare_menr
+  path_to_regions_to_subset: ""
   direct_annotation: "Direct_annot"
   extended_annotation: "Orthology_annot"
   # Params for download_genome_annotations