Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Subset regions from cistromes params and fix dotplot repressors #337

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 21 additions & 2 deletions src/scenicplus/cli/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,19 +366,38 @@ def run_motif_enrichment_dem(

def prepare_motif_enrichment_results(
paths_to_motif_enrichment_results: List[str],

multiome_mudata_fname: pathlib.Path,
out_file_direct_annotation: pathlib.Path,
out_file_extended_annotation: pathlib.Path,
out_file_tf_names: pathlib.Path,

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any particular reason why you added blank lines between the parameters? Otherwise I would delete them.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, no reason. I will delete them.

direct_annotation: List[str],
extended_annotation: List[str]) -> None:
extended_annotation: List[str],
path_to_regions_to_subset: Optional[str] = None) -> None:
from scenicplus.data_wrangling.cistarget_wrangling import get_and_merge_cistromes
from scenicplus.utils import target_to_overlapping_query, coord_to_region_names
from scenicplus.utils import region_names_to_coordinates
log.info("Reading multiome MuData.")
mdata = mudata.read(multiome_mudata_fname.__str__())
log.info("Getting cistromes.")
regions = set(mdata['scATAC'].var_names)
regions_to_overlap = set(mdata['scATAC'].var_names)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would move this within the if statement, see next comment as well.

#pr_regions = pr.PyRanges(region_names_to_coordinates(regions))
if path_to_regions_to_subset is not None:
log.info("Subsetting regions from cistromes.")
if path_to_regions_to_subset.endswith(".bed"):
subset = pr.read_bed(path_to_regions_to_subset,
as_df=False)
pr_regions = pr.PyRanges(region_names_to_coordinates(regions))
regions_to_overlap = target_to_overlapping_query(pr_regions, subset) #regions_to_overlap will be a pyranges df
# transform regions to overlap to set of regions
regions_to_overlap = set(coord_to_region_names(regions_to_overlap))

adata_direct_cistromes, adata_extended_cistromes = get_and_merge_cistromes(
paths_to_motif_enrichment_results=paths_to_motif_enrichment_results,
scplus_regions=set(mdata['scATAC'].var_names),
scplus_regions=regions,
subset_regions = regions_to_overlap, #could be the set of all ATAC regions or a subset if subset is not None
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would move this function call within the if statement block.

if path_to_regions_to_subset is not None:
    [...]
    adata_direct_cistromes, adata_extended_cistromes = get_and_merge_cistromes(
        [...],
        subset_regions = regions_to_overlap
    )
else:
    adata_direct_cistromes, adata_extended_cistromes = get_and_merge_cistromes(
        [...],
    )

direct_annotation=direct_annotation,
extended_annotation=extended_annotation)
# Get transcription factor names from cistromes
Expand Down
5 changes: 5 additions & 0 deletions src/scenicplus/cli/scenicplus.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ def prepare_menr_data(arg):
prepare_motif_enrichment_results(
paths_to_motif_enrichment_results=arg.paths_to_motif_enrichment_results,
multiome_mudata_fname=arg.multiome_mudata_fname,
path_to_regions_to_subset=arg.path_to_regions_to_subset,
out_file_direct_annotation=arg.out_file_direct_annotation,
out_file_extended_annotation=arg.out_file_extended_annotation,
out_file_tf_names=arg.out_file_tf_names,
Expand All @@ -132,6 +133,10 @@ def prepare_menr_data(arg):
"--multiome_mudata_fname", dest="multiome_mudata_fname",
action="store", type=pathlib.Path, required=True,
help="Path to multiome MuData object (from scenicplus prepare_GEX_ACC).")
parser.add_argument(
"--path_to_regions_to_subset", dest="path_to_regions_to_subset",
action="store", type=str, required=False,default ="",
help="Path to bed file for regions to subset when merging cistromes (MACS called peaks).")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The regions can come from any BED file, right? They don't necessarily need to be peaks called by MACS? If so, I would remove "(MACS called peaks)" from the documentation to avoid confusion.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

True, it could be removed, or I could add "for example" in front of it. I will fix it.

parser.add_argument(
"--out_file_tf_names", dest="out_file_tf_names",
action="store", type=pathlib.Path, required=True,
Expand Down
24 changes: 21 additions & 3 deletions src/scenicplus/data_wrangling/cistarget_wrangling.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from pycistarget.utils import get_TF_list, get_motifs_per_TF
from typing import Set, Dict, List, Iterable, Tuple
from dataclasses import dataclass

from scenicplus.utils import region_names_to_coordinates

@dataclass
class Cistrome:
Expand Down Expand Up @@ -61,6 +61,7 @@ def _get_cistromes(
motif_enrichment_table: pd.DataFrame,
motif_hits: Dict[str, str],
scplus_regions: Set[str],
subset_regions: Set[str],
direct_annotation: List[str],
extended_annotation: List[str]) -> List[Cistrome]:
"""
Expand All @@ -74,6 +75,8 @@ def _get_cistromes(
dict of motif hits (mapping motifs to regions)
scplus_regions:
set of regions in the scplus_obj
subset_regions:
set of regions to subset cistromes
direct_annotation:
list of annotations to use as 'direct'
extended_annotation:
Expand Down Expand Up @@ -102,11 +105,12 @@ def _get_cistromes(
target_regions_motif_direct.update(motif_hits[motif])
else:
raise ValueError(f"Motif enrichment table and motif hits don't match for the TF: {tf_name}")

cistromes.append(
Cistrome(
tf_name = tf_name,
motifs = set(motifs_annotated_to_tf),
target_regions = target_regions_motif_direct & scplus_regions,
target_regions = _overlap_if_necessary(target_regions_motif_direct, scplus_regions, subset_regions),
extended = False))
for tf_name in tfs_extended:
motifs_annotated_to_tf = get_motifs_per_TF(
Expand All @@ -120,14 +124,24 @@ def _get_cistromes(
target_regions_motif_extended.update(motif_hits[motif])
else:
raise ValueError(f"Motif enrichment table and motif hits don't match for the TF: {tf_name}")

cistromes.append(
Cistrome(
tf_name = tf_name,
motifs = set(motifs_annotated_to_tf),
target_regions = target_regions_motif_extended & scplus_regions,
target_regions = _overlap_if_necessary(target_regions_motif_extended, scplus_regions, subset_regions),
extended = True))
return cistromes

def _overlap_if_necessary(s_query_regions, test_regions, regions_to_overlap):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If I'm not mistaken this function returns either a set (else block) or a pyranges object (if block), while the Cistrome class expects a set for the target_regions parameter.

Also, I would not call this function while passing the arguments to the constructor, so that these two things are clearly separated (i.e. 1. initiating the class and 2. putting the subset regions in the same coordinate system as SCENIC+).

#s_query_regions = set(coord_to_region_names(d[k]))
#if the signature regions are already in the scplus_obj coordinate system, do nothing, otherwise overlap
if len(s_query_regions & test_regions) != len(s_query_regions):
signature_regions = target_to_overlapping_query(pr.PyRanges(region_names_to_coordinates(regions_to_overlap)), pr.PyRanges(region_names_to_coordinates(s_query_regions)))
else:
signature_regions = s_query_regions
return signature_regions

def _merge_cistromes(cistromes: List[Cistrome]) -> Iterable[Cistrome]:
a_cistromes = np.array(cistromes, dtype = 'object')
tf_names = np.array([cistrome.tf_name for cistrome in a_cistromes])
Expand Down Expand Up @@ -173,6 +187,7 @@ def _cistromes_to_adata(cistromes: List[Cistrome]) -> anndata.AnnData:
def get_and_merge_cistromes(
paths_to_motif_enrichment_results: List[str],
scplus_regions: Set[str],
subset_regions: Set[str],
direct_annotation: List[str] = ['Direct_annot'],
extended_annotation: List[str] = ['Orthology_annot']
) -> Tuple[anndata.AnnData, anndata.AnnData]:
Expand All @@ -184,6 +199,8 @@ def get_and_merge_cistromes(
A list of paths to motif enrichment results generated by pycistarget
scplus_regions: Set[str]
A set of regions to be used in the SCENIC+ analysis
subset_regions:
A set of regions to subset when merging cistromes
direct_annotation: List[str] = ['Direct_annot']
A list of annotations to use as annotations with direct evidence
extended_annotation: List[str] = ['Orthology_annot']
Expand All @@ -204,6 +221,7 @@ def get_and_merge_cistromes(
motif_enrichment_table = motif_enrichment_table,
motif_hits = motif_hits,
scplus_regions = scplus_regions,
subset_regions = subset_regions,
direct_annotation = direct_annotation,
extended_annotation = extended_annotation))
# merge cistromes. Seperatly for direct and extended
Expand Down
2 changes: 1 addition & 1 deletion src/scenicplus/plotting/dotplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def heatmap_dotplot(
# Plotting
plotnine.options.figure_size = figsize
plotting_df["repressor_activator"] = [
"activator" if "+" in n.split("_")[2] else "repressor" for n in plotting_df[feature_name_key]]
"repressor" if "-" in n.split("_")[2] else "activator" for n in plotting_df[feature_name_key]]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah yes, that was an obvious mistake on my part. Thanks!

if split_repressor_activator and len(set(plotting_df["repressor_activator"])) == 2:
if orientation == 'vertical':
plot = (
Expand Down
2 changes: 2 additions & 0 deletions src/scenicplus/snakemake/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ rule prepare_menr:
cistromes_direct=config["output_data"]["cistromes_direct"],
cistromes_extended=config["output_data"]["cistromes_extended"]
params:
path_to_regions_to_subset=lambda wildcards: config["params_data_preparation"]["path_to_regions_to_subset"] if 'path_to_regions_to_subset' in config and config["params_data_preparation"]["path_to_regions_to_subset"] != "" else None,
direct_annotation=lambda wildcards: config["params_data_preparation"]["direct_annotation"],
extended_annotation=lambda wildcards: config["params_data_preparation"]["extended_annotation"]
shell:
Expand All @@ -214,6 +215,7 @@ rule prepare_menr:
--out_file_tf_names {output.tf_names} \
--out_file_direct_annotation {output.cistromes_direct} \
--out_file_extended_annotation {output.cistromes_extended} \
--path_to_regions_to_subset {params.path_to_regions_to_subset} \
--direct_annotation {params.direct_annotation} \
--extended_annotation {params.extended_annotation}
"""
Expand Down
2 changes: 2 additions & 0 deletions src/scenicplus/snakemake/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ input_data:
dem_db_fname: ""
path_to_motif_annotations: ""


output_data:
# output for prepare_GEX_ACC .h5mu
combined_GEX_ACC_mudata: "ACC_GEX.h5mu"
Expand Down Expand Up @@ -50,6 +51,7 @@ params_data_preparation:
key_to_group_by: ""
nr_cells_per_metacells: 10
# Params for prepare_menr
path_to_regions_to_subset: ""
direct_annotation: "Direct_annot"
extended_annotation: "Orthology_annot"
# Params for download_genome_annotations
Expand Down