Skip to content

Commit

Permalink
Fix se with inputs (#190)
Browse files Browse the repository at this point in the history
* align output names for macs peaks if broad

* increase resources for attempts for macs

* remove blank row in out bed files

* revert macs option SE handling

* revert the PE options for macs

* peak calling design sorted

* fix peak control match

* amend config slurm queue

* fix get control for peaks

* lint design

* fix lanceotron no-input rule

* changes defaults in config

* update snp snakefile

* add snp to design and test initial

* update config for snp

* typo in tests

* fixed config for tests

* create common rules

* fix snp tests

* update envs

* fix inputs for se in design

* fix design to handle se and pe with or without inputs

* update spikein with input from stats files

* less stringent filtering of spike-in BAMs

* remove quality filter from spike in bam filter

* change peak calling method for atac to macs and remove from chip tests

* update fastq files to test

* move multiple peak calling test to atac

* update test data

* less stringent filtering for spikein

* put peak callers back to chip

* macs for atac only

* test peak callers on atac

---------

Co-authored-by: Alastair Smith <[email protected]>
  • Loading branch information
CChahrour and alsmith151 authored May 28, 2024
1 parent 4059dc1 commit ba75033
Show file tree
Hide file tree
Showing 6 changed files with 260 additions and 99 deletions.
86 changes: 80 additions & 6 deletions seqnado/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def setup_configuration(assay, genome, template_data):
genome_dict[genome] = {
"indices": genome_values[genome].get(
"star_indices" if assay in ["rna"] else "bt2_indices"
"star_indices" if assay in ["rna"] else "bt2_indices"
),
"chromosome_sizes": genome_values[genome].get("chromosome_sizes", ""),
"gtf": genome_values[genome].get("gtf", ""),
Expand Down Expand Up @@ -162,6 +163,26 @@ def setup_configuration(assay, genome, template_data):
template_data["pileup_method"] = "False"
template_data["scale"] = "False"
template_data["make_heatmaps"] = "False"
if assay not in ["snp"]:
template_data["make_bigwigs"] = get_user_input(
"Do you want to make bigwigs? (yes/no)", default="no", is_boolean=True
)
if template_data["make_bigwigs"]:
template_data["pileup_method"] = get_user_input(
"Pileup method:",
default="deeptools",
choices=["deeptools", "homer"],
)
template_data["scale"] = get_user_input(
"Scale bigwigs? (yes/no)", default="no", is_boolean=True
)
template_data["make_heatmaps"] = get_user_input(
"Do you want to make heatmaps? (yes/no)", default="no", is_boolean=True
)
else:
template_data["pileup_method"] = "False"
template_data["scale"] = "False"
template_data["make_heatmaps"] = "False"

# Call peaks
if assay in ["chip", "atac"]:
Expand Down Expand Up @@ -202,6 +223,38 @@ def setup_configuration(assay, genome, template_data):
else "False"
)

# SNP options
template_data["call_snps"] = (
get_user_input("Call SNPs? (yes/no)", default="no", is_boolean=True)
if assay == "snp"
else "False"
)
if assay == "snp" and template_data["call_snps"]:

template_data["snp_calling_method"] = get_user_input(
"SNP caller:",
default="bcftools",
choices=["bcftools", "deepvariant"],
)

template_data["fasta"] = get_user_input(
"Path to reference fasta:", default="path/to/reference.fasta"
)

template_data["fasta_index"] = get_user_input(
"Path to reference fasta index:", default="path/to/reference.fasta.fai"
)

template_data["snp_database"] = get_user_input(
"Path to SNP database:",
default="path/to/snp_database",
)
else:
template_data["snp_calling_method"] = "False"
template_data["fasta"] = "False"
template_data["fasta_index"] = "False"
template_data["snp_database"] = "False"

# SNP options
template_data["call_snps"] = (
get_user_input("Call SNPs? (yes/no)", default="no", is_boolean=True)
Expand Down Expand Up @@ -240,11 +293,9 @@ def setup_configuration(assay, genome, template_data):
)

template_data["UCSC_hub_directory"] = (
get_user_input("UCSC hub directory:", default="seqnado_output/hub/")
get_user_input("UCSC hub directory:", default="seqnado_output/hub/")
if template_data["make_ucsc_hub"]
else "seqnado_output/hub/"
else "seqnado_output/hub/"
)
template_data["email"] = (
get_user_input("What is your email address?", default=f"{username}@example.com")
Expand All @@ -265,6 +316,13 @@ def setup_configuration(assay, genome, template_data):
if assay == "rna"
else TOOL_OPTIONS_SNP if assay == "snp" else ""
)
TOOL_OPTIONS
if assay in ["chip", "atac"]
else (
TOOL_OPTIONS_RNA
if assay == "rna"
else TOOL_OPTIONS_SNP if assay == "snp" else ""
)
)


Expand Down Expand Up @@ -308,8 +366,6 @@ def setup_configuration(assay, genome, template_data):
heatmap:
options: -b 1000 -m 5000 -a 1000
colormap: RdYlBu_r
options: -b 1000 -m 5000 -a 1000
colormap: RdYlBu_r
"""

TOOL_OPTIONS_RNA = """
Expand Down Expand Up @@ -345,8 +401,26 @@ def setup_configuration(assay, genome, template_data):
heatmap:
options: -b 1000 -m 5000 -a 1000
colormap: RdYlBu_r
options: -b 1000 -m 5000 -a 1000
colormap: RdYlBu_r
"""


# Default per-tool settings for the SNP assay, kept as a YAML-style text block.
# It lists a `threads` count and an `options` string for each of trim_galore,
# bowtie2, picard and bcftools. Elsewhere in this file it is the value chosen
# when `assay == "snp"`.
# NOTE(review): presumably rendered into the generated config file — confirm
# against the template that consumes it.
TOOL_OPTIONS_SNP = """
trim_galore:
threads: 8
options: --2colour 20
bowtie2:
threads: 8
options:
picard:
threads: 8
options:
bcftools:
threads: 16
options:
"""


Expand Down
98 changes: 92 additions & 6 deletions seqnado/design.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,6 +484,9 @@ def controls_performed(self) -> List[str]:
control.add(f.control_performed)
return list(control)

def query(
self, sample_name: str, full_experiment: bool = False
) -> Union[FastqSetIP, Dict[str, FastqSetIP]]:
def query(
self, sample_name: str, full_experiment: bool = False
) -> Union[FastqSetIP, Dict[str, FastqSetIP]]:
Expand All @@ -496,6 +499,9 @@ def query(
)
is_control = False

experiment_files = dict()
is_control = False

experiment_files = dict()

if sample_name in ip_names or sample_name in control_names:
Expand All @@ -504,13 +510,19 @@ def query(
experiment_files["ip"] = experiment.ip
experiment_files["control"] = experiment.control

experiment_files["ip"] = experiment.ip
experiment_files["control"] = experiment.control

elif (
experiment.has_control
and experiment.control_fullname == sample_name
):
is_control = True
experiment_files["ip"] = experiment.ip
experiment_files["control"] = experiment.control
is_control = True
experiment_files["ip"] = experiment.ip
experiment_files["control"] = experiment.control
else:
raise ValueError(f"Could not find sample with name {sample_name}")

Expand All @@ -523,6 +535,15 @@ def query(
else experiment_files["control"]
)

if full_experiment:
return experiment_files
else:
return (
experiment_files["ip"]
if not is_control
else experiment_files["control"]
)

@classmethod
def from_fastq_files(cls, fq: List[Union[str, pathlib.Path]], **kwargs):
"""
Expand Down Expand Up @@ -636,10 +657,9 @@ def to_dataframe(self) -> pd.DataFrame:
experiment.control.r1.path if experiment.control else None
),
"control_r2": (
experiment.control.r2.path if experiment.control else None
experiment.control.r2.path if experiment.control and experiment.control.r2 else None
),
}

for k, v in metadata.model_dump(exclude_none=True).items():
row[k] = v

Expand Down Expand Up @@ -741,7 +761,6 @@ def from_design(
include_controls: bool = False,
):


if isinstance(design, Design):
df = (
design.to_dataframe()
Expand Down Expand Up @@ -891,6 +910,13 @@ class BigWigFiles(BaseModel):
"homer",
]
],
Literal["deeptools", "homer", False],
List[
Literal[
"deeptools",
"homer",
]
],
] = None
make_bigwigs: bool = False
scale_method: Optional[Literal["cpm", "rpkm", "spikein", "csaw", "merged"]] = None
Expand All @@ -905,9 +931,6 @@ def model_post_init(self, __context: Any) -> None:
self.scale_method = [
"unscaled",
]
self.scale_method = [
"unscaled",
]
elif self.include_unscaled and self.scale_method:
self.scale_method = ["unscaled", self.scale_method]
else:
Expand Down Expand Up @@ -974,6 +997,7 @@ def files(self) -> List[str]:
class HeatmapFiles(BaseModel):
assay: Literal["ChIP", "ATAC", "RNA", "SNP"]
make_heatmaps: bool = False
make_heatmaps: bool = False

@property
def heatmap_files(self) -> List[str]:
Expand All @@ -989,6 +1013,10 @@ def files(self) -> List[str]:
return self.heatmap_files
else:
return []
if self.make_heatmaps:
return self.heatmap_files
else:
return []


class HubFiles(BaseModel):
Expand Down Expand Up @@ -1042,11 +1070,15 @@ class Output(BaseModel):
sample_names: List[str]

make_bigwigs: bool = False
pileup_method: Union[
Literal["deeptools", "homer", False],
List[Literal["deeptools", "homer"]],
pileup_method: Union[
Literal["deeptools", "homer", False],
List[Literal["deeptools", "homer"]],
] = None


scale_method: Optional[Literal["cpm", "rpkm", "spikein", "csaw"]] = None

make_heatmaps: bool = False
Expand Down Expand Up @@ -1259,8 +1291,10 @@ def peaks(self):
s
for s in self.sample_names
if not any([c in s for c in self.control_names])
if not any([c in s for c in self.control_names])
]


pcf_samples = PeakCallingFiles(
assay=self.assay,
names=ip_sample_names,
Expand Down Expand Up @@ -1359,3 +1393,55 @@ def files(self) -> List[str]:
files.append(self.snp_files)

return files


class SNPOutput(Output):
    """Collect the output files requested for a SNP-assay run.

    The aggregated ``files`` list is made of the QC files, the variant-call
    files (only when ``call_snps`` is true) and the design sheet.
    """

    assay: Literal["SNP"]
    # Gates ``snp_files``: no variant-call targets are produced when False.
    call_snps: bool = False
    sample_names: List[str]
    make_ucsc_hub: bool = False
    # Caller name(s) substituted into the ``{method}`` part of the VCF path.
    snp_calling_method: Optional[
        Union[
            Literal["bcftools", "deepvariant", False],
            List[Literal["bcftools", "deepvariant"]],
        ]
    ] = None

    @property
    def design(self) -> List[str]:
        """Return the design sheet path wrapped in a single-item list."""
        return ["seqnado_output/design.csv"]

    @property
    def snp_files(self) -> List[str]:
        """Return the VCF targets, or an empty list when SNP calling is off.

        Paths are built with ``expand`` from ``sample_names`` and
        ``snp_calling_method``.
        """
        if not self.call_snps:
            return []

        return expand(
            "seqnado_output/variant/{method}/{sample}.vcf.gz",
            sample=self.sample_names,
            method=self.snp_calling_method,
        )

    @computed_field
    @property
    def files(self) -> List[str]:
        """Return every requested output path as one flat list of strings."""
        files = []
        files.extend(
            QCFiles(
                assay=self.assay,
                fastq_screen=self.fastq_screen,
                library_complexity=self.library_complexity,
            ).files
        )

        # ``snp_files`` is already empty when ``call_snps`` is False, so this
        # loop is the only place it needs to be added. A further
        # ``files.append(self.snp_files)`` would nest the whole list as a
        # single element and list every VCF twice.
        for file_list in (
            self.snp_files,
            self.design,
        ):
            if file_list:
                files.extend(file_list)

        return files
15 changes: 12 additions & 3 deletions seqnado/workflow/rules/exogenous_norm.smk
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ use rule align_paired as align_paired_spikein with:
resources:
mem=lambda wildcards, attempt: f"{8 * 2 ** (attempt - 1)}GB",


use rule align_single as align_single_spikein with:
output:
bam=temp("seqnado_output/aligned/spikein/raw/{sample}.bam"),
Expand All @@ -24,6 +25,7 @@ use rule sort_bam as sort_bam_spikein with:
log:
"seqnado_output/logs/aligned_spikein/{sample}_sort.log",


use rule index_bam as index_bam_spikein with:
input:
bam=rules.sort_bam_spikein.output.bam,
Expand All @@ -42,7 +44,7 @@ rule filter_bam_spikein:
"seqnado_output/logs/aligned_spikein/{sample}_filter.log",
shell:
"""
samtools view -b -F 3332 -q 30 -@ 8 {input.bam} > {output.bam} &&
samtools view -b -F 260 -@ 8 {input.bam} > {output.bam} &&
echo 'Filtered bam number of mapped reads:' > {log} 2>&1 &&
samtools view -c {output.bam} >> {log} 2>&1
"""
Expand Down Expand Up @@ -96,7 +98,10 @@ if config["spikein_options"]["normalisation_method"] == "orlando":

rule calculate_normalisation_factors:
input:
lambda wc: expand(rules.split_bam.output.stats, sample=SAMPLE_NAMES_IP + SAMPLE_NAMES_CONTROL),
lambda wc: expand(
rules.split_bam.output.stats,
sample=SAMPLE_NAMES_IP + SAMPLE_NAMES_CONTROL,
),
output:
normalisation_table="seqnado_output/resources/{group}_normalisation_factors.tsv",
normalisation_factors="seqnado_output/resources/{group}_normalisation_factors.json",
Expand All @@ -109,7 +114,11 @@ elif config["spikein_options"]["normalisation_method"] == "with_input":

rule calculate_normalisation_factors:
input:
lambda wc: expand(rules.split_bam.output.stats, sample=SAMPLE_NAMES_IP + SAMPLE_NAMES_CONTROL),
lambda wc: expand(
rules.split_bam.output.stats,
sample=SAMPLE_NAMES_IP + SAMPLE_NAMES_CONTROL,
),
design="seqnado_output/design.csv",
output:
normalisation_table="seqnado_output/resources/{group}_normalisation_factors.tsv",
normalisation_factors="seqnado_output/resources/{group}_normalisation_factors.json",
Expand Down
Loading

0 comments on commit ba75033

Please sign in to comment.