From ba75033e55b8d154e68436560635b26feb947b7a Mon Sep 17 00:00:00 2001
From: Catherine Chahrour <74187550+CChahrour@users.noreply.github.com>
Date: Tue, 28 May 2024 17:20:35 +0100
Subject: [PATCH] Fix SE with inputs (#190)

* align output names for macs peaks if broad
* increase resources for attempts for macs
* remove blank row in output bed files
* revert macs option SE handling
* revert the PE options for macs
* peak calling design sorted
* fix peak control match
* amend config slurm queue
* fix get control for peaks
* lint design
* fix lanceotron no input rule
* change defaults in config
* update snp snakefile
* add snp to design and test initial
* update config for snp
* fix typo in tests
* fix config for tests
* create common rules
* fix snp tests
* update envs
* fix inputs for se in design
* fix design to handle se and pe with or without inputs
* update spikein with input from stats files
* less stringent spike-in filtering of bams
* remove quality filter from spike-in bam filter
* change peak calling method for atac to macs and remove from chip tests
* update fastq files to test
* move multiple peak calling test to atac
* update test data
* less stringent filtering for spikein
* put peak callers back to chip
* macs for atac only
* test peak callers on atac

---------

Co-authored-by: Alastair Smith <49727900+alsmith151@users.noreply.github.com>
---
 seqnado/config.py                             |  86 ++++++++++++-
 seqnado/design.py                             |  98 ++++++++++++++-
 seqnado/workflow/rules/exogenous_norm.smk     |  15 ++-
 seqnado/workflow/rules/peak_call_chip.smk     |  27 ++---
 .../scripts/calculate_spikein_norm_factors.py | 113 ++++++++----------
 tests/test_pipelines.py                       |  20 +++-
 6 files changed, 260 insertions(+), 99 deletions(-)

diff --git a/seqnado/config.py b/seqnado/config.py
index b95897c6..af53429a 100755
--- a/seqnado/config.py
+++ b/seqnado/config.py
@@ -62,6 +62,7 @@ def setup_configuration(assay, genome, template_data):
     genome_dict[genome] = {
         "indices": genome_values[genome].get(
+            "star_indices" if assay in ["rna"] else "bt2_indices"
         ),
         "chromosome_sizes": genome_values[genome].get("chromosome_sizes", ""),
         "gtf": genome_values[genome].get("gtf", ""),
@@ -162,6 +163,26 @@ def setup_configuration(assay, genome, template_data):
         template_data["pileup_method"] = "False"
         template_data["scale"] = "False"
         template_data["make_heatmaps"] = "False"
+    if assay not in ["snp"]:
+        template_data["make_bigwigs"] = get_user_input(
+            "Do you want to make bigwigs? (yes/no)", default="no", is_boolean=True
+        )
+        if template_data["make_bigwigs"]:
+            template_data["pileup_method"] = get_user_input(
+                "Pileup method:",
+                default="deeptools",
+                choices=["deeptools", "homer"],
+            )
+            template_data["scale"] = get_user_input(
+                "Scale bigwigs? (yes/no)", default="no", is_boolean=True
+            )
+            template_data["make_heatmaps"] = get_user_input(
+                "Do you want to make heatmaps? (yes/no)", default="no", is_boolean=True
+            )
+    else:
+        template_data["pileup_method"] = "False"
+        template_data["scale"] = "False"
+        template_data["make_heatmaps"] = "False"

     # Call peaks
     if assay in ["chip", "atac"]:
@@ -202,6 +223,38 @@ def setup_configuration(assay, genome, template_data):
         else "False"
     )

+    # SNP options
+    template_data["call_snps"] = (
+        get_user_input("Call SNPs? (yes/no)", default="no", is_boolean=True)
+        if assay == "snp"
+        else "False"
+    )
+    if assay == "snp" and template_data["call_snps"]:
+
+        template_data["snp_calling_method"] = get_user_input(
+            "SNP caller:",
+            default="bcftools",
+            choices=["bcftools", "deepvariant"],
+        )
+
+        template_data["fasta"] = get_user_input(
+            "Path to reference fasta:", default="path/to/reference.fasta"
+        )
+
+        template_data["fasta_index"] = get_user_input(
+            "Path to reference fasta index:", default="path/to/reference.fasta.fai"
+        )
+
+        template_data["snp_database"] = get_user_input(
+            "Path to SNP database:",
+            default="path/to/snp_database",
+        )
+    else:
+        template_data["snp_calling_method"] = "False"
+        template_data["fasta"] = "False"
+        template_data["fasta_index"] = "False"
+        template_data["snp_database"] = "False"
+
@@ -240,11 +293,9 @@ def setup_configuration(assay, genome, template_data):
     )

     template_data["UCSC_hub_directory"] = (
-        get_user_input("UCSC hub directory:", default="seqnado_output/hub/")
         get_user_input("UCSC hub directory:", default="seqnado_output/hub/")
         if template_data["make_ucsc_hub"]
         else "seqnado_output/hub/"
-        else "seqnado_output/hub/"
     )
     template_data["email"] = (
         get_user_input("What is your email address?", default=f"{username}@example.com")
@@ -265,6 +316,13 @@ def setup_configuration(assay, genome, template_data):
+        TOOL_OPTIONS
+        if assay in ["chip", "atac"]
+        else (
+            TOOL_OPTIONS_RNA
+            if assay == "rna"
+            else TOOL_OPTIONS_SNP if assay == "snp" else ""
+        )
     )
@@ -308,8 +366,6 @@ def setup_configuration(assay, genome, template_data):
 heatmap:
     options: -b 1000 -m 5000 -a 1000
     colormap: RdYlBu_r
-    options: -b 1000 -m 5000 -a 1000
-    colormap: RdYlBu_r
 """

 TOOL_OPTIONS_RNA = """
@@ -345,8 +401,26 @@ def setup_configuration(assay, genome, template_data):
 heatmap:
     options: -b 1000 -m 5000 -a 1000
     colormap: RdYlBu_r
-    options: -b 1000 -m 5000 -a 1000
-    colormap: RdYlBu_r
+"""
+
+
+TOOL_OPTIONS_SNP = """
+trim_galore:
+    threads: 8
+    options: --2colour 20
+
+bowtie2:
+    threads: 8
+    options:
+
+picard:
+    threads: 8
+    options:
+
+bcftools:
+    threads: 16
+    options:
+
 """
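Note (editorial, not part of the patch): the per-assay tool-options selection added above reduces to this minimal, runnable sketch. The abridged YAML strings stand in for the full TOOL_OPTIONS blocks defined in config.py.

import yaml

TOOL_OPTIONS = "trim_galore:\n    threads: 4\n"    # chip/atac options (abridged)
TOOL_OPTIONS_RNA = "star:\n    threads: 8\n"       # rna options (abridged)
TOOL_OPTIONS_SNP = "bcftools:\n    threads: 16\n"  # snp options (abridged)

def select_tool_options(assay: str) -> dict:
    # Mirrors the chained conditional in setup_configuration.
    raw = (
        TOOL_OPTIONS
        if assay in ["chip", "atac"]
        else TOOL_OPTIONS_RNA
        if assay == "rna"
        else TOOL_OPTIONS_SNP if assay == "snp" else ""
    )
    return yaml.safe_load(raw) or {}

print(select_tool_options("snp"))  # {'bcftools': {'threads': 16}}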
(yes/no)", default="no", is_boolean=True) + if assay == "snp" + else "False" + ) + if assay == "snp" and template_data["call_snps"]: + + template_data["snp_calling_method"] = get_user_input( + "SNP caller:", + default="bcftools", + choices=["bcftools", "deepvariant"], + ) + + template_data["fasta"] = get_user_input( + "Path to reference fasta:", default="path/to/reference.fasta" + ) + + template_data["fasta_index"] = get_user_input( + "Path to reference fasta index:", default="path/to/reference.fasta.fai" + ) + + template_data["snp_database"] = get_user_input( + "Path to SNP database:", + default="path/to/snp_database", + ) + else: + template_data["snp_calling_method"] = "False" + template_data["fasta"] = "False" + template_data["fasta_index"] = "False" + template_data["snp_database"] = "False" + # SNP options template_data["call_snps"] = ( get_user_input("Call SNPs? (yes/no)", default="no", is_boolean=True) @@ -240,11 +293,9 @@ def setup_configuration(assay, genome, template_data): ) template_data["UCSC_hub_directory"] = ( - get_user_input("UCSC hub directory:", default="seqnado_output/hub/") get_user_input("UCSC hub directory:", default="seqnado_output/hub/") if template_data["make_ucsc_hub"] else "seqnado_output/hub/" - else "seqnado_output/hub/" ) template_data["email"] = ( get_user_input("What is your email address?", default=f"{username}@example.com") @@ -265,6 +316,13 @@ def setup_configuration(assay, genome, template_data): if assay == "rna" else TOOL_OPTIONS_SNP if assay == "snp" else "" ) + TOOL_OPTIONS + if assay in ["chip", "atac"] + else ( + TOOL_OPTIONS_RNA + if assay == "rna" + else TOOL_OPTIONS_SNP if assay == "snp" else "" + ) ) @@ -308,8 +366,6 @@ def setup_configuration(assay, genome, template_data): heatmap: options: -b 1000 -m 5000 -a 1000 colormap: RdYlBu_r - options: -b 1000 -m 5000 -a 1000 - colormap: RdYlBu_r """ TOOL_OPTIONS_RNA = """ @@ -345,8 +401,26 @@ def setup_configuration(assay, genome, template_data): heatmap: options: -b 1000 -m 5000 -a 1000 colormap: RdYlBu_r - options: -b 1000 -m 5000 -a 1000 - colormap: RdYlBu_r +""" + + +TOOL_OPTIONS_SNP = """ +trim_galore: + threads: 8 + options: --2colour 20 + +bowtie2: + threads: 8 + options: + +picard: + threads: 8 + options: + +bcftools: + threads: 16 + options: + """ diff --git a/seqnado/design.py b/seqnado/design.py index 0af812d6..49de44a8 100755 --- a/seqnado/design.py +++ b/seqnado/design.py @@ -484,6 +484,9 @@ def controls_performed(self) -> List[str]: control.add(f.control_performed) return list(control) + def query( + self, sample_name: str, full_experiment: bool = False + ) -> Union[FastqSetIP, Dict[str, FastqSetIP]]: def query( self, sample_name: str, full_experiment: bool = False ) -> Union[FastqSetIP, Dict[str, FastqSetIP]]: @@ -496,6 +499,9 @@ def query( ) is_control = False + experiment_files = dict() + is_control = False + experiment_files = dict() if sample_name in ip_names or sample_name in control_names: @@ -504,6 +510,9 @@ def query( experiment_files["ip"] = experiment.ip experiment_files["control"] = experiment.control + experiment_files["ip"] = experiment.ip + experiment_files["control"] = experiment.control + elif ( experiment.has_control and experiment.control_fullname == sample_name @@ -511,6 +520,9 @@ def query( is_control = True experiment_files["ip"] = experiment.ip experiment_files["control"] = experiment.control + is_control = True + experiment_files["ip"] = experiment.ip + experiment_files["control"] = experiment.control else: raise ValueError(f"Could not find sample with name 
{sample_name}") @@ -523,6 +535,15 @@ def query( else experiment_files["control"] ) + if full_experiment: + return experiment_files + else: + return ( + experiment_files["ip"] + if not is_control + else experiment_files["control"] + ) + @classmethod def from_fastq_files(cls, fq: List[Union[str, pathlib.Path]], **kwargs): """ @@ -636,10 +657,9 @@ def to_dataframe(self) -> pd.DataFrame: experiment.control.r1.path if experiment.control else None ), "control_r2": ( - experiment.control.r2.path if experiment.control else None + experiment.control.r2.path if experiment.control and experiment.control.r2 else None ), } - for k, v in metadata.model_dump(exclude_none=True).items(): row[k] = v @@ -741,7 +761,6 @@ def from_design( include_controls: bool = False, ): - if isinstance(design, Design): df = ( design.to_dataframe() @@ -891,6 +910,13 @@ class BigWigFiles(BaseModel): "homer", ] ], + Literal["deeptools", "homer", False], + List[ + Literal[ + "deeptools", + "homer", + ] + ], ] = None make_bigwigs: bool = False scale_method: Optional[Literal["cpm", "rpkm", "spikein", "csaw", "merged"]] = None @@ -905,9 +931,6 @@ def model_post_init(self, __context: Any) -> None: self.scale_method = [ "unscaled", ] - self.scale_method = [ - "unscaled", - ] elif self.include_unscaled and self.scale_method: self.scale_method = ["unscaled", self.scale_method] else: @@ -974,6 +997,7 @@ def files(self) -> List[str]: class HeatmapFiles(BaseModel): assay: Literal["ChIP", "ATAC", "RNA", "SNP"] make_heatmaps: bool = False + make_heatmaps: bool = False @property def heatmap_files(self) -> List[str]: @@ -989,6 +1013,10 @@ def files(self) -> List[str]: return self.heatmap_files else: return [] + if self.make_heatmaps: + return self.heatmap_files + else: + return [] class HubFiles(BaseModel): @@ -1042,11 +1070,15 @@ class Output(BaseModel): sample_names: List[str] make_bigwigs: bool = False + pileup_method: Union[ + Literal["deeptools", "homer", False], + List[Literal["deeptools", "homer"]], pileup_method: Union[ Literal["deeptools", "homer", False], List[Literal["deeptools", "homer"]], ] = None + scale_method: Optional[Literal["cpm", "rpkm", "spikein", "csaw"]] = None make_heatmaps: bool = False @@ -1259,8 +1291,10 @@ def peaks(self): s for s in self.sample_names if not any([c in s for c in self.control_names]) + if not any([c in s for c in self.control_names]) ] + pcf_samples = PeakCallingFiles( assay=self.assay, names=ip_sample_names, @@ -1359,3 +1393,55 @@ def files(self) -> List[str]: files.append(self.snp_files) return files + + +class SNPOutput(Output): + assay: Literal["SNP"] + call_snps: bool = False + sample_names: List[str] + make_ucsc_hub: bool = False + snp_calling_method: Optional[ + Union[ + Literal["bcftools", "deepvariant", False], + List[Literal["bcftools", "deepvariant"]], + ] + ] = None + + @property + def design(self): + return ["seqnado_output/design.csv"] + + @property + def snp_files(self) -> List[str]: + if self.call_snps: + return expand( + "seqnado_output/variant/{method}/{sample}.vcf.gz", + sample=self.sample_names, + method=self.snp_calling_method, + ) + else: + return [] + + @computed_field + @property + def files(self) -> List[str]: + files = [] + files.extend( + QCFiles( + assay=self.assay, + fastq_screen=self.fastq_screen, + library_complexity=self.library_complexity, + ).files + ) + + for file_list in ( + self.snp_files, + self.design, + ): + if file_list: + files.extend(file_list) + + if self.call_snps: + files.append(self.snp_files) + + return files diff --git 
diff --git a/seqnado/workflow/rules/exogenous_norm.smk b/seqnado/workflow/rules/exogenous_norm.smk
index 47610fb1..38c9b4df 100755
--- a/seqnado/workflow/rules/exogenous_norm.smk
+++ b/seqnado/workflow/rules/exogenous_norm.smk
@@ -7,6 +7,7 @@ use rule align_paired as align_paired_spikein with:
     resources:
         mem=lambda wildcards, attempt: f"{8 * 2 ** (attempt - 1)}GB",

+
 use rule align_single as align_single_spikein with:
     output:
         bam=temp("seqnado_output/aligned/spikein/raw/{sample}.bam"),
@@ -24,6 +25,7 @@ use rule sort_bam as sort_bam_spikein with:
     log:
         "seqnado_output/logs/aligned_spikein/{sample}_sort.log",

+
 use rule index_bam as index_bam_spikein with:
     input:
         bam=rules.sort_bam_spikein.output.bam,
@@ -42,7 +44,7 @@ rule filter_bam_spikein:
         "seqnado_output/logs/aligned_spikein/{sample}_filter.log",
     shell:
         """
-        samtools view -b -F 3332 -q 30 -@ 8 {input.bam} > {output.bam} &&
+        samtools view -b -F 260 -@ 8 {input.bam} > {output.bam} &&
         echo 'Filtered bam number of mapped reads:' > {log} 2>&1 &&
         samtools view -c {output.bam} >> {log} 2>&1
         """
@@ -96,7 +98,10 @@ if config["spikein_options"]["normalisation_method"] == "orlando":

     rule calculate_normalisation_factors:
         input:
-            lambda wc: expand(rules.split_bam.output.stats, sample=SAMPLE_NAMES_IP + SAMPLE_NAMES_CONTROL),
+            lambda wc: expand(
+                rules.split_bam.output.stats,
+                sample=SAMPLE_NAMES_IP + SAMPLE_NAMES_CONTROL,
+            ),
         output:
             normalisation_table="seqnado_output/resources/{group}_normalisation_factors.tsv",
             normalisation_factors="seqnado_output/resources/{group}_normalisation_factors.json",
@@ -109,7 +114,11 @@ elif config["spikein_options"]["normalisation_method"] == "with_input":

     rule calculate_normalisation_factors:
         input:
-            lambda wc: expand(rules.split_bam.output.stats, sample=SAMPLE_NAMES_IP + SAMPLE_NAMES_CONTROL),
+            lambda wc: expand(
+                rules.split_bam.output.stats,
+                sample=SAMPLE_NAMES_IP + SAMPLE_NAMES_CONTROL,
+            ),
+            design="seqnado_output/design.csv",
         output:
             normalisation_table="seqnado_output/resources/{group}_normalisation_factors.tsv",
            normalisation_factors="seqnado_output/resources/{group}_normalisation_factors.json",
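Note (editorial, not part of the patch): the filter change above is easier to read with the SAM flag bits spelled out (standard SAM-spec values). Dropping -q 30 and the 1024/2048 bits means duplicate and supplementary spike-in reads are now retained, which is the "less stringent" filtering the commit messages refer to.

# Decode the two -F exclusion masks used before and after this patch.
FLAGS = {
    4: "read unmapped",
    256: "secondary alignment",
    1024: "PCR/optical duplicate",
    2048: "supplementary alignment",
}

def describe_mask(mask: int) -> list[str]:
    return [name for bit, name in FLAGS.items() if mask & bit]

print(describe_mask(3332))  # old: unmapped, secondary, duplicate, supplementary (plus MAPQ >= 30 via -q 30)
print(describe_mask(260))   # new: unmapped, secondary only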
Literal["bam", "tag", "bigwig"], allo return [] else: return "UNDEFINED" + if allow_null: + return [] + else: + return "UNDEFINED" rule macs2_with_input: input: treatment="seqnado_output/aligned/{sample}_{treatment}.bam", control=lambda wc: get_control_file(wc, file_type="bam", allow_null=False), + control=lambda wc: get_control_file(wc, file_type="bam", allow_null=False), output: peaks="seqnado_output/peaks/macs/{sample}_{treatment}.bed", params: options=lambda wc: format_macs_options(wc, config["macs"]["callpeak"]), raw=lambda wc, output: output.peaks.replace(".bed", "_peaks.xls"), - raw=lambda wc, output: output.peaks.replace(".bed", "_peaks.xls"), basename=lambda wc, output: output.peaks.replace(".bed", ""), threads: 1 resources: - mem=lambda wildcards, attempt: f"{2 * 2 ** (attempt)}GB", mem=lambda wildcards, attempt: f"{2 * 2 ** (attempt)}GB", runtime="2h", log: @@ -71,7 +63,6 @@ rule macs2_with_input: """ macs2 callpeak -t {input.treatment} -c {input.control} -n {params.basename} {params.options} > {log} 2>&1 && cat {params.raw} | grep -v '^#' | grep -vE '^chr\\s+start\\s+end.*' | grep -v '^$' | cut -f 1-3 > {output.peaks} - cat {params.raw} | grep -v '^#' | grep -vE '^chr\\s+start\\s+end.*' | grep -v '^$' | cut -f 1-3 > {output.peaks} """ @@ -79,16 +70,15 @@ rule macs2_no_input: input: treatment="seqnado_output/aligned/{sample}_{treatment}.bam", control=lambda wc: get_control_file(wc, file_type="bam", allow_null=True), + control=lambda wc: get_control_file(wc, file_type="bam", allow_null=True), output: peaks="seqnado_output/peaks/macs/{sample}_{treatment}.bed", params: options=lambda wc: format_macs_options(wc, config["macs"]["callpeak"]), raw=lambda wc, output: output.peaks.replace(".bed", "_peaks.xls"), - raw=lambda wc, output: output.peaks.replace(".bed", "_peaks.xls"), basename=lambda wc, output: output.peaks.replace(".bed", ""), threads: 1 resources: - mem=lambda wildcards, attempt: f"{2 * 2 ** (attempt)}GB", mem=lambda wildcards, attempt: f"{2 * 2 ** (attempt)}GB", runtime="2h", log: @@ -97,7 +87,6 @@ rule macs2_no_input: """ macs2 callpeak -t {input.treatment} -n {params.basename} {params.options} > {log} 2>&1 && cat {params.raw} | grep -v '^#' | grep -vE '^chr\\s+start\\s+end.*' | grep -v '^$' | cut -f 1-3 > {output.peaks} - cat {params.raw} | grep -v '^#' | grep -vE '^chr\\s+start\\s+end.*' | grep -v '^$' | cut -f 1-3 > {output.peaks} """ @@ -105,6 +94,7 @@ rule homer_with_input: input: treatment="seqnado_output/tag_dirs/{sample}_{treatment}", control=lambda wc: get_control_file(wc, file_type="tag", allow_null=False), + control=lambda wc: get_control_file(wc, file_type="tag", allow_null=False), output: peaks="seqnado_output/peaks/homer/{sample}_{treatment}.bed", log: @@ -127,6 +117,7 @@ rule homer_no_input: input: treatment="seqnado_output/tag_dirs/{sample}_{treatment}", control=lambda wc: get_control_file(wc, file_type="tag", allow_null=True), + control=lambda wc: get_control_file(wc, file_type="tag", allow_null=True), output: peaks="seqnado_output/peaks/homer/{sample}_{treatment}.bed", log: @@ -149,6 +140,7 @@ rule lanceotron_with_input: input: treatment="seqnado_output/bigwigs/deeptools/unscaled/{sample}_{treatment}.bigWig", control=lambda wc: get_control_file(wc, file_type="bigwig", allow_null=False), + control=lambda wc: get_control_file(wc, file_type="bigwig", allow_null=False), output: peaks="seqnado_output/peaks/lanceotron/{sample}_{treatment}.bed", log: @@ -174,6 +166,7 @@ rule lanceotron_no_input: input: 
treatment="seqnado_output/bigwigs/deeptools/unscaled/{sample}_{treatment}.bigWig", control=lambda wc: get_control_file(wc, file_type="bigwig", allow_null=True), + control=lambda wc: get_control_file(wc, file_type="bigwig", allow_null=True), output: peaks="seqnado_output/peaks/lanceotron/{sample}_{treatment}.bed", log: diff --git a/seqnado/workflow/scripts/calculate_spikein_norm_factors.py b/seqnado/workflow/scripts/calculate_spikein_norm_factors.py index 621a96ef..a17698b0 100755 --- a/seqnado/workflow/scripts/calculate_spikein_norm_factors.py +++ b/seqnado/workflow/scripts/calculate_spikein_norm_factors.py @@ -1,75 +1,58 @@ import pandas as pd import pathlib -import pysam -from typing import List from loguru import logger # Set up logging logger.add(snakemake.log[0], level="INFO") - -def get_readcounts(bam_files: List[pathlib.Path]): - readcounts = {} - for bam_file in bam_files: - bam = pysam.AlignmentFile(bam_file, "rb") - readcounts[bam_file.stem] = bam.mapped - return pd.Series(readcounts) - - with logger.catch(): logger.info("Calculating normalization factors") - # Calculate readcounts for reference and spikein samples - bam_ref = [pathlib.Path(p) for p in snakemake.input.bam_ref] - bam_spikein = [pathlib.Path(p) for p in snakemake.input.bam_spikein] - readcounts_ref = get_readcounts(bam_ref) - readcounts_spikein = get_readcounts(bam_spikein) - - # Read in design matrix - df_design_raw = pd.read_csv(snakemake.input.design, sep=",").assign( - ip=lambda df: df["sample"] + "_" + df["antibody"] - )[["sample", "antibody", "ip", "control"]] - - df_design = df_design_raw.melt( - id_vars=["sample", "antibody"], var_name="type", value_name="name" - ) - - # Merge readcounts with design matrix - df_counts = pd.DataFrame( - [readcounts_ref, readcounts_spikein], index=["ref", "spikein"] - ).T - - df_counts = df_design.merge(df_counts, left_on="name", right_index=True) - - # Pivot for easier handling - df_counts = df_counts.pivot_table( - index=["sample", "antibody"], columns="type", values=["ref", "spikein"] - ) - df_counts.columns = ["_".join(col) for col in df_counts.columns.values] - df_counts = df_counts.reset_index() - - # Calculate normalization factors - df_counts = df_counts.assign( - reads_per_spikein_ip=lambda df: df["ref_ip"] / df["spikein_ip"], - reads_per_spikein_control=lambda df: df["ref_control"] / df["spikein_control"], - relative_signal=lambda df: df["reads_per_spikein_ip"] - / df["reads_per_spikein_control"], - reads_per_million_ip=lambda df: 10e6 / df["ref_ip"], - reads_per_million_control=lambda df: 10e6 / df["ref_control"], - norm_factor=lambda df: df["relative_signal"] * df["reads_per_million_ip"], - ) - - # Write out normalization factors - df_counts = df_counts.merge(df_design_raw, on=["sample", "antibody"]) - df_counts.to_csv(snakemake.output.normalisation_table, sep="\t", index=False) - - norm_ip = df_counts[["ip", "norm_factor"]].set_index("ip")["norm_factor"] - norm_control = ( - df_counts[["control", "norm_factor"]] - .assign(norm_factor=1) - .drop_duplicates() - .set_index("control")["norm_factor"] - ) - - norm = pd.concat([norm_ip, norm_control]) - norm.to_json(snakemake.output.normalisation_factors) \ No newline at end of file +# Read in stats +stats_files = snakemake.input + +all_readcounts = [] + +for stats in stats_files: + file_path = pathlib.Path(stats) + readcounts = pd.read_csv(file_path, sep="\t") + all_readcounts.append(readcounts) + + +df_counts = pd.concat(all_readcounts, ignore_index=True) +df_counts["sample_name"] = df_counts["sample"].str.split("_", 
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 8534cce9..ea30b3f0 100755
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -16,6 +16,9 @@
 @pytest.fixture(
+    scope="function",
+    params=["atac", "chip", "chip-rx", "rna", "rna-rx", "snp"],
+    autouse=True,
 )
 def assay(request):
     return request.param
@@ -163,7 +166,7 @@ def fastqs(test_data_path, assay) -> list[pathlib.Path]:
     path = test_data_path / "fastq"

     if not path.exists():
-        url = f"https://userweb.molbiol.ox.ac.uk/public/project/milne_group/asmith/ngs_pipeline/fastq.tar.gz"
+        url = f"https://userweb.molbiol.ox.ac.uk/public/project/milne_group/cchahrou/seqnado_reference/fastq.tar.gz"
         r = requests.get(url, stream=True)

         tar_path = path.with_suffix(".tar.gz")
@@ -188,6 +191,8 @@ def fastqs(test_data_path, assay) -> list[pathlib.Path]:
         case "rna-rx":
             files = list(path.glob("rna-spikein*.fastq.gz"))
+        case "snp":
+            files = list(path.glob("snp*.fastq.gz"))

     return files
@@ -278,6 +283,11 @@ def user_inputs(
         "call_snps": "no",
     }

+    defaults_snp = {
+        "remove_pcr_duplicates": "no",
+        "call_snps": "no",
+    }
+
     hub = {
         "make_ucsc_hub": "yes",
         "UCSC_hub_directory": "test_hub",
@@ -298,6 +308,8 @@ def user_inputs(
         case "rna-rx":
             return {**defaults, **defaults_rna_rx, **hub}
+        case "snp":
+            return {**defaults, **defaults_snp, **hub}
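Note (editorial, not part of the patch): the snp arm above is a plain dict merge, in which the right-most mapping wins, so the snp defaults override the shared ones. The defaults and hub contents here are abridged stand-ins for the fixture's full dictionaries.

defaults = {"remove_pcr_duplicates": "yes"}
defaults_snp = {"remove_pcr_duplicates": "no", "call_snps": "no"}
hub = {"make_ucsc_hub": "yes", "UCSC_hub_directory": "test_hub"}

answers = {**defaults, **defaults_snp, **hub}
print(answers)
# {'remove_pcr_duplicates': 'no', 'call_snps': 'no',
#  'make_ucsc_hub': 'yes', 'UCSC_hub_directory': 'test_hub'}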
config["peak_calling_method"] = ["lanceotron"] config["library_complexity"] = False config["bowtie2"]["options"] = "--no-mixed --no-discordant" elif assay == "chip-rx": config["call_peaks"] = True config["peak_calling_method"] = ["seacr"] + elif assay == "atac": + config["call_peaks"] = True + config["peak_calling_method"] = ["homer", "lanceotron", "macs"] with open(config_yaml, "w") as f: yaml.dump(config, f) @@ -364,6 +379,7 @@ def design(seqnado_run_dir, assay_type, assay): # Add merge column to design file import pandas as pd + df = pd.read_csv(seqnado_run_dir / "design.csv", index_col=0) df["merge"] = "MLL-MERGED-TOGETHER" df.to_csv(seqnado_run_dir / "design.csv")