From ba75033e55b8d154e68436560635b26feb947b7a Mon Sep 17 00:00:00 2001
From: Catherine Chahrour <74187550+CChahrour@users.noreply.github.com>
Date: Tue, 28 May 2024 17:20:35 +0100
Subject: [PATCH] Fix SE with inputs (#190)

* align output names for macs peaks if broad
* increase resources for attempts for macs
* remove blank row in output bed files
* revert macs option SE handling
* revert the PE options for macs
* peak calling design sorted
* fix peak control match
* amend config slurm queue
* fix get control for peaks
* lint design
* fix lanceotron no input rule
* change defaults in config
* update snp snakefile
* add snp to design and test initial
* update config for snp
* fix typo in tests
* fix config for tests
* create common rules
* fix snp tests
* update envs
* fix inputs for se in design
* fix design to handle se and pe with or without inputs
* update spikein with input from stats files
* less stringent spike-in filtering of bams
* remove quality filter from spike-in bam filter
* change peak calling method for atac to macs and remove from chip tests
* update fastq files to test
* move multiple peak calling test to atac
* update test data
* less stringent filtering for spikein
* put peak callers back to chip
* macs for atac only
* test peak callers on atac

---------

Co-authored-by: Alastair Smith <49727900+alsmith151@users.noreply.github.com>
---
 seqnado/config.py                             |  86 ++++++++++++-
 seqnado/design.py                             |  98 ++++++++++++++-
 seqnado/workflow/rules/exogenous_norm.smk     |  15 ++-
 seqnado/workflow/rules/peak_call_chip.smk     |  27 ++---
 .../scripts/calculate_spikein_norm_factors.py | 113 ++++++++----------
 tests/test_pipelines.py                       |  20 +++-
 6 files changed, 260 insertions(+), 99 deletions(-)

diff --git a/seqnado/config.py b/seqnado/config.py
index b95897c6..af53429a 100755
--- a/seqnado/config.py
+++ b/seqnado/config.py
@@ -62,6 +62,7 @@ def setup_configuration(assay, genome, template_data):
     genome_dict[genome] = {
         "indices": genome_values[genome].get(
+            "star_indices" if assay in ["rna"] else "bt2_indices"
         ),
         "chromosome_sizes": genome_values[genome].get("chromosome_sizes", ""),
         "gtf": genome_values[genome].get("gtf", ""),
@@ -162,6 +163,26 @@ def setup_configuration(assay, genome, template_data):
         template_data["pileup_method"] = "False"
         template_data["scale"] = "False"
         template_data["make_heatmaps"] = "False"
+    if assay not in ["snp"]:
+        template_data["make_bigwigs"] = get_user_input(
+            "Do you want to make bigwigs? (yes/no)", default="no", is_boolean=True
+        )
+        if template_data["make_bigwigs"]:
+            template_data["pileup_method"] = get_user_input(
+                "Pileup method:",
+                default="deeptools",
+                choices=["deeptools", "homer"],
+            )
+            template_data["scale"] = get_user_input(
+                "Scale bigwigs? (yes/no)", default="no", is_boolean=True
+            )
+            template_data["make_heatmaps"] = get_user_input(
+                "Do you want to make heatmaps? (yes/no)", default="no", is_boolean=True
+            )
+    else:
+        template_data["pileup_method"] = "False"
+        template_data["scale"] = "False"
+        template_data["make_heatmaps"] = "False"

     # Call peaks
     if assay in ["chip", "atac"]:
@@ -202,6 +223,38 @@ def setup_configuration(assay, genome, template_data):
         else "False"
     )

+    # SNP options
+    template_data["call_snps"] = (
+        get_user_input("Call SNPs? (yes/no)", default="no", is_boolean=True)
+        if assay == "snp"
+        else "False"
+    )
+    if assay == "snp" and template_data["call_snps"]:
+
+        template_data["snp_calling_method"] = get_user_input(
+            "SNP caller:",
+            default="bcftools",
+            choices=["bcftools", "deepvariant"],
+        )
+
+        template_data["fasta"] = get_user_input(
+            "Path to reference fasta:", default="path/to/reference.fasta"
+        )
+
+        template_data["fasta_index"] = get_user_input(
+            "Path to reference fasta index:", default="path/to/reference.fasta.fai"
+        )
+
+        template_data["snp_database"] = get_user_input(
+            "Path to SNP database:",
+            default="path/to/snp_database",
+        )
+    else:
+        template_data["snp_calling_method"] = "False"
+        template_data["fasta"] = "False"
+        template_data["fasta_index"] = "False"
+        template_data["snp_database"] = "False"
+
@@ -240,11 +293,9 @@ def setup_configuration(assay, genome, template_data):
     )

     template_data["UCSC_hub_directory"] = (
-        get_user_input("UCSC hub directory:", default="seqnado_output/hub/")
         get_user_input("UCSC hub directory:", default="seqnado_output/hub/")
         if template_data["make_ucsc_hub"]
         else "seqnado_output/hub/"
-        else "seqnado_output/hub/"
     )
     template_data["email"] = (
         get_user_input("What is your email address?", default=f"{username}@example.com")
@@ -265,6 +316,13 @@ def setup_configuration(assay, genome, template_data):
+        TOOL_OPTIONS
+        if assay in ["chip", "atac"]
+        else (
+            TOOL_OPTIONS_RNA
+            if assay == "rna"
+            else TOOL_OPTIONS_SNP if assay == "snp" else ""
+        )
     )
@@ -308,8 +366,6 @@ def setup_configuration(assay, genome, template_data):
 heatmap:
     options: -b 1000 -m 5000 -a 1000
     colormap: RdYlBu_r
-    options: -b 1000 -m 5000 -a 1000
-    colormap: RdYlBu_r
 """

 TOOL_OPTIONS_RNA = """
@@ -345,8 +401,26 @@ def setup_configuration(assay, genome, template_data):
 heatmap:
     options: -b 1000 -m 5000 -a 1000
     colormap: RdYlBu_r
-    options: -b 1000 -m 5000 -a 1000
-    colormap: RdYlBu_r
+"""
+
+
+TOOL_OPTIONS_SNP = """
+trim_galore:
+    threads: 8
+    options: --2colour 20
+
+bowtie2:
+    threads: 8
+    options:
+
+picard:
+    threads: 8
+    options:
+
+bcftools:
+    threads: 16
+    options:
+
 """
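Note (editorial, not part of the patch): the per-assay tool-options selection added above reduces to this minimal, runnable sketch. The abridged YAML strings stand in for the full TOOL_OPTIONS blocks defined in config.py.

import yaml

TOOL_OPTIONS = "trim_galore:\n    threads: 4\n"    # chip/atac options (abridged)
TOOL_OPTIONS_RNA = "star:\n    threads: 8\n"       # rna options (abridged)
TOOL_OPTIONS_SNP = "bcftools:\n    threads: 16\n"  # snp options (abridged)

def select_tool_options(assay: str) -> dict:
    # Mirrors the chained conditional in setup_configuration.
    raw = (
        TOOL_OPTIONS
        if assay in ["chip", "atac"]
        else TOOL_OPTIONS_RNA
        if assay == "rna"
        else TOOL_OPTIONS_SNP if assay == "snp" else ""
    )
    return yaml.safe_load(raw) or {}

print(select_tool_options("snp"))  # {'bcftools': {'threads': 16}}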
(yes/no)", default="no", is_boolean=True) + if assay == "snp" + else "False" + ) + if assay == "snp" and template_data["call_snps"]: + + template_data["snp_calling_method"] = get_user_input( + "SNP caller:", + default="bcftools", + choices=["bcftools", "deepvariant"], + ) + + template_data["fasta"] = get_user_input( + "Path to reference fasta:", default="path/to/reference.fasta" + ) + + template_data["fasta_index"] = get_user_input( + "Path to reference fasta index:", default="path/to/reference.fasta.fai" + ) + + template_data["snp_database"] = get_user_input( + "Path to SNP database:", + default="path/to/snp_database", + ) + else: + template_data["snp_calling_method"] = "False" + template_data["fasta"] = "False" + template_data["fasta_index"] = "False" + template_data["snp_database"] = "False" + # SNP options template_data["call_snps"] = ( get_user_input("Call SNPs? (yes/no)", default="no", is_boolean=True) @@ -240,11 +293,9 @@ def setup_configuration(assay, genome, template_data): ) template_data["UCSC_hub_directory"] = ( - get_user_input("UCSC hub directory:", default="seqnado_output/hub/") get_user_input("UCSC hub directory:", default="seqnado_output/hub/") if template_data["make_ucsc_hub"] else "seqnado_output/hub/" - else "seqnado_output/hub/" ) template_data["email"] = ( get_user_input("What is your email address?", default=f"{username}@example.com") @@ -265,6 +316,13 @@ def setup_configuration(assay, genome, template_data): if assay == "rna" else TOOL_OPTIONS_SNP if assay == "snp" else "" ) + TOOL_OPTIONS + if assay in ["chip", "atac"] + else ( + TOOL_OPTIONS_RNA + if assay == "rna" + else TOOL_OPTIONS_SNP if assay == "snp" else "" + ) ) @@ -308,8 +366,6 @@ def setup_configuration(assay, genome, template_data): heatmap: options: -b 1000 -m 5000 -a 1000 colormap: RdYlBu_r - options: -b 1000 -m 5000 -a 1000 - colormap: RdYlBu_r """ TOOL_OPTIONS_RNA = """ @@ -345,8 +401,26 @@ def setup_configuration(assay, genome, template_data): heatmap: options: -b 1000 -m 5000 -a 1000 colormap: RdYlBu_r - options: -b 1000 -m 5000 -a 1000 - colormap: RdYlBu_r +""" + + +TOOL_OPTIONS_SNP = """ +trim_galore: + threads: 8 + options: --2colour 20 + +bowtie2: + threads: 8 + options: + +picard: + threads: 8 + options: + +bcftools: + threads: 16 + options: + """ diff --git a/seqnado/design.py b/seqnado/design.py index 0af812d6..49de44a8 100755 --- a/seqnado/design.py +++ b/seqnado/design.py @@ -484,6 +484,9 @@ def controls_performed(self) -> List[str]: control.add(f.control_performed) return list(control) + def query( + self, sample_name: str, full_experiment: bool = False + ) -> Union[FastqSetIP, Dict[str, FastqSetIP]]: def query( self, sample_name: str, full_experiment: bool = False ) -> Union[FastqSetIP, Dict[str, FastqSetIP]]: @@ -496,6 +499,9 @@ def query( ) is_control = False + experiment_files = dict() + is_control = False + experiment_files = dict() if sample_name in ip_names or sample_name in control_names: @@ -504,6 +510,9 @@ def query( experiment_files["ip"] = experiment.ip experiment_files["control"] = experiment.control + experiment_files["ip"] = experiment.ip + experiment_files["control"] = experiment.control + elif ( experiment.has_control and experiment.control_fullname == sample_name @@ -511,6 +520,9 @@ def query( is_control = True experiment_files["ip"] = experiment.ip experiment_files["control"] = experiment.control + is_control = True + experiment_files["ip"] = experiment.ip + experiment_files["control"] = experiment.control else: raise ValueError(f"Could not find sample with name 
{sample_name}") @@ -523,6 +535,15 @@ def query( else experiment_files["control"] ) + if full_experiment: + return experiment_files + else: + return ( + experiment_files["ip"] + if not is_control + else experiment_files["control"] + ) + @classmethod def from_fastq_files(cls, fq: List[Union[str, pathlib.Path]], **kwargs): """ @@ -636,10 +657,9 @@ def to_dataframe(self) -> pd.DataFrame: experiment.control.r1.path if experiment.control else None ), "control_r2": ( - experiment.control.r2.path if experiment.control else None + experiment.control.r2.path if experiment.control and experiment.control.r2 else None ), } - for k, v in metadata.model_dump(exclude_none=True).items(): row[k] = v @@ -741,7 +761,6 @@ def from_design( include_controls: bool = False, ): - if isinstance(design, Design): df = ( design.to_dataframe() @@ -891,6 +910,13 @@ class BigWigFiles(BaseModel): "homer", ] ], + Literal["deeptools", "homer", False], + List[ + Literal[ + "deeptools", + "homer", + ] + ], ] = None make_bigwigs: bool = False scale_method: Optional[Literal["cpm", "rpkm", "spikein", "csaw", "merged"]] = None @@ -905,9 +931,6 @@ def model_post_init(self, __context: Any) -> None: self.scale_method = [ "unscaled", ] - self.scale_method = [ - "unscaled", - ] elif self.include_unscaled and self.scale_method: self.scale_method = ["unscaled", self.scale_method] else: @@ -974,6 +997,7 @@ def files(self) -> List[str]: class HeatmapFiles(BaseModel): assay: Literal["ChIP", "ATAC", "RNA", "SNP"] make_heatmaps: bool = False + make_heatmaps: bool = False @property def heatmap_files(self) -> List[str]: @@ -989,6 +1013,10 @@ def files(self) -> List[str]: return self.heatmap_files else: return [] + if self.make_heatmaps: + return self.heatmap_files + else: + return [] class HubFiles(BaseModel): @@ -1042,11 +1070,15 @@ class Output(BaseModel): sample_names: List[str] make_bigwigs: bool = False + pileup_method: Union[ + Literal["deeptools", "homer", False], + List[Literal["deeptools", "homer"]], pileup_method: Union[ Literal["deeptools", "homer", False], List[Literal["deeptools", "homer"]], ] = None + scale_method: Optional[Literal["cpm", "rpkm", "spikein", "csaw"]] = None make_heatmaps: bool = False @@ -1259,8 +1291,10 @@ def peaks(self): s for s in self.sample_names if not any([c in s for c in self.control_names]) + if not any([c in s for c in self.control_names]) ] + pcf_samples = PeakCallingFiles( assay=self.assay, names=ip_sample_names, @@ -1359,3 +1393,55 @@ def files(self) -> List[str]: files.append(self.snp_files) return files + + +class SNPOutput(Output): + assay: Literal["SNP"] + call_snps: bool = False + sample_names: List[str] + make_ucsc_hub: bool = False + snp_calling_method: Optional[ + Union[ + Literal["bcftools", "deepvariant", False], + List[Literal["bcftools", "deepvariant"]], + ] + ] = None + + @property + def design(self): + return ["seqnado_output/design.csv"] + + @property + def snp_files(self) -> List[str]: + if self.call_snps: + return expand( + "seqnado_output/variant/{method}/{sample}.vcf.gz", + sample=self.sample_names, + method=self.snp_calling_method, + ) + else: + return [] + + @computed_field + @property + def files(self) -> List[str]: + files = [] + files.extend( + QCFiles( + assay=self.assay, + fastq_screen=self.fastq_screen, + library_complexity=self.library_complexity, + ).files + ) + + for file_list in ( + self.snp_files, + self.design, + ): + if file_list: + files.extend(file_list) + + if self.call_snps: + files.append(self.snp_files) + + return files diff --git 
diff --git a/seqnado/workflow/rules/exogenous_norm.smk b/seqnado/workflow/rules/exogenous_norm.smk
index 47610fb1..38c9b4df 100755
--- a/seqnado/workflow/rules/exogenous_norm.smk
+++ b/seqnado/workflow/rules/exogenous_norm.smk
@@ -7,6 +7,7 @@ use rule align_paired as align_paired_spikein with:
     resources:
         mem=lambda wildcards, attempt: f"{8 * 2 ** (attempt - 1)}GB",

+
 use rule align_single as align_single_spikein with:
     output:
         bam=temp("seqnado_output/aligned/spikein/raw/{sample}.bam"),
@@ -24,6 +25,7 @@ use rule sort_bam as sort_bam_spikein with:
     log:
         "seqnado_output/logs/aligned_spikein/{sample}_sort.log",

+
 use rule index_bam as index_bam_spikein with:
     input:
         bam=rules.sort_bam_spikein.output.bam,
@@ -42,7 +44,7 @@ rule filter_bam_spikein:
         "seqnado_output/logs/aligned_spikein/{sample}_filter.log",
     shell:
         """
-        samtools view -b -F 3332 -q 30 -@ 8 {input.bam} > {output.bam} &&
+        samtools view -b -F 260 -@ 8 {input.bam} > {output.bam} &&
         echo 'Filtered bam number of mapped reads:' > {log} 2>&1 &&
         samtools view -c {output.bam} >> {log} 2>&1
         """
@@ -96,7 +98,10 @@ if config["spikein_options"]["normalisation_method"] == "orlando":

     rule calculate_normalisation_factors:
         input:
-            lambda wc: expand(rules.split_bam.output.stats, sample=SAMPLE_NAMES_IP + SAMPLE_NAMES_CONTROL),
+            lambda wc: expand(
+                rules.split_bam.output.stats,
+                sample=SAMPLE_NAMES_IP + SAMPLE_NAMES_CONTROL,
+            ),
         output:
             normalisation_table="seqnado_output/resources/{group}_normalisation_factors.tsv",
             normalisation_factors="seqnado_output/resources/{group}_normalisation_factors.json",
@@ -109,7 +114,11 @@ elif config["spikein_options"]["normalisation_method"] == "with_input":

     rule calculate_normalisation_factors:
         input:
-            lambda wc: expand(rules.split_bam.output.stats, sample=SAMPLE_NAMES_IP + SAMPLE_NAMES_CONTROL),
+            lambda wc: expand(
+                rules.split_bam.output.stats,
+                sample=SAMPLE_NAMES_IP + SAMPLE_NAMES_CONTROL,
+            ),
+            design="seqnado_output/design.csv",
         output:
             normalisation_table="seqnado_output/resources/{group}_normalisation_factors.tsv",
            normalisation_factors="seqnado_output/resources/{group}_normalisation_factors.json",
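Note (editorial, not part of the patch): the filter change above is easier to read with the SAM flag bits spelled out (standard SAM-spec values). Dropping -q 30 and the 1024/2048 bits means duplicate and supplementary spike-in reads are now retained, which is the "less stringent" filtering the commit messages refer to.

# Decode the two -F exclusion masks used before and after this patch.
FLAGS = {
    4: "read unmapped",
    256: "secondary alignment",
    1024: "PCR/optical duplicate",
    2048: "supplementary alignment",
}

def describe_mask(mask: int) -> list[str]:
    return [name for bit, name in FLAGS.items() if mask & bit]

print(describe_mask(3332))  # old: unmapped, secondary, duplicate, supplementary (plus MAPQ >= 30 via -q 30)
print(describe_mask(260))   # new: unmapped, secondary only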
Literal["bam", "tag", "bigwig"], allo return [] else: return "UNDEFINED" + if allow_null: + return [] + else: + return "UNDEFINED" rule macs2_with_input: input: treatment="seqnado_output/aligned/{sample}_{treatment}.bam", control=lambda wc: get_control_file(wc, file_type="bam", allow_null=False), + control=lambda wc: get_control_file(wc, file_type="bam", allow_null=False), output: peaks="seqnado_output/peaks/macs/{sample}_{treatment}.bed", params: options=lambda wc: format_macs_options(wc, config["macs"]["callpeak"]), raw=lambda wc, output: output.peaks.replace(".bed", "_peaks.xls"), - raw=lambda wc, output: output.peaks.replace(".bed", "_peaks.xls"), basename=lambda wc, output: output.peaks.replace(".bed", ""), threads: 1 resources: - mem=lambda wildcards, attempt: f"{2 * 2 ** (attempt)}GB", mem=lambda wildcards, attempt: f"{2 * 2 ** (attempt)}GB", runtime="2h", log: @@ -71,7 +63,6 @@ rule macs2_with_input: """ macs2 callpeak -t {input.treatment} -c {input.control} -n {params.basename} {params.options} > {log} 2>&1 && cat {params.raw} | grep -v '^#' | grep -vE '^chr\\s+start\\s+end.*' | grep -v '^$' | cut -f 1-3 > {output.peaks} - cat {params.raw} | grep -v '^#' | grep -vE '^chr\\s+start\\s+end.*' | grep -v '^$' | cut -f 1-3 > {output.peaks} """ @@ -79,16 +70,15 @@ rule macs2_no_input: input: treatment="seqnado_output/aligned/{sample}_{treatment}.bam", control=lambda wc: get_control_file(wc, file_type="bam", allow_null=True), + control=lambda wc: get_control_file(wc, file_type="bam", allow_null=True), output: peaks="seqnado_output/peaks/macs/{sample}_{treatment}.bed", params: options=lambda wc: format_macs_options(wc, config["macs"]["callpeak"]), raw=lambda wc, output: output.peaks.replace(".bed", "_peaks.xls"), - raw=lambda wc, output: output.peaks.replace(".bed", "_peaks.xls"), basename=lambda wc, output: output.peaks.replace(".bed", ""), threads: 1 resources: - mem=lambda wildcards, attempt: f"{2 * 2 ** (attempt)}GB", mem=lambda wildcards, attempt: f"{2 * 2 ** (attempt)}GB", runtime="2h", log: @@ -97,7 +87,6 @@ rule macs2_no_input: """ macs2 callpeak -t {input.treatment} -n {params.basename} {params.options} > {log} 2>&1 && cat {params.raw} | grep -v '^#' | grep -vE '^chr\\s+start\\s+end.*' | grep -v '^$' | cut -f 1-3 > {output.peaks} - cat {params.raw} | grep -v '^#' | grep -vE '^chr\\s+start\\s+end.*' | grep -v '^$' | cut -f 1-3 > {output.peaks} """ @@ -105,6 +94,7 @@ rule homer_with_input: input: treatment="seqnado_output/tag_dirs/{sample}_{treatment}", control=lambda wc: get_control_file(wc, file_type="tag", allow_null=False), + control=lambda wc: get_control_file(wc, file_type="tag", allow_null=False), output: peaks="seqnado_output/peaks/homer/{sample}_{treatment}.bed", log: @@ -127,6 +117,7 @@ rule homer_no_input: input: treatment="seqnado_output/tag_dirs/{sample}_{treatment}", control=lambda wc: get_control_file(wc, file_type="tag", allow_null=True), + control=lambda wc: get_control_file(wc, file_type="tag", allow_null=True), output: peaks="seqnado_output/peaks/homer/{sample}_{treatment}.bed", log: @@ -149,6 +140,7 @@ rule lanceotron_with_input: input: treatment="seqnado_output/bigwigs/deeptools/unscaled/{sample}_{treatment}.bigWig", control=lambda wc: get_control_file(wc, file_type="bigwig", allow_null=False), + control=lambda wc: get_control_file(wc, file_type="bigwig", allow_null=False), output: peaks="seqnado_output/peaks/lanceotron/{sample}_{treatment}.bed", log: @@ -174,6 +166,7 @@ rule lanceotron_no_input: input: 
treatment="seqnado_output/bigwigs/deeptools/unscaled/{sample}_{treatment}.bigWig", control=lambda wc: get_control_file(wc, file_type="bigwig", allow_null=True), + control=lambda wc: get_control_file(wc, file_type="bigwig", allow_null=True), output: peaks="seqnado_output/peaks/lanceotron/{sample}_{treatment}.bed", log: diff --git a/seqnado/workflow/scripts/calculate_spikein_norm_factors.py b/seqnado/workflow/scripts/calculate_spikein_norm_factors.py index 621a96ef..a17698b0 100755 --- a/seqnado/workflow/scripts/calculate_spikein_norm_factors.py +++ b/seqnado/workflow/scripts/calculate_spikein_norm_factors.py @@ -1,75 +1,58 @@ import pandas as pd import pathlib -import pysam -from typing import List from loguru import logger # Set up logging logger.add(snakemake.log[0], level="INFO") - -def get_readcounts(bam_files: List[pathlib.Path]): - readcounts = {} - for bam_file in bam_files: - bam = pysam.AlignmentFile(bam_file, "rb") - readcounts[bam_file.stem] = bam.mapped - return pd.Series(readcounts) - - with logger.catch(): logger.info("Calculating normalization factors") - # Calculate readcounts for reference and spikein samples - bam_ref = [pathlib.Path(p) for p in snakemake.input.bam_ref] - bam_spikein = [pathlib.Path(p) for p in snakemake.input.bam_spikein] - readcounts_ref = get_readcounts(bam_ref) - readcounts_spikein = get_readcounts(bam_spikein) - - # Read in design matrix - df_design_raw = pd.read_csv(snakemake.input.design, sep=",").assign( - ip=lambda df: df["sample"] + "_" + df["antibody"] - )[["sample", "antibody", "ip", "control"]] - - df_design = df_design_raw.melt( - id_vars=["sample", "antibody"], var_name="type", value_name="name" - ) - - # Merge readcounts with design matrix - df_counts = pd.DataFrame( - [readcounts_ref, readcounts_spikein], index=["ref", "spikein"] - ).T - - df_counts = df_design.merge(df_counts, left_on="name", right_index=True) - - # Pivot for easier handling - df_counts = df_counts.pivot_table( - index=["sample", "antibody"], columns="type", values=["ref", "spikein"] - ) - df_counts.columns = ["_".join(col) for col in df_counts.columns.values] - df_counts = df_counts.reset_index() - - # Calculate normalization factors - df_counts = df_counts.assign( - reads_per_spikein_ip=lambda df: df["ref_ip"] / df["spikein_ip"], - reads_per_spikein_control=lambda df: df["ref_control"] / df["spikein_control"], - relative_signal=lambda df: df["reads_per_spikein_ip"] - / df["reads_per_spikein_control"], - reads_per_million_ip=lambda df: 10e6 / df["ref_ip"], - reads_per_million_control=lambda df: 10e6 / df["ref_control"], - norm_factor=lambda df: df["relative_signal"] * df["reads_per_million_ip"], - ) - - # Write out normalization factors - df_counts = df_counts.merge(df_design_raw, on=["sample", "antibody"]) - df_counts.to_csv(snakemake.output.normalisation_table, sep="\t", index=False) - - norm_ip = df_counts[["ip", "norm_factor"]].set_index("ip")["norm_factor"] - norm_control = ( - df_counts[["control", "norm_factor"]] - .assign(norm_factor=1) - .drop_duplicates() - .set_index("control")["norm_factor"] - ) - - norm = pd.concat([norm_ip, norm_control]) - norm.to_json(snakemake.output.normalisation_factors) \ No newline at end of file +# Read in stats +stats_files = snakemake.input + +all_readcounts = [] + +for stats in stats_files: + file_path = pathlib.Path(stats) + readcounts = pd.read_csv(file_path, sep="\t") + all_readcounts.append(readcounts) + + +df_counts = pd.concat(all_readcounts, ignore_index=True) +df_counts["sample_name"] = df_counts["sample"].str.split("_", 
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 8534cce9..ea30b3f0 100755
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -16,6 +16,9 @@
 @pytest.fixture(
+    scope="function",
+    params=["atac", "chip", "chip-rx", "rna", "rna-rx", "snp"],
+    autouse=True,
 )
 def assay(request):
     return request.param
@@ -163,7 +166,7 @@ def fastqs(test_data_path, assay) -> list[pathlib.Path]:
     path = test_data_path / "fastq"

     if not path.exists():
-        url = f"https://userweb.molbiol.ox.ac.uk/public/project/milne_group/asmith/ngs_pipeline/fastq.tar.gz"
+        url = f"https://userweb.molbiol.ox.ac.uk/public/project/milne_group/cchahrou/seqnado_reference/fastq.tar.gz"
         r = requests.get(url, stream=True)

         tar_path = path.with_suffix(".tar.gz")
@@ -188,6 +191,8 @@ def fastqs(test_data_path, assay) -> list[pathlib.Path]:
         case "rna-rx":
             files = list(path.glob("rna-spikein*.fastq.gz"))
+        case "snp":
+            files = list(path.glob("snp*.fastq.gz"))

     return files
@@ -278,6 +283,11 @@ def user_inputs(
         "call_snps": "no",
     }

+    defaults_snp = {
+        "remove_pcr_duplicates": "no",
+        "call_snps": "no",
+    }
+
     hub = {
         "make_ucsc_hub": "yes",
         "UCSC_hub_directory": "test_hub",
@@ -298,6 +308,8 @@ def user_inputs(
         case "rna-rx":
             return {**defaults, **defaults_rna_rx, **hub}
+        case "snp":
+            return {**defaults, **defaults_snp, **hub}
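Note (editorial, not part of the patch): the snp arm above is a plain dict merge, in which the right-most mapping wins, so the snp defaults override the shared ones. The defaults and hub contents here are abridged stand-ins for the fixture's full dictionaries.

defaults = {"remove_pcr_duplicates": "yes"}
defaults_snp = {"remove_pcr_duplicates": "no", "call_snps": "no"}
hub = {"make_ucsc_hub": "yes", "UCSC_hub_directory": "test_hub"}

answers = {**defaults, **defaults_snp, **hub}
print(answers)
# {'remove_pcr_duplicates': 'no', 'call_snps': 'no',
#  'make_ucsc_hub': 'yes', 'UCSC_hub_directory': 'test_hub'}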
config["peak_calling_method"] = ["lanceotron"] config["library_complexity"] = False config["bowtie2"]["options"] = "--no-mixed --no-discordant" elif assay == "chip-rx": config["call_peaks"] = True config["peak_calling_method"] = ["seacr"] + elif assay == "atac": + config["call_peaks"] = True + config["peak_calling_method"] = ["homer", "lanceotron", "macs"] with open(config_yaml, "w") as f: yaml.dump(config, f) @@ -364,6 +379,7 @@ def design(seqnado_run_dir, assay_type, assay): # Add merge column to design file import pandas as pd + df = pd.read_csv(seqnado_run_dir / "design.csv", index_col=0) df["merge"] = "MLL-MERGED-TOGETHER" df.to_csv(seqnado_run_dir / "design.csv")