alsmith151 · alsmith151 · Apr 9, 2024 · Apr 8, 2024 · Apr 9, 2024 · Apr 9, 2024
diff --git a/docs/faq.md b/docs/faq.md
@@ -20,4 +20,21 @@ remote has no library client (see https://apptainer.org/docs/user/latest/endpoin
 Fix:
 
 apptainer remote add --no-login SylabsCloud cloud.sylabs.io  
-apptainer remote use SylabsCloud  
+apptainer remote use SylabsCloud  
+
+
+## Optional configuration
+
+### Can I merge multiple samples into a single sample?
+
+Yes, you can merge multiple samples into a single sample to generate merged bigWig files and consensus peaks. To do this, create a design file with a column named "merge" that assigns each sample to a group; samples that share the same "merge" value are merged together, e.g.:
+
+
+| sample | r1 | r2 | deseq2 | merge |
+|--------|----|----|--------|-------|
+| rna1 | /tmp/pytest-of-asmith/pytest-7/data2/2024-01-13_rna_test/rna1_1.fastq.gz | /tmp/pytest-of-asmith/pytest-7/data2/2024-01-13_rna_test/rna1_2.fastq.gz | control | control |
+| rna2 | /tmp/pytest-of-asmith/pytest-7/data2/2024-01-13_rna_test/rna2_1.fastq.gz | /tmp/pytest-of-asmith/pytest-7/data2/2024-01-13_rna_test/rna2_2.fastq.gz | control | control |
+| rna3 | /tmp/pytest-of-asmith/pytest-7/data2/2024-01-13_rna_test/rna3_1.fastq.gz | /tmp/pytest-of-asmith/pytest-7/data2/2024-01-13_rna_test/rna3_2.fastq.gz | control | control |
+| rna4 | /tmp/pytest-of-asmith/pytest-7/data2/2024-01-13_rna_test/rna4_1.fastq.gz | /tmp/pytest-of-asmith/pytest-7/data2/2024-01-13_rna_test/rna4_2.fastq.gz | treated | treated |
+| rna5 | /tmp/pytest-of-asmith/pytest-7/data2/2024-01-13_rna_test/rna5_1.fastq.gz | /tmp/pytest-of-asmith/pytest-7/data2/2024-01-13_rna_test/rna5_2.fastq.gz | treated | treated |
+| rna6 | /tmp/pytest-of-asmith/pytest-7/data2/2024-01-13_rna_test/rna6_1.fastq.gz | /tmp/pytest-of-asmith/pytest-7/data2/2024-01-13_rna_test/rna6_2.fastq.gz | treated | treated |
diff --git a/seqnado/cli.py b/seqnado/cli.py
@@ -53,12 +53,30 @@
     """
     import pathlib
     from seqnado.design import Design, DesignIP, FastqFile, FastqFileIP
-
+    
     if not files:
-        files = list(pathlib.Path(".").glob("*.fastq.gz"))
+        potential_file_locations = [
+            ".",
+            "fastqs",
+            "fastq",
+            "data",
+            "data/fastqs",
+        ]
+
+        for location in potential_file_locations:
+            files = list(pathlib.Path(location).glob("*.fastq.gz"))
+            if files:
+                break
 
         if not files:
-            raise ValueError("No fastq files provided or found in current directory.")
+            logger.error("No fastq files provided or found in current directory")
+            logger.error(f"""
+                         Fastq files can be provided as arguments or found in the following directories:
+                         {potential_file_locations}
+                         """)
+            raise ValueError("No fastq files provided or found in current directory" )
+
+
 
     if not method == "chip":
         design = Design.from_fastq_files([FastqFile(path=fq) for fq in files])
@@ -92,14 +110,26 @@
             """,
     type=click.Choice(choices=["lc", "ls", "ss"]),
 )
+@click.option(
+    "--clean-symlinks",
+    is_flag=True,
+    help="Remove symlinks created by previous runs. Useful for re-running pipeline after misconfiguration.",
+)
+@click.option(
+    "-v",
+    "--verbose",
+    is_flag=True,
+    help="Increase logging verbosity",
+)
 @click.argument("pipeline_options", nargs=-1, type=click.UNPROCESSED)
 def cli_pipeline(
     method,
     pipeline_options,
     help=False,
     preset="local",
     version=False,
-    apptainer_args="",
+    verbose=False,
+    clean_symlinks=False,
 ):
     """Runs the data processing pipeline"""
 
@@ -113,9 +143,24 @@
         _version = version("seqnado")
         print(f"SeqNado version {_version}")
         sys.exit(0)
+
+    if verbose:
+        logger.remove()
+        logger.add(sys.stderr, level="DEBUG")
+    else:
+        logger.remove()
+        logger.add(sys.stderr, level="INFO")
 
     pipeline_options, cores = extract_cores_from_options(pipeline_options)
 
+    # Removes old symlinks if requested
+    if clean_symlinks:
+        logger.info("Cleaning symlinks")
+        links = pathlib.Path("seqnado_output/fastqs").glob("*")
+        for link in links:
+            if link.is_symlink():
+                link.unlink()
+
     cmd = [
         "snakemake",
         "-c",

diff --git a/seqnado/design.py b/seqnado/design.py
@@ -1,17 +1,15 @@
+import os
 import pathlib
 import re
-from typing import Any, Dict, List, Optional, Union, Literal, LiteralString
 import sys
+from typing import Any, Dict, List, Literal, LiteralString, Optional, Union
 
+import numpy as np
 import pandas as pd
 from loguru import logger
-from pydantic import BaseModel, Field, computed_field
+from pydantic import BaseModel, Field, computed_field, field_validator
 from snakemake.io import expand
 
-
-logger.add(sink=sys.stderr, level="WARNING")
-
-
 def is_path(path: Optional[Union[str, pathlib.Path]]) -> Optional[pathlib.Path]:
     if isinstance(path, str):
         p = pathlib.Path(path)
@@ -32,7 +30,7 @@
     def model_post_init(self, *args):
         self.path = pathlib.Path(self.path).resolve()
 
-        if not self.path.exists():
+        if not self.path.exists() or str(self.path) in ["-", ".", "", None]:
             raise FileNotFoundError(f"{self.path} does not exist.")
 
     @computed_field
@@ -257,6 +255,19 @@
             )
 
 
+class Metadata(BaseModel):
+    deseq2: Optional[str] = None
+    merge: Optional[str] = None
+    scale_group: Union[str, int] = "all"
+
+    @field_validator("deseq2", "merge")
+    @classmethod
+    def prevent_none(cls, v):
+        none_vals = [None, "None", "none", "null", "Null", "NULL", ".", "", "NA", np.nan]
+        if any([v == n for n in none_vals]):
+            assert v is not None, "None is not allowed when setting metadata"
+        return v
+
 class Design(BaseModel):
     assays: Dict[str, AssayNonIP] = Field(
         default_factory=dict,
@@ -317,14 +328,19 @@
         for assay_name, row in df.iterrows():
             if simplified:
                 metadata = {}
+
                 for k, v in row.items():
                     if k not in ["r1", "r2"]:
                         metadata[k] = v
+
+                # Validate the metadata
+                metadata = Metadata(**metadata)
+
                 assays[assay_name] = AssayNonIP(
                     name=assay_name,
                     r1=FastqFile(path=row["r1"]),
                     r2=FastqFile(path=row["r2"]) if row["r2"] else None,
-                    metadata=metadata,
+                    metadata=metadata.model_dump(exclude_none=True),
                 )
             else:
                 raise NotImplementedError("Not implemented")
@@ -424,7 +440,13 @@
         for experiment in self.assays.values():
 
             name_ip = experiment.name
-            name_control = f"{experiment.control_files.r1.sample_base_without_ip}_{experiment.control_files.r1.ip}"
+
+            try:
+                control_base = experiment.control_files.r1.sample_base_without_ip
+                control_ip = experiment.control_files.r1.ip
+                name_control = f"{control_base}_{control_ip}"
+            except AttributeError:
+                name_control = None
 
             if name_to_query == name_ip or name_to_query == name_control:
                 if control is not None:
@@ -559,6 +581,9 @@
                         "control",
                     ]:
                         metadata[k] = v
+
+                # Validate the metadata
+                metadata = Metadata(**metadata)
 
                 # Add the experiment
                 ip = row["ip"]
@@ -577,7 +602,7 @@
                         ),
                         ip=ip,
                         control=None,
-                        metadata=metadata,
+                        metadata=metadata.model_dump(exclude_none=True),
                     )
                 else:
                     experiments[experiment_name] = ExperimentIP(
@@ -601,7 +626,7 @@
                         ),
                         ip=ip,
                         control=control,
-                        metadata=metadata,
+                        metadata=metadata.model_dump(exclude_none=True),
                     )
             else:
                 raise NotImplementedError("Not implemented")
@@ -743,7 +768,7 @@
         Literal["deeptools", "homer"], List[Literal["deeptools", "homer"]]
     ] = None
     make_bigwigs: bool = False
-    scale_method: Optional[Literal["cpm", "rpkm", "spikein", "csaw", "grouped"]] = None
+    scale_method: Optional[Literal["cpm", "rpkm", "spikein", "csaw", "merged"]] = None
     prefix: Optional[str] = "seqnado_output/bigwigs/"
 
     def model_post_init(self, __context: Any) -> None:
@@ -798,11 +823,12 @@
         List[Literal["macs", "homer", "lanceotron", "seacr"]],
     ] = None
     call_peaks: bool = False
+    prefix: Optional[str] = "seqnado_output/peaks/"
 
     @property
     def peak_files(self) -> List[str]:
         return expand(
-            "seqnado_output/peaks/{method}/{sample}.bed",
+            self.prefix + "{method}/{sample}.bed",
             sample=self.names,
             method=self.peak_calling_method,
         )
@@ -883,7 +909,7 @@
    sample_names: List[str]

    make_bigwigs: bool = False
    pileup_method: Optional[
        Union[Literal["deeptools", "homer"], List[Literal["deeptools", "homer"]]]
    ] = None
    scale_method: Optional[Literal["cpm", "rpkm", "spikein", "csaw"]] = None
@@ -922,8 +948,8 @@
                 assay=self.assay,
                 names=self.design_dataframe["merge"].unique().tolist(),
                 make_bigwigs=self.make_bigwigs,
-                pileup_method=self.pileup_method,
-                scale_method="rpkm",
+                pileup_method="deeptools",
+                scale_method="merged",
             )
 
             files = bwf_samples.files + bwf_merged.files
@@ -1009,7 +1035,7 @@
 class NonRNAOutput(Output):
    assay: Union[Literal["ChIP"], Literal["ATAC"]]
    call_peaks: bool = False
    peak_calling_method: Optional[Union[
        Literal["macs", "homer", "lanceotron", False],
        List[Literal["macs", "homer", "lanceotron"]],
    ]] = None
@@ -1024,7 +1050,8 @@
             assay=self.assay,
             names=self.design_dataframe["merge"].unique().tolist(),
             call_peaks=self.call_peaks,
-            peak_calling_method=self.peak_calling_method,
+            peak_calling_method="lanceotron",
+            prefix="seqnado_output/peaks/merged/",
         )
 
     @computed_field
@@ -1080,7 +1107,7 @@
    ip_names: List[str]
    control_names: List[str]
    call_peaks: bool = False
    peak_calling_method: Optional[Union[
        Literal["macs", "homer", "lanceotron", "seacr", False],
        List[Literal["macs", "homer", "lanceotron", "seacr"]],
    ]] = None

diff --git a/seqnado/helpers.py b/seqnado/helpers.py
@@ -3,6 +3,9 @@
 import numpy as np
 import shlex
 
+from loguru import logger
+
+
 from seqnado.design import Design, DesignIP
 
 
@@ -19,7 +22,6 @@
     """
     Extract the number of cores from the snakemake options.
     """
-    from loguru import logger
 
     try:
         cores_flag = options.index("-c")
@@ -62,12 +64,16 @@
     """
     Create a symlink in the output directory with the new file name.
     """
+
     new_path = output_dir / new_file_name
     if not new_path.exists() and source_path.is_file():
-        try:
+        logger.debug(f"Symlinking {source_path} to {output_dir / new_file_name}")
+        if str(source_path) in [".", "..", "", None, "None"]:
+            logger.warning(f"Source path is empty for {new_file_name}. Will not symlink.")
+
+        else:
             new_path.symlink_to(source_path.resolve())
-        except FileExistsError:
-            print(f"Symlink for {new_path} already exists.")
+            logger.debug(f"Symlinked {source_path} to {output_dir / new_file_name} successfully.")
 
 
 def symlink_fastq_files(

diff --git a/seqnado/workflow/rules/alignment_post_processing.smk b/seqnado/workflow/rules/alignment_post_processing.smk
@@ -203,14 +203,19 @@ rule move_bam_to_final_location:
 def get_bam_files_for_merge(wildcards):
     from seqnado.design import NormGroups
     norm_groups = NormGroups.from_design(DESIGN, subset_column="merge")
-    return norm_groups.get_sample_group(wildcards.group)
+
+    sample_names = norm_groups.get_grouped_samples(wildcards.group)
+
+    return [
+        f"seqnado_output/aligned/{sample}.bam" for sample in sample_names
+    ]
 
 
 rule merge_bams:
     input:
         bams=get_bam_files_for_merge,
     output:
-        temp("seqnado_output/aligned/grouped/{group}.bam"),
+        temp("seqnado_output/aligned/merged/{group}.bam"),
     threads: 8
     log:
         "seqnado_output/logs/merge_bam/{group}.log",
@@ -222,9 +227,9 @@ rule merge_bams:
 
 use rule index_bam as index_consensus_bam with:
     input:
-        bam="seqnado_output/aligned/grouped/{group}.bam",
+        bam="seqnado_output/aligned/merged/{group}.bam",
     output:
-        bai="seqnado_output/aligned/grouped/{group}.bam.bai",
+        bai="seqnado_output/aligned/merged/{group}.bam.bai",
     threads: 8
 
 

diff --git a/seqnado/workflow/rules/peak_call_grouped.smk b/seqnado/workflow/rules/peak_call_grouped.smk
@@ -1,10 +1,16 @@
+from seqnado.helpers import check_options
 
 rule lanceotron_no_input_consensus:
     input:
-        bigwig="seqnado_output/bigwigs/deeptools/grouped/{group}.bigWig",
+        bigwig="seqnado_output/bigwigs/deeptools/merged/{group}.bigWig",
     output:
-        peaks="seqnado_output/peaks/lanceotron/grouped/{group}.bed",
+        peaks="seqnado_output/peaks/merged/lanceotron/{group}.bed",
     threads: 8
+    params:
+        outdir="seqnado_output/peaks/merged/lanceotron",
+        options=check_options(config["lanceotron"]["callpeak"])
+    container:
+        "library://asmith151/seqnado/seqnado_extra:latest"
     log:
         "seqnado_output/logs/lanceotron/{group}.log",
     shell:

diff --git a/seqnado/workflow/rules/pileup_grouped.smk b/seqnado/workflow/rules/pileup_grouped.smk
@@ -1,10 +1,10 @@
 
 use rule deeptools_make_bigwigs as deeptools_make_bigwigs_consensus with:
     input:
-        bam="seqnado_output/aligned/grouped/{group}.bam",
-        bai="seqnado_output/aligned/grouped/{group}.bam.bai",
+        bam="seqnado_output/aligned/merged/{sample}.bam",
+        bai="seqnado_output/aligned/merged/{sample}.bam.bai",
     output:
-        bigwig="seqnado_output/bigwigs/deeptools/grouped/{group}.bigWig",
+        bigwig="seqnado_output/bigwigs/deeptools/merged/{sample}.bigWig",
     threads: 8
     log:
-        "seqnado_output/logs/bigwigs/{group}.log",
+        "seqnado_output/logs/bigwigs/{sample}.log",
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
@@ -348,7 +348,14 @@ def design(seqnado_run_dir, assay_type, assay):
     completed = subprocess.run(" ".join(cmd), shell=True, cwd=seqnado_run_dir)
     assert completed.returncode == 0
 
-    if assay == "rna-rx":
+    if assay == "chip":
+        # Add merge column to design file
+        import pandas as pd
+        df = pd.read_csv(seqnado_run_dir / "design.csv", index_col=0)
+        df["merge"] = df.index.str.split("-").str[-1]
+        df.to_csv(seqnado_run_dir / "design.csv")
+
+    elif assay == "rna-rx":
         # Add deseq2 column to design file
         import pandas as pd