feat(pipeline): Added -s option to allow for scaling resources (memory and time) (#202)

* Refactor pyproject.toml to remove drmaa dependency * feat: add container def file * feat: Add option to scale memory and time resources for pipeline * Add seqnado.sif to .gitignore * Refactor assay-specific Snakefiles to include SCALE_RESOURCES variable The commit refactors the assay-specific Snakefiles (`snakefile_atac`, `snakefile_chip`, `snakefile_snp`) to include the `SCALE_RESOURCES` variable. This variable is used to scale memory and time resources for the pipeline. The default value is set to 1, but it can be customized by setting the `SCALE_RESOURCES` environment variable. Co-authored-by: alsmith <[email protected]> * Refactor Snakefiles to include SCALE_RESOURCES variable * Refactor define_memory_requested and define_time_requested functions * Refactor assay-specific Snakefiles to include SCALE_RESOURCES variable * Refactor define_memory_requested and define_time_requested functions * Refactor define_memory_requested and define_time_requested functions to fix calculation bug * Refactor peak_call_grouped.smk to use define_memory_requested function for memory resource calculation * Refactor peak_call_grouped.smk to use define_memory_requested function for memory resource calculation * Refactor peak_call_grouped.smk to use define_memory_requested function for memory resource calculation * Ensure that the define_*_requested functions are imported in all rules
alsmith151 · Jul 25, 2024 · 6c9fdcc · 6c9fdcc
1 parent 6230dd9
commit 6c9fdcc
Show file tree

Hide file tree

Showing 26 changed files with 168 additions and 92 deletions.
diff --git a/.gitignore b/.gitignore
@@ -19,3 +19,4 @@ test_rna_size_factors.ipynb
 tests/data/*
 pytestdebug.log
 sps*
+seqnado.sif
diff --git a/pyproject.toml b/pyproject.toml
@@ -19,7 +19,6 @@ dynamic = ["version"]
 dependencies = [
     "click",
     "cookiecutter",
-    "drmaa",
     "pandas",
     "pandera",
     "pulp<=2.7.0",

diff --git a/seqnado.def b/seqnado.def
@@ -0,0 +1,39 @@
+BootStrap: docker
+From: mambaorg/micromamba:bookworm
+
+%post
+
+  # Build steps for the SeqNado container: install system tools, Apptainer,
+  # a Python environment via micromamba, then SeqNado itself from GitHub.
+
+  # Pre set up: work from /opt and install the download/build tools used below
+  cd /opt/
+  apt update
+  apt install -y curl gcc git cmake make wget
+
+  # Install Apptainer from the pinned v1.3.3 release .deb, then register
+  # SylabsCloud (cloud.sylabs.io) and select it as the active remote
+  wget https://github.com/apptainer/apptainer/releases/download/v1.3.3/apptainer_1.3.3_amd64.deb
+  apt install -y ./apptainer_1.3.3_amd64.deb
+  apptainer remote add --no-login SylabsCloud cloud.sylabs.io
+  apptainer remote use SylabsCloud
+
+
+
+  # Mamba packages: only python and pip are installed into the base
+  # environment; the environment.yml based install below is disabled
+  #micromamba install -y -n base -f /opt/environment.yml
+  micromamba install -y -n base -c conda-forge python pip
+  # Put /opt/conda/bin first on PATH for the remaining build steps
+  # (presumably where the base image keeps the micromamba base env - confirm)
+  export PATH="/opt/conda/bin:$PATH"
+
+  # Install SeqNado from source with the interpreter under /opt/conda.
+  # NOTE(review): the clone is not pinned to a tag or commit, so the image
+  # contains whatever the repository's default branch holds at build time.
+  git clone https://github.com/alsmith151/SeqNado.git
+  cd SeqNado
+  /opt/conda/bin/python -m pip install .
+
+  # Clean: drop package-manager caches, then delete static libraries,
+  # bytecode files and JS source maps under /opt/conda (presumably to keep
+  # the image small). The package list below is a disabled leftover.
+  #curl gcc git cmake make libtool g++ pkgconfig openssl-dev linux-headers
+  micromamba clean -afy
+  /opt/conda/bin/python -m pip cache purge
+  find /opt/conda/ -follow -type f -name '*.a' -delete
+  find /opt/conda/ -follow -type f -name '*.pyc' -delete
+  find /opt/conda/ -follow -type f -name '*.js.map' -delete
+
+
+%environment
+  # Runtime environment: expose the executables in /opt/conda/bin on PATH
+  export PATH=/opt/conda/bin:$PATH
diff --git a/seqnado/cli.py b/seqnado/cli.py
@@ -113,6 +113,13 @@ def cli_design(method, files, output="design.csv"):
     is_flag=True,
     help="Remove symlinks created by previous runs. Useful for re-running pipeline after misconfiguration.",
 )
+@click.option(
+    '-s',
+    '--scale-resources',
+    help="Scale factor the memory and time resources for the pipeline",
+    default=1.0,
+    type=float
+)
 @click.option(
     "-v",
     "--verbose",
@@ -128,6 +135,7 @@ def cli_pipeline(
     version=False,
     verbose=False,
     clean_symlinks=False,
+    scale_resources=1.0,
 ):
     """Runs the data processing pipeline"""
 
@@ -151,6 +159,9 @@ def cli_pipeline(
 
     pipeline_options, cores = extract_cores_from_options(pipeline_options)
 
+    # Scale the memory and time resources
+    os.environ["SCALE_RESOURCES"] = str(scale_resources)
+
     # Removes old symlinks if requested
     if clean_symlinks:
         logger.info("Cleaning symlinks")

diff --git a/seqnado/helpers.py b/seqnado/helpers.py
@@ -1,4 +1,4 @@
-from typing import Dict, Union, Optional, List, Tuple
+from typing import Dict, Union, Optional, List, Tuple, Any
 import pathlib
 import numpy as np
 import shlex
@@ -58,6 +58,25 @@ def extract_apptainer_args(options: List[str]) -> Tuple[List[str], str]:
     return options, apptainer_args
 
 
+def define_memory_requested(attempts: int = 1, initial_value: int = 1, scale: float = 1) -> str:
+    """
+    Define the memory requested for the job.
+
+    The request starts at ``initial_value`` gigabytes, doubles on every
+    attempt after the first and is finally multiplied by ``scale``. The
+    result is rounded up to a whole number of gigabytes so the returned
+    string never contains a decimal point (``"4G"`` rather than ``"4.0G"``),
+    which matters when the value is handed to a scheduler option that only
+    accepts integer sizes.
+
+    Args:
+        attempts: 1-based attempt number of the job; coerced with ``int``.
+        initial_value: Memory in gigabytes for the first attempt; coerced
+            with ``int``.
+        scale: Multiplier applied after the doubling; coerced with ``float``.
+
+    Returns:
+        The memory request as an integer followed by ``"G"``, e.g. ``"8G"``.
+    """
+    # Function-scope import keeps this block self-contained.
+    import math
+
+    memory = int(initial_value) * 2 ** (int(attempts) - 1)
+    # Multiplying by a float always yields a float; round up rather than
+    # down so a fractional scale never requests less than was asked for.
+    memory = math.ceil(memory * float(scale))
+    return f"{memory}G"
+
+def define_time_requested(attempts: int = 1, initial_value: int = 1, scale: float = 1) -> str:
+    """
+    Build the runtime request string for a job.
+
+    The request starts at ``initial_value`` hours (1 hour by default),
+    doubles for each attempt after the first and is then multiplied by
+    ``scale``.
+
+    Returns the resulting number of hours followed by "h". Because the
+    value is multiplied by a float it is always rendered with a decimal
+    part, e.g. "4.0h".
+    """
+    doublings = int(attempts) - 1
+    hours = float(scale) * (int(initial_value) * 2 ** doublings)
+    return f"{hours}h"
+
+
 def symlink_file(
     output_dir: pathlib.Path, source_path: pathlib.Path, new_file_name: str
 ):

diff --git a/seqnado/workflow/rules/align.smk b/seqnado/workflow/rules/align.smk
@@ -1,4 +1,4 @@
-from seqnado.helpers import check_options
+from seqnado.helpers import check_options, define_time_requested, define_memory_requested
 
 
 
@@ -13,8 +13,8 @@ rule align_paired:
         bam=temp("seqnado_output/aligned/raw/{sample}.bam"),
     threads: config["bowtie2"]["threads"]
     resources:
-        runtime=lambda wildcards, attempt: f"{4 * 2 ** (attempt - 1)}h",
-        mem=lambda wildcards, attempt: f"{4 * 2 ** (attempt - 1)}GB",
+        runtime=lambda wildcards, attempt: define_time_requested(initial_value=4, attempts=attempt, scale=SCALE_RESOURCES),
+        mem=lambda wildcards, attempt: define_memory_requested(initial_value=4, attempts=attempt, scale=SCALE_RESOURCES),
     log:
         "seqnado_output/logs/align/{sample}.log",
     shell:
@@ -34,8 +34,8 @@ rule align_single:
     output:
         bam=temp("seqnado_output/aligned/raw/{sample}.bam"),
     resources:
-        runtime=lambda wildcards, attempt: f"{4 * 2 ** (attempt - 1)}h",
-        mem=lambda wildcards, attempt: f"{4 * 2 ** (attempt - 1)}GB",
+        runtime=lambda wildcards, attempt: define_time_requested(initial_value=4, attempts=attempt, scale=SCALE_RESOURCES),
+        mem=lambda wildcards, attempt: define_memory_requested(initial_value=4, attempts=attempt, scale=SCALE_RESOURCES),
     threads: config["bowtie2"]["threads"]
     log:
         "seqnado_output/logs/align/{sample}.log",

diff --git a/seqnado/workflow/rules/align_rna.smk b/seqnado/workflow/rules/align_rna.smk
@@ -1,4 +1,4 @@
-from seqnado.helpers import check_options
+from seqnado.helpers import check_options, define_memory_requested, define_time_requested
 
 rule align_paired:
     input:
@@ -15,8 +15,8 @@ rule align_paired:
         ),
     threads: config["star"]["threads"]
     resources:
-        mem="35GB",
-        runtime="6h",
+        mem=lambda wildcards, attempt: define_memory_requested(initial_value=35, attempts=attempt, scale=SCALE_RESOURCES),
+        runtime=lambda wildcards, attempt: define_time_requested(initial_value=6, attempts=attempt, scale=SCALE_RESOURCES),
     log:
         "seqnado_output/logs/align/{sample}.log",
     shell:

diff --git a/seqnado/workflow/rules/alignment_counts.smk b/seqnado/workflow/rules/alignment_counts.smk
@@ -1,4 +1,4 @@
-from seqnado.helpers import check_options
+from seqnado.helpers import check_options, define_time_requested, define_memory_requested
 
 rule feature_counts:
     input:
@@ -11,8 +11,8 @@ rule feature_counts:
         options=check_options(config["featurecounts"]["options"]),
     threads: config["featurecounts"]["threads"]
     resources:
-        mem=lambda wildcards, attempt: f"{3 * 2 ** (attempt)}GB",
-        runtime="2h",
+        mem=lambda wildcards, attempt: define_memory_requested(initial_value=3, attempts=attempt, scale=SCALE_RESOURCES),
+        runtime=lambda wildcards, attempt: define_time_requested(initial_value=2, attempts=attempt, scale=SCALE_RESOURCES),
     log:
         "seqnado_output/logs/readcounts/featurecounts/featurecounts.log",
     shell:
@@ -41,8 +41,8 @@ rule salmon_counts_paired:
         options=check_options(config["salmon"]["options"]),
     threads: config["salmon"]["threads"]
     resources:
-        mem=lambda wildcards, attempt: f"{3 * 2 ** (attempt)}GB",
-        runtime="2h",
+        mem=lambda wildcards, attempt: define_memory_requested(initial_value=3, attempts=attempt, scale=SCALE_RESOURCES),
+        runtime=lambda wildcards, attempt: define_time_requested(initial_value=2, attempts=attempt, scale=SCALE_RESOURCES),
     log:
         "seqnado_output/logs/readcounts/salmon/salmon_{sample}.log",
     shell:
@@ -61,8 +61,8 @@ rule salmon_counts_single:
         options=check_options(config["salmon"]["options"]),
     threads: config["salmon"]["threads"]
     resources:
-        mem=lambda wildcards, attempt: f"{3 * 2 ** (attempt)}GB",
-        runtime="2h",
+        mem=lambda wildcards, attempt: define_memory_requested(initial_value=3, attempts=attempt, scale=SCALE_RESOURCES),
+        runtime=lambda wildcards, attempt: define_time_requested(initial_value=2, attempts=attempt, scale=SCALE_RESOURCES),
     log:
         "seqnado_output/logs/readcounts/salmon/salmon_{sample}.log",
     shell:

diff --git a/seqnado/workflow/rules/alignment_post_processing.smk b/seqnado/workflow/rules/alignment_post_processing.smk
@@ -1,4 +1,4 @@
-from seqnado.helpers import check_options
+from seqnado.helpers import check_options, define_time_requested, define_memory_requested
 
 
 rule sort_bam:
@@ -7,7 +7,7 @@ rule sort_bam:
     output:
         bam=temp("seqnado_output/aligned/sorted/{sample}.bam"),
     resources:
-        mem=lambda wildcards, attempt: f"{4 * 2 ** (attempt - 1)}GB",
+        mem=lambda wildcards, attempt: define_memory_requested(initial_value=4, attempts=attempt, scale=SCALE_RESOURCES),
     threads: 8
     log:
         "seqnado_output/logs/sorted/{sample}.log",
@@ -26,7 +26,7 @@ rule index_bam:
         bai=temp("seqnado_output/aligned/sorted/{sample}.bam.bai"),
     threads: 1
     resources:
-        mem=lambda wildcards, attempt: f"{2 * 2 ** (attempt - 1)}GB",
+        mem=lambda wildcards, attempt: define_memory_requested(initial_value=2, attempts=attempt, scale=SCALE_RESOURCES),
     shell:
         "samtools index -@ {threads} -b {input.bam}"
 
@@ -46,8 +46,8 @@ if config["remove_blacklist"] and os.path.exists(config.get("blacklist", "")):
         params:
             blacklist=check_options(config["blacklist"]),
         resources:
-            mem="5GB",
-            runtime="4h",
+            mem=lambda wildcards, attempt: define_memory_requested(initial_value=5, attempts=attempt, scale=SCALE_RESOURCES),
+            runtime=lambda wildcards, attempt: define_time_requested(initial_value=4, attempts=attempt, scale=SCALE_RESOURCES),
         log:
             "seqnado_output/logs/blacklist/{sample}.log",
         shell:
@@ -72,7 +72,7 @@ else:
             ),
         threads: 1
         resources:
-            mem="1GB",
+            mem=lambda wildcards, attempt: define_memory_requested(initial_value=1, attempts=attempt, scale=SCALE_RESOURCES),
         log:
             "seqnado_output/logs/blacklist/{sample}.log",
         shell:
@@ -99,8 +99,8 @@ if config["remove_pcr_duplicates_method"] == "picard":
         params:
             options=check_options(config["picard"]["options"]),
         resources:
-            mem="5GB",
-            runtime="4h",
+            mem=lambda wildcards, attempt: define_memory_requested(initial_value=5, attempts=attempt, scale=SCALE_RESOURCES),
+            runtime=lambda wildcards, attempt: define_time_requested(initial_value=4, attempts=attempt, scale=SCALE_RESOURCES),
         log:
             "seqnado_output/logs/duplicates/{sample}.log",
         shell:

diff --git a/seqnado/workflow/rules/exogenous_norm.smk b/seqnado/workflow/rules/exogenous_norm.smk
@@ -5,14 +5,14 @@ use rule align_paired as align_paired_spikein with:
     output:
         bam=temp("seqnado_output/aligned/spikein/raw/{sample}.bam"),
     resources:
-        mem=lambda wildcards, attempt: f"{8 * 2 ** (attempt - 1)}GB",
+        mem=lambda wildcards, attempt: define_memory_requested(initial_value=8, attempts=attempt, scale=SCALE_RESOURCES),
 
 
 use rule align_single as align_single_spikein with:
     output:
         bam=temp("seqnado_output/aligned/spikein/raw/{sample}.bam"),
     resources:
-        mem=lambda wildcards, attempt: f"{8 * 2 ** (attempt - 1)}GB",
+        mem=lambda wildcards, attempt: define_memory_requested(initial_value=8, attempts=attempt, scale=SCALE_RESOURCES),
 
 
 use rule sort_bam as sort_bam_spikein with:
@@ -21,7 +21,7 @@ use rule sort_bam as sort_bam_spikein with:
     output:
         bam=temp("seqnado_output/aligned/spikein/sorted/{sample}.bam"),
     resources:
-        mem=lambda wildcards, attempt: f"{8 * 2 ** (attempt - 1)}GB",
+        mem=lambda wildcards, attempt: define_memory_requested(initial_value=8, attempts=attempt, scale=SCALE_RESOURCES),
     log:
         "seqnado_output/logs/aligned_spikein/{sample}_sort.log",
 

diff --git a/seqnado/workflow/rules/fastq_screen.smk b/seqnado/workflow/rules/fastq_screen.smk
@@ -1,3 +1,6 @@
+from seqnado.helpers import check_options, define_time_requested, define_memory_requested
+
+
 
 rule fastq_screen_paired:
     input:
@@ -54,7 +57,7 @@ rule multiqc_fastqscreen:
     log:
         "seqnado_output/logs/multiqc_fastqscreen.log",
     resources:
-        mem=lambda wildcards, attempt: f"{2 * 2**attempt}GB",
+        mem=lambda wildcards, attempt: define_memory_requested(initial_value=2, attempts=attempt, scale=SCALE_RESOURCES),
     shell:
         "multiqc -o seqnado_output/qc -n full_fastqscreen_report.html --force seqnado_output/qc/fastq_screen > {log} 2>&1"
 

diff --git a/seqnado/workflow/rules/fastq_trim.smk b/seqnado/workflow/rules/fastq_trim.smk
@@ -1,4 +1,4 @@
-from seqnado.helpers import check_options
+from seqnado.helpers import check_options, define_time_requested, define_memory_requested
 
 
 rule trimgalore_paired:
@@ -11,8 +11,8 @@ rule trimgalore_paired:
         trimmed2=temp("seqnado_output/trimmed/{sample}_2.fastq.gz"),
     threads: 4
     resources:
-        mem="2GB",
-        runtime="4h",
+        mem=lambda wildcards, attempt: define_memory_requested(initial_value=2, attempts=attempt, scale=SCALE_RESOURCES),
+        runtime=lambda wildcards, attempt: define_time_requested(initial_value=4, attempts=attempt, scale=SCALE_RESOURCES),
     params:
         options=check_options(config["trim_galore"]["options"]),
         trim_dir="seqnado_output/trimmed",
@@ -34,8 +34,8 @@ rule trimgalore_single:
         trimmed=temp("seqnado_output/trimmed/{sample}.fastq.gz"),
     threads: 4
     resources:
-        mem="2GB",
-        runtime="2h",
+        mem=lambda wildcards, attempt: define_memory_requested(initial_value=2, attempts=attempt, scale=SCALE_RESOURCES),
+        runtime=lambda wildcards, attempt: define_time_requested(initial_value=2, attempts=attempt, scale=SCALE_RESOURCES),
     params:
         options=check_options(config["trim_galore"]["options"]),
         trim_dir="seqnado_output/trimmed",

diff --git a/seqnado/workflow/rules/heatmap.smk b/seqnado/workflow/rules/heatmap.smk
@@ -1,4 +1,4 @@
-from seqnado.helpers import check_options, get_scale_method
+from seqnado.helpers import check_options, get_scale_method, define_memory_requested, define_time_requested
 
 if ASSAY == "ChIP":
     prefix = SAMPLE_NAMES_IP
@@ -23,7 +23,7 @@ rule heatmap_matrix:
     threads: config["deeptools"]["threads"]
     resources:
         runtime=lambda wildcards, attempt: f"{1 * 2**attempt}h",
-        mem=lambda wildcards, attempt: f"{4 * 2**attempt}GB",
+        mem=lambda wildcards, attempt: define_memory_requested(initial_value=4, attempts=attempt, scale=SCALE_RESOURCES),
     log:
         "seqnado_output/logs/heatmap/matrix.log",
     shell:
@@ -38,7 +38,7 @@ rule heatmap_plot:
     params:
         colormap=check_options(config["heatmap"]["colormap"]),
     resources:
-        mem=lambda wildcards, attempt: f"{2 * 2**attempt}GB",
+        mem=lambda wildcards, attempt: define_memory_requested(initial_value=2, attempts=attempt, scale=SCALE_RESOURCES),
     log:
         "seqnado_output/logs/heatmap/heatmap.log",
     shell:
@@ -51,7 +51,7 @@ rule heatmap_metaplot:
     output:
         metaplot="seqnado_output/heatmap/metaplot.pdf",
     resources:
-        mem=lambda wildcards, attempt: f"{2 * 2**attempt}GB"
+        mem=lambda wildcards, attempt: define_memory_requested(initial_value=2, attempts=attempt, scale=SCALE_RESOURCES),
     log:
         "seqnado_output/logs/heatmap/metaplot.log",
     shell:

diff --git a/seqnado/workflow/rules/hub.smk b/seqnado/workflow/rules/hub.smk
@@ -72,7 +72,7 @@ rule bed_to_bigbed:
     params:
         chrom_sizes=config["genome"]["chromosome_sizes"],
     resources:
-        mem="1GB",
+        mem=lambda wildcards, attempt: define_memory_requested(initial_value=1, attempts=attempt, scale=SCALE_RESOURCES),
     log:
         "seqnado_output/logs/bed_to_bigbed/{directory}/{sample}.log",
     shell:

diff --git a/seqnado/workflow/rules/motif.smk b/seqnado/workflow/rules/motif.smk
@@ -8,7 +8,7 @@ rule get_fasta:
     params:
         genome=config["genome"]["fasta"],
     resources:
-        mem=lambda wildcards, attempt: f"{1 * 2 ** (attempt)}GB",
+        mem=lambda wildcards, attempt: define_memory_requested(initial_value=1, attempts=attempt, scale=SCALE_RESOURCES),
     log:
         "seqnado_output/logs/motifs/fasta/{sample}.log",
     shell:
@@ -28,7 +28,7 @@ rule motif_meme_chip:
         meme_chip_params=config["meme"]["meme_chip_params"],
         meme_chip_db=config["meme"]["meme_chip_db"],
     resources:
-        mem=lambda wildcards, attempt: f"{1 * 2 ** (attempt)}GB",
+        mem=lambda wildcards, attempt: define_memory_requested(initial_value=1, attempts=attempt, scale=SCALE_RESOURCES),
     log:
         "seqnado_output/logs/motifs/meme/{sample}.log",
     shell:
@@ -47,7 +47,7 @@ rule motif_homer:
         homer_params=config["homer"]["homer_params"],
         homer_bg=config["homer"]["homer_bg"],
     resources:
-        mem=lambda wildcards, attempt: f"{1 * 2 ** (attempt)}GB",
+        mem=lambda wildcards, attempt: define_memory_requested(initial_value=1, attempts=attempt, scale=SCALE_RESOURCES),
     log:
         "seqnado_output/logs/motifs/homer/{sample}.log",
     shell: