Skip to content

Commit

Permalink
feat(pipeline): Added -s option to allow for scaling resources (memor…
Browse files Browse the repository at this point in the history
…y and time) (#202)

* Refactor pyproject.toml to remove drmaa dependency

* feat: add container def file

* feat: Add option to scale memory and time resources for pipeline

* Add seqnado.sif to .gitignore

* Refactor assay-specific Snakefiles to include SCALE_RESOURCES variable

The commit refactors the assay-specific Snakefiles (`snakefile_atac`, `snakefile_chip`, `snakefile_snp`) to include the `SCALE_RESOURCES` variable. This variable is used to scale memory and time resources for the pipeline. The default value is set to 1, but it can be customized by setting the `SCALE_RESOURCES` environment variable.

Co-authored-by: alsmith <[email protected]>

* Refactor Snakefiles to include SCALE_RESOURCES variable

* Refactor define_memory_requested and define_time_requested functions

* Refactor assay-specific Snakefiles to include SCALE_RESOURCES variable

* Refactor define_memory_requested and define_time_requested functions

* Refactor define_memory_requested and define_time_requested functions to fix calculation bug

* Refactor peak_call_grouped.smk to use define_memory_requested function for memory resource calculation

* Refactor peak_call_grouped.smk to use define_memory_requested function for memory resource calculation

* Refactor peak_call_grouped.smk to use define_memory_requested function for memory resource calculation

* Ensure that the define_*_requested functions are imported in all rules
  • Loading branch information
alsmith151 authored Jul 25, 2024
1 parent 6230dd9 commit 6c9fdcc
Show file tree
Hide file tree
Showing 26 changed files with 168 additions and 92 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@ test_rna_size_factors.ipynb
tests/data/*
pytestdebug.log
sps*
seqnado.sif
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ dynamic = ["version"]
dependencies = [
"click",
"cookiecutter",
"drmaa",
"pandas",
"pandera",
"pulp<=2.7.0",
Expand Down
39 changes: 39 additions & 0 deletions seqnado.def
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Apptainer/Singularity definition file for a SeqNado container image.
# Starts from the micromamba Docker image, installs Apptainer inside the
# image, then installs SeqNado from its GitHub repository with pip.
BootStrap: docker
From: mambaorg/micromamba:bookworm

%post

# Pre set up
# Work from /opt and install the tools used by the steps below
# (wget for the Apptainer download, git for the SeqNado clone) plus a
# basic compiler toolchain.
cd /opt/
apt update
apt install -y curl gcc git cmake make wget

# Install Apptainer 1.3.3 (the "singularity" runtime) from the upstream
# .deb release, then register SylabsCloud as a remote endpoint and make it
# the active one.
wget https://github.com/apptainer/apptainer/releases/download/v1.3.3/apptainer_1.3.3_amd64.deb
apt install -y ./apptainer_1.3.3_amd64.deb
apptainer remote add --no-login SylabsCloud cloud.sylabs.io
apptainer remote use SylabsCloud



# Mamba packages
# Only python and pip are installed into the base environment; the
# environment.yml based install below is left disabled.
#micromamba install -y -n base -f /opt/environment.yml
micromamba install -y -n base -c conda-forge python pip
# Put the conda binaries first on PATH for the remainder of the build.
export PATH="/opt/conda/bin:$PATH"

# Install Seqnado
# NOTE(review): this clones the repository's default branch with no tag or
# commit pinned, so rebuilding the image later may install a different
# SeqNado version - consider pinning a release.
git clone https://github.com/alsmith151/SeqNado.git
cd SeqNado
/opt/conda/bin/python -m pip install .

# Clean
# Shrink the image: clear the micromamba and pip caches and delete static
# libraries, compiled bytecode and JavaScript source maps under /opt/conda.
# The package list on the next line looks like a leftover note of build
# dependencies; nothing in this file uninstalls them.
#curl gcc git cmake make libtool g++ pkgconfig openssl-dev linux-headers
micromamba clean -afy
/opt/conda/bin/python -m pip cache purge
find /opt/conda/ -follow -type f -name '*.a' -delete
find /opt/conda/ -follow -type f -name '*.pyc' -delete
find /opt/conda/ -follow -type f -name '*.js.map' -delete


%environment
# Runtime environment: expose the conda-installed executables on PATH.
export PATH=/opt/conda/bin:$PATH
11 changes: 11 additions & 0 deletions seqnado/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,13 @@ def cli_design(method, files, output="design.csv"):
is_flag=True,
help="Remove symlinks created by previous runs. Useful for re-running pipeline after misconfiguration.",
)
@click.option(
'-s',
'--scale-resources',
help="Scale factor the memory and time resources for the pipeline",
default=1.0,
type=float
)
@click.option(
"-v",
"--verbose",
Expand All @@ -128,6 +135,7 @@ def cli_pipeline(
version=False,
verbose=False,
clean_symlinks=False,
scale_resources=1.0,
):
"""Runs the data processing pipeline"""

Expand All @@ -151,6 +159,9 @@ def cli_pipeline(

pipeline_options, cores = extract_cores_from_options(pipeline_options)

# Scale the memory and time resources
os.environ["SCALE_RESOURCES"] = str(scale_resources)

# Removes old symlinks if requested
if clean_symlinks:
logger.info("Cleaning symlinks")
Expand Down
21 changes: 20 additions & 1 deletion seqnado/helpers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Dict, Union, Optional, List, Tuple
from typing import Dict, Union, Optional, List, Tuple, Any
import pathlib
import numpy as np
import shlex
Expand Down Expand Up @@ -58,6 +58,25 @@ def extract_apptainer_args(options: List[str]) -> Tuple[List[str], str]:
return options, apptainer_args


def define_memory_requested(attempts: int = 1, initial_value: int = 1, scale: float = 1) -> str:
    """
    Define the memory requested for the job.

    The request starts at ``initial_value`` gigabytes, doubles with every
    additional attempt and is finally multiplied by ``scale``. The result is
    rounded up to a whole number of gigabytes so the returned string never
    contains a decimal point (e.g. "4G" rather than "4.0G").

    Args:
        attempts: Attempt number supplied by the caller; the first attempt is
            expected to be 1 (the default), for which no doubling is applied.
        initial_value: Memory for the first attempt, in gigabytes. Coerced
            with ``int()``, so numeric strings are accepted.
        scale: Multiplier applied after the retry doubling. Coerced with
            ``float()``.

    Returns:
        The memory request as a whole number followed by "G", e.g. "8G".

    Raises:
        ValueError: If an argument cannot be converted to a number, or the
            scaled value is NaN.
        OverflowError: If the scaled value is infinite.
    """
    # Local import so this helper does not depend on the module's import block.
    import math

    memory = int(initial_value) * 2 ** (int(attempts) - 1)
    memory = memory * float(scale)
    # round() absorbs floating-point noise (30 * 0.1 == 3.0000000000000004)
    # before ceil(), so an exact product is not bumped up by a whole gigabyte.
    memory = math.ceil(round(memory, 6))
    return f"{memory}G"

def define_time_requested(attempts: int = 1, initial_value: int = 1, scale: float = 1) -> str:
    """
    Define the time requested for the job.

    The request starts at ``initial_value`` hours (1 hour by default),
    doubles with every additional attempt and is finally multiplied by
    ``scale``. Whole-hour results are rendered without a decimal point
    ("4h", not "4.0h"); fractional results are kept as-is ("1.5h").

    Args:
        attempts: Attempt number supplied by the caller; the first attempt is
            expected to be 1 (the default), for which no doubling is applied.
        initial_value: Time for the first attempt, in hours. Coerced with
            ``int()``, so numeric strings are accepted.
        scale: Multiplier applied after the retry doubling. Coerced with
            ``float()``.

    Returns:
        The time request in hours followed by "h", e.g. "8h" or "1.5h".

    Raises:
        ValueError: If an argument cannot be converted to a number.
    """
    hours = int(initial_value) * 2 ** (int(attempts) - 1)
    hours = hours * float(scale)
    # float(scale) always makes the product a float; print whole hours as an
    # integer so the default scale of 1 yields "4h" rather than "4.0h".
    if hours.is_integer():
        return f"{int(hours)}h"
    return f"{hours}h"


def symlink_file(
output_dir: pathlib.Path, source_path: pathlib.Path, new_file_name: str
):
Expand Down
10 changes: 5 additions & 5 deletions seqnado/workflow/rules/align.smk
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from seqnado.helpers import check_options
from seqnado.helpers import check_options, define_time_requested, define_memory_requested



Expand All @@ -13,8 +13,8 @@ rule align_paired:
bam=temp("seqnado_output/aligned/raw/{sample}.bam"),
threads: config["bowtie2"]["threads"]
resources:
runtime=lambda wildcards, attempt: f"{4 * 2 ** (attempt - 1)}h",
mem=lambda wildcards, attempt: f"{4 * 2 ** (attempt - 1)}GB",
runtime=lambda wildcards, attempt: define_time_requested(initial_value=4, attempts=attempt, scale=SCALE_RESOURCES),
mem=lambda wildcards, attempt: define_memory_requested(initial_value=4, attempts=attempt, scale=SCALE_RESOURCES),
log:
"seqnado_output/logs/align/{sample}.log",
shell:
Expand All @@ -34,8 +34,8 @@ rule align_single:
output:
bam=temp("seqnado_output/aligned/raw/{sample}.bam"),
resources:
runtime=lambda wildcards, attempt: f"{4 * 2 ** (attempt - 1)}h",
mem=lambda wildcards, attempt: f"{4 * 2 ** (attempt - 1)}GB",
runtime=lambda wildcards, attempt: define_time_requested(initial_value=4, attempts=attempt, scale=SCALE_RESOURCES),
mem=lambda wildcards, attempt: define_memory_requested(initial_value=4, attempts=attempt, scale=SCALE_RESOURCES),
threads: config["bowtie2"]["threads"]
log:
"seqnado_output/logs/align/{sample}.log",
Expand Down
6 changes: 3 additions & 3 deletions seqnado/workflow/rules/align_rna.smk
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from seqnado.helpers import check_options
from seqnado.helpers import check_options, define_memory_requested, define_time_requested

rule align_paired:
input:
Expand All @@ -15,8 +15,8 @@ rule align_paired:
),
threads: config["star"]["threads"]
resources:
mem="35GB",
runtime="6h",
mem=lambda wildcards, attempt: define_memory_requested(initial_value=35, attempts=attempt, scale=SCALE_RESOURCES),
runtime=lambda wildcards, attempt: define_time_requested(initial_value=6, attempts=attempt, scale=SCALE_RESOURCES),
log:
"seqnado_output/logs/align/{sample}.log",
shell:
Expand Down
14 changes: 7 additions & 7 deletions seqnado/workflow/rules/alignment_counts.smk
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from seqnado.helpers import check_options
from seqnado.helpers import check_options, define_time_requested, define_memory_requested

rule feature_counts:
input:
Expand All @@ -11,8 +11,8 @@ rule feature_counts:
options=check_options(config["featurecounts"]["options"]),
threads: config["featurecounts"]["threads"]
resources:
mem=lambda wildcards, attempt: f"{3 * 2 ** (attempt)}GB",
runtime="2h",
mem=lambda wildcards, attempt: define_memory_requested(initial_value=3, attempts=attempt, scale=SCALE_RESOURCES),
runtime=lambda wildcards, attempt: define_time_requested(initial_value=2, attempts=attempt, scale=SCALE_RESOURCES),
log:
"seqnado_output/logs/readcounts/featurecounts/featurecounts.log",
shell:
Expand Down Expand Up @@ -41,8 +41,8 @@ rule salmon_counts_paired:
options=check_options(config["salmon"]["options"]),
threads: config["salmon"]["threads"]
resources:
mem=lambda wildcards, attempt: f"{3 * 2 ** (attempt)}GB",
runtime="2h",
mem=lambda wildcards, attempt: define_memory_requested(initial_value=3, attempts=attempt, scale=SCALE_RESOURCES),
runtime=lambda wildcards, attempt: define_time_requested(initial_value=2, attempts=attempt, scale=SCALE_RESOURCES),
log:
"seqnado_output/logs/readcounts/salmon/salmon_{sample}.log",
shell:
Expand All @@ -61,8 +61,8 @@ rule salmon_counts_single:
options=check_options(config["salmon"]["options"]),
threads: config["salmon"]["threads"]
resources:
mem=lambda wildcards, attempt: f"{3 * 2 ** (attempt)}GB",
runtime="2h",
mem=lambda wildcards, attempt: define_memory_requested(initial_value=3, attempts=attempt, scale=SCALE_RESOURCES),
runtime=lambda wildcards, attempt: define_time_requested(initial_value=2, attempts=attempt, scale=SCALE_RESOURCES),
log:
"seqnado_output/logs/readcounts/salmon/salmon_{sample}.log",
shell:
Expand Down
16 changes: 8 additions & 8 deletions seqnado/workflow/rules/alignment_post_processing.smk
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from seqnado.helpers import check_options
from seqnado.helpers import check_options, define_time_requested, define_memory_requested


rule sort_bam:
Expand All @@ -7,7 +7,7 @@ rule sort_bam:
output:
bam=temp("seqnado_output/aligned/sorted/{sample}.bam"),
resources:
mem=lambda wildcards, attempt: f"{4 * 2 ** (attempt - 1)}GB",
mem=lambda wildcards, attempt: define_memory_requested(initial_value=4, attempts=attempt, scale=SCALE_RESOURCES),
threads: 8
log:
"seqnado_output/logs/sorted/{sample}.log",
Expand All @@ -26,7 +26,7 @@ rule index_bam:
bai=temp("seqnado_output/aligned/sorted/{sample}.bam.bai"),
threads: 1
resources:
mem=lambda wildcards, attempt: f"{2 * 2 ** (attempt - 1)}GB",
mem=lambda wildcards, attempt: define_memory_requested(initial_value=2, attempts=attempt, scale=SCALE_RESOURCES),
shell:
"samtools index -@ {threads} -b {input.bam}"

Expand All @@ -46,8 +46,8 @@ if config["remove_blacklist"] and os.path.exists(config.get("blacklist", "")):
params:
blacklist=check_options(config["blacklist"]),
resources:
mem="5GB",
runtime="4h",
mem=lambda wildcards, attempt: define_memory_requested(initial_value=5, attempts=attempt, scale=SCALE_RESOURCES),
runtime=lambda wildcards, attempt: define_time_requested(initial_value=4, attempts=attempt, scale=SCALE_RESOURCES),
log:
"seqnado_output/logs/blacklist/{sample}.log",
shell:
Expand All @@ -72,7 +72,7 @@ else:
),
threads: 1
resources:
mem="1GB",
mem=lambda wildcards, attempt: define_memory_requested(initial_value=1, attempts=attempt, scale=SCALE_RESOURCES),
log:
"seqnado_output/logs/blacklist/{sample}.log",
shell:
Expand All @@ -99,8 +99,8 @@ if config["remove_pcr_duplicates_method"] == "picard":
params:
options=check_options(config["picard"]["options"]),
resources:
mem="5GB",
runtime="4h",
mem=lambda wildcards, attempt: define_memory_requested(initial_value=5, attempts=attempt, scale=SCALE_RESOURCES),
runtime=lambda wildcards, attempt: define_time_requested(initial_value=4, attempts=attempt, scale=SCALE_RESOURCES),
log:
"seqnado_output/logs/duplicates/{sample}.log",
shell:
Expand Down
6 changes: 3 additions & 3 deletions seqnado/workflow/rules/exogenous_norm.smk
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@ use rule align_paired as align_paired_spikein with:
output:
bam=temp("seqnado_output/aligned/spikein/raw/{sample}.bam"),
resources:
mem=lambda wildcards, attempt: f"{8 * 2 ** (attempt - 1)}GB",
mem=lambda wildcards, attempt: define_memory_requested(initial_value=8, attempts=attempt, scale=SCALE_RESOURCES),


use rule align_single as align_single_spikein with:
output:
bam=temp("seqnado_output/aligned/spikein/raw/{sample}.bam"),
resources:
mem=lambda wildcards, attempt: f"{8 * 2 ** (attempt - 1)}GB",
mem=lambda wildcards, attempt: define_memory_requested(initial_value=8, attempts=attempt, scale=SCALE_RESOURCES),


use rule sort_bam as sort_bam_spikein with:
Expand All @@ -21,7 +21,7 @@ use rule sort_bam as sort_bam_spikein with:
output:
bam=temp("seqnado_output/aligned/spikein/sorted/{sample}.bam"),
resources:
mem=lambda wildcards, attempt: f"{8 * 2 ** (attempt - 1)}GB",
mem=lambda wildcards, attempt: define_memory_requested(initial_value=8, attempts=attempt, scale=SCALE_RESOURCES),
log:
"seqnado_output/logs/aligned_spikein/{sample}_sort.log",

Expand Down
5 changes: 4 additions & 1 deletion seqnado/workflow/rules/fastq_screen.smk
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from seqnado.helpers import check_options, define_time_requested, define_memory_requested



rule fastq_screen_paired:
input:
Expand Down Expand Up @@ -54,7 +57,7 @@ rule multiqc_fastqscreen:
log:
"seqnado_output/logs/multiqc_fastqscreen.log",
resources:
mem=lambda wildcards, attempt: f"{2 * 2**attempt}GB",
mem=lambda wildcards, attempt: define_memory_requested(initial_value=2, attempts=attempt, scale=SCALE_RESOURCES),
shell:
"multiqc -o seqnado_output/qc -n full_fastqscreen_report.html --force seqnado_output/qc/fastq_screen > {log} 2>&1"

Expand Down
10 changes: 5 additions & 5 deletions seqnado/workflow/rules/fastq_trim.smk
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from seqnado.helpers import check_options
from seqnado.helpers import check_options, define_time_requested, define_memory_requested


rule trimgalore_paired:
Expand All @@ -11,8 +11,8 @@ rule trimgalore_paired:
trimmed2=temp("seqnado_output/trimmed/{sample}_2.fastq.gz"),
threads: 4
resources:
mem="2GB",
runtime="4h",
mem=lambda wildcards, attempt: define_memory_requested(initial_value=2, attempts=attempt, scale=SCALE_RESOURCES),
runtime=lambda wildcards, attempt: define_time_requested(initial_value=4, attempts=attempt, scale=SCALE_RESOURCES),
params:
options=check_options(config["trim_galore"]["options"]),
trim_dir="seqnado_output/trimmed",
Expand All @@ -34,8 +34,8 @@ rule trimgalore_single:
trimmed=temp("seqnado_output/trimmed/{sample}.fastq.gz"),
threads: 4
resources:
mem="2GB",
runtime="2h",
mem=lambda wildcards, attempt: define_memory_requested(initial_value=2, attempts=attempt, scale=SCALE_RESOURCES),
runtime=lambda wildcards, attempt: define_time_requested(initial_value=2, attempts=attempt, scale=SCALE_RESOURCES),
params:
options=check_options(config["trim_galore"]["options"]),
trim_dir="seqnado_output/trimmed",
Expand Down
8 changes: 4 additions & 4 deletions seqnado/workflow/rules/heatmap.smk
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from seqnado.helpers import check_options, get_scale_method
from seqnado.helpers import check_options, get_scale_method, define_memory_requested, define_time_requested

if ASSAY == "ChIP":
prefix = SAMPLE_NAMES_IP
Expand All @@ -23,7 +23,7 @@ rule heatmap_matrix:
threads: config["deeptools"]["threads"]
resources:
runtime=lambda wildcards, attempt: f"{1 * 2**attempt}h",
mem=lambda wildcards, attempt: f"{4 * 2**attempt}GB",
mem=lambda wildcards, attempt: define_memory_requested(initial_value=4, attempts=attempt, scale=SCALE_RESOURCES),
log:
"seqnado_output/logs/heatmap/matrix.log",
shell:
Expand All @@ -38,7 +38,7 @@ rule heatmap_plot:
params:
colormap=check_options(config["heatmap"]["colormap"]),
resources:
mem=lambda wildcards, attempt: f"{2 * 2**attempt}GB",
mem=lambda wildcards, attempt: define_memory_requested(initial_value=2, attempts=attempt, scale=SCALE_RESOURCES),
log:
"seqnado_output/logs/heatmap/heatmap.log",
shell:
Expand All @@ -51,7 +51,7 @@ rule heatmap_metaplot:
output:
metaplot="seqnado_output/heatmap/metaplot.pdf",
resources:
mem=lambda wildcards, attempt: f"{2 * 2**attempt}GB"
mem=lambda wildcards, attempt: define_memory_requested(initial_value=2, attempts=attempt, scale=SCALE_RESOURCES),
log:
"seqnado_output/logs/heatmap/metaplot.log",
shell:
Expand Down
2 changes: 1 addition & 1 deletion seqnado/workflow/rules/hub.smk
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ rule bed_to_bigbed:
params:
chrom_sizes=config["genome"]["chromosome_sizes"],
resources:
mem="1GB",
mem=lambda wildcards, attempt: define_memory_requested(initial_value=1, attempts=attempt, scale=SCALE_RESOURCES),
log:
"seqnado_output/logs/bed_to_bigbed/{directory}/{sample}.log",
shell:
Expand Down
6 changes: 3 additions & 3 deletions seqnado/workflow/rules/motif.smk
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ rule get_fasta:
params:
genome=config["genome"]["fasta"],
resources:
mem=lambda wildcards, attempt: f"{1 * 2 ** (attempt)}GB",
mem=lambda wildcards, attempt: define_memory_requested(initial_value=1, attempts=attempt, scale=SCALE_RESOURCES),
log:
"seqnado_output/logs/motifs/fasta/{sample}.log",
shell:
Expand All @@ -28,7 +28,7 @@ rule motif_meme_chip:
meme_chip_params=config["meme"]["meme_chip_params"],
meme_chip_db=config["meme"]["meme_chip_db"],
resources:
mem=lambda wildcards, attempt: f"{1 * 2 ** (attempt)}GB",
mem=lambda wildcards, attempt: define_memory_requested(initial_value=1, attempts=attempt, scale=SCALE_RESOURCES),
log:
"seqnado_output/logs/motifs/meme/{sample}.log",
shell:
Expand All @@ -47,7 +47,7 @@ rule motif_homer:
homer_params=config["homer"]["homer_params"],
homer_bg=config["homer"]["homer_bg"],
resources:
mem=lambda wildcards, attempt: f"{1 * 2 ** (attempt)}GB",
mem=lambda wildcards, attempt: define_memory_requested(initial_value=1, attempts=attempt, scale=SCALE_RESOURCES),
log:
"seqnado_output/logs/motifs/homer/{sample}.log",
shell:
Expand Down
Loading

0 comments on commit 6c9fdcc

Please sign in to comment.