Skip to content

Commit

Permalink
Feat split fastq files (#65)
Browse files Browse the repository at this point in the history
* feat: add fastq_split option

* add fastq_split option to atac chip and rna & test

* add num of parts to split fastq to config

* update test for snp with split and call snps
  • Loading branch information
CChahrour authored May 4, 2023
1 parent 0660c01 commit 510f227
Show file tree
Hide file tree
Showing 22 changed files with 206 additions and 72 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ dist/*
.ipynb_checkpoints/
seqnado/_version.py

2023-04-27_test_snp/*
2 changes: 2 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@ channels:
- bioconda
- defaults
dependencies:
- bedtools
- bcftools
- bowtie2
- click
- cookiecutter
- deeptools
- fastqc
- fastqsplitter
- homer
- macs2
- multiqc
Expand Down
9 changes: 8 additions & 1 deletion seqnado/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,13 +183,20 @@ def check_options(value: object):


def translate_fq_files(wc, samples: GenericFastqSamples, paired: bool=False):
    """Map a rule's wildcards to the FASTQ entries stored in ``samples.translation``.

    The result is a dict so that it can be passed through Snakemake's
    ``unpack()`` and addressed as named rule inputs.

    Args:
        wc: Wildcards object; ``wc.sample`` is always read, ``wc.read`` only
            when ``paired`` is false.
        samples: Object whose ``translation`` mapping is keyed by
            ``"<sample>_<read>.fastq.gz"``.
            NOTE(review): the mapped values are presumably the original
            FASTQ paths -- confirm against ``GenericFastqSamples``.
        paired: When true, return both mates of the sample.

    Returns:
        ``{"fq1": ..., "fq2": ...}`` for paired data, otherwise ``{"fq": ...}``.

    Raises:
        KeyError: If the constructed key is missing from ``samples.translation``.
    """
    lookup = samples.translation

    # Single-end: the read number comes from the wildcards.
    if not paired:
        return {"fq": lookup[f"{wc.sample}_{wc.read}.fastq.gz"]}

    # Paired-end: mates 1 and 2 are looked up in order for the same sample.
    return {
        f"fq{mate}": lookup[f"{wc.sample}_{mate}.fastq.gz"]
        for mate in (1, 2)
    }


def translate_fq_files_split(wc, samples: GenericFastqSamples, paired: bool=False):
    """Return ``"<name>="`` labels paired with entries of ``samples.translation``.

    Unlike ``translate_fq_files`` this returns lists rather than a dict.

    Args:
        wc: Wildcards object; ``wc.sample`` is always read, ``wc.read`` only
            when ``paired`` is false.
        samples: Object whose ``translation`` mapping is keyed by
            ``"<sample>_<read>.fastq.gz"``.
        paired: When true, return an entry for each mate of the sample.

    Returns:
        Paired: a list of two ``[label, value]`` lists,
        ``[["fq1=", ...], ["fq2=", ...]]``.
        Single: one flat ``["fq=", value]`` list.
        NOTE(review): the two shapes differ in nesting depth; this is kept
        as-is for backward compatibility -- confirm what callers expect.

    Raises:
        KeyError: If the constructed key is missing from ``samples.translation``.
    """
    if paired:
        # Labels are plain string literals: they contain no placeholders,
        # so an f-string prefix is unnecessary.
        return [
            ["fq1=", samples.translation[f"{wc.sample}_1.fastq.gz"]],
            ["fq2=", samples.translation[f"{wc.sample}_2.fastq.gz"]],
        ]
    return ["fq=", samples.translation[f"{wc.sample}_{wc.read}.fastq.gz"]]

def get_fq_filestem(wc, samples: GenericFastqSamples):
fn = samples.translation[f"{wc.sample}_{wc.read}.fastq.gz"]
basename = os.path.basename(fn)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"indicies": "/databank/igenomes/Homo_sapiens/UCSC/hg38/Sequence/Bowtie2Index/genome",
"gtf": "path/to/gtf",
"read_type": ["paired", "single"],
"split_fastq": ["yes", "no"],
"remove_pcr_duplicates_method": ["picard", "deeptools"],
"shift_atac_reads": ["no", "yes"],
"remove_blacklist": ["yes", "no"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ genome:
design: "design.csv"

read_type: "{{cookiecutter.read_type}}"
split_fastq: "{{cookiecutter.split_fastq}}"
remove_pcr_duplicates_method: "{{cookiecutter.remove_pcr_duplicates_method}}"
shift_atac_reads: "{{cookiecutter.shift_atac_reads}}"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"indicies": "/databank/igenomes/Homo_sapiens/UCSC/hg38/Sequence/Bowtie2Index/genome",
"gtf": "path/to/gtf",
"read_type": ["paired", "single"],
"split_fastq": ["yes", "no"],
"remove_pcr_duplicates_method": ["picard", "deeptools"],
"remove_blacklist": ["yes", "no"],
"blacklist": "path/to/hg38-blacklist.v2.bed.gz",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ genome:
design: "design.csv"

read_type: "{{cookiecutter.read_type}}"
split_fastq: "{{cookiecutter.split_fastq}}"
remove_pcr_duplicates_method: "{{cookiecutter.remove_pcr_duplicates_method}}"
shift_atac_reads: "False"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"indicies": "/databank/igenomes/Homo_sapiens/UCSC/hg38/Sequence/Bowtie2Index/genome",
"gtf": "/databank/igenomes/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf",
"read_type": ["paired", "single"],
"split_fastq": ["yes", "no"],
"remove_pcr_duplicates_method": ["picard", "deeptools"],
"remove_blacklist": ["yes", "no"],
"blacklist": "path/to/hg38-blacklist.v2.bed.gz",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ genome:
design: "design.csv"

read_type: "{{cookiecutter.read_type}}"
split_fastq: "{{cookiecutter.split_fastq}}"
remove_pcr_duplicates_method: "{{cookiecutter.remove_pcr_duplicates_method}}"
shift_atac_reads: "False"

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
"chromosome_sizes": "path/to/hg38.chrom.sizes",
"indicies": "/databank/igenomes/Homo_sapiens/UCSC/hg38/Sequence/Bowtie2Index/genome",
"read_type": ["paired", "single"],
"split_fastq": ["yes", "no"],
"split_fastq_parts": "int",
"remove_blacklist": ["yes", "no"],
"blacklist": "path/to/hg38-blacklist.v2.bed.gz",
"call_snps": ["yes", "no"],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ genome:
chromosome_sizes: "{{cookiecutter.chromosome_sizes}}"

read_type: "{{cookiecutter.read_type}}"
split_fastq: "{{cookiecutter.split_fastq}}"
split_fastq_parts: "{{cookiecutter.split_fastq_parts}}"
shift_atac_reads: "no"
remove_blacklist: "{{cookiecutter.remove_blacklist}}"
call_snps: "{{cookiecutter.call_snps}}"
Expand Down
1 change: 1 addition & 0 deletions seqnado/workflow/envs/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ dependencies:
- deeptools
- trim-galore
- fastqc
- fastqsplitter
- multiqc
- trackhub
- seaborn
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
jobname: smk-{jobid}-{rule}-{wildcards}
drmaa: --cpus-per-task={threads} --mem-per-cpu={resources.mem_mb} --time=1:00:00
drmaa: --cpus-per-task={threads} --mem-per-cpu={resources.mem_mb} --time=24:00:00
use-singularity: true
singularity-args: -B /ceph -B /databank -B /datashare
jobs: 50
Expand Down
80 changes: 40 additions & 40 deletions seqnado/workflow/rules/align.smk
Original file line number Diff line number Diff line change
@@ -1,44 +1,44 @@
import seqnado.utils as utils

if config["split_fastq"] == "no":
rule align_paired:
input:
fq1="seqnado_output/trimmed/{sample}_1.fastq.gz",
fq2="seqnado_output/trimmed/{sample}_2.fastq.gz",
params:
index=config["genome"]["indicies"],
options=utils.check_options(config["bowtie2"]["options"]),
output:
bam="seqnado_output/aligned/raw/{sample}.bam",
threads: config["bowtie2"]["threads"]
resources:
mem_mb=4000 // int(config["bowtie2"]["threads"])
log:
"seqnado_output/logs/align/{sample}.log",
shell:
"""bowtie2 -p {threads} -x {params.index} -1 {input.fq1} -2 {input.fq2} {params.options} 2> {log} |
samtools view -bS - > {output.bam} &&
samtools sort -@ {threads} -o {output.bam}_sorted {output.bam} >> {log} 2>&1 &&
mv {output.bam}_sorted {output.bam}
"""

rule align_paired:
input:
fq1="seqnado_output/trimmed/{sample}_1.fastq.gz",
fq2="seqnado_output/trimmed/{sample}_2.fastq.gz",
params:
index=config["genome"]["indicies"],
options=utils.check_options(config["bowtie2"]["options"]),
output:
bam=temp("seqnado_output/aligned/raw/{sample}.bam"),
threads: config["bowtie2"]["threads"]
resources:
mem_mb=4000 // int(config["bowtie2"]["threads"])
log:
"seqnado_output/logs/align/{sample}.log",
shell:
"""bowtie2 -p {threads} -x {params.index} -1 {input.fq1} -2 {input.fq2} {params.options} 2> {log} |
samtools view -bS - > {output.bam} &&
samtools sort -@ {threads} -o {output.bam}_sorted {output.bam} >> {log} 2>&1 &&
mv {output.bam}_sorted {output.bam}
"""


# rule align_single:
# input:
# fq1="seqnado_output/trimmed/{sample}.fastq.gz",
# params:
# index=config["genome"]["indicies"],
# options=config["bowtie2"]["options"],
# output:
# bam=temp("seqnado_output/aligned/raw/{sample}.bam"),
# resources:
# mem_mb=4000 // int(config["bowtie2"]["threads"])
# threads: config["bowtie2"]["threads"]
# log:
# "seqnado_output/logs/align/{sample}.log",
# shell:
# """bowtie2 -p {threads} -x {params.index} -U {input.fq1} {params.options} 2> {log} |
# samtools view -bS - > {output.bam} &&
# samtools sort -@ {threads} -o {output.bam}_sorted {output.bam} &&
# mv {output.bam}_sorted {output.bam}
# """
# rule align_single:
# input:
# fq1="seqnado_output/trimmed/{sample}.fastq.gz",
# params:
# index=config["genome"]["indicies"],
# options=config["bowtie2"]["options"],
# output:
# bam=temp("seqnado_output/aligned/raw/{sample}.bam"),
# resources:
# mem_mb=4000 // int(config["bowtie2"]["threads"])
# threads: config["bowtie2"]["threads"]
# log:
# "seqnado_output/logs/align/{sample}.log",
# shell:
# """bowtie2 -p {threads} -x {params.index} -U {input.fq1} {params.options} 2> {log} |
# samtools view -bS - > {output.bam} &&
# samtools sort -@ {threads} -o {output.bam}_sorted {output.bam} &&
# mv {output.bam}_sorted {output.bam}
# """
2 changes: 1 addition & 1 deletion seqnado/workflow/rules/align_rna.smk
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ rule rename_aligned:
input:
bam=rules.align_paired.output.bam,
output:
bam="seqnado_output/aligned/sorted/{sample}.bam",
bam="seqnado_output/aligned/raw/{sample}.bam",
shell:
"mv {input.bam} {output.bam}"

Expand Down
69 changes: 69 additions & 0 deletions seqnado/workflow/rules/fastq_split.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import seqnado.utils as utils
PARTS=[str (x) for x in range(int(config["split_fastq_parts"]))]
if config["split_fastq"] == "yes":
if config["read_type"] == "paired":
# Split both mates of a paired-end sample into len(PARTS) chunks with
# fastqsplitter. One `-o <file>` argument is generated per entry in PARTS,
# so the number of output files per mate equals the number of parts.
rule split_fq:
    input:
        # Use the `utils` alias bound by `import seqnado.utils as utils` in this
        # file (the bare name `seqnado` is not bound by that import statement).
        # translate_fq_files returns {"fq1": ..., "fq2": ...}; unpack() exposes
        # them as the named inputs used in the shell command below.
        unpack(lambda wc: utils.translate_fq_files(wc, samples=FASTQ_SAMPLES, paired=True)),
    output:
        # {{sample}} stays a wildcard; part and read are expanded here.
        expand("seqnado_output/fastq_split/{{sample}}_{part}_{read}.fastq.gz", part=PARTS, read=["1", "2"]),
    params:
        # Pre-built "-o <path>" argument lists, one list per mate.
        split1=expand("-o seqnado_output/fastq_split/{{sample}}_{part}_1.fastq.gz", part=PARTS),
        split2=expand("-o seqnado_output/fastq_split/{{sample}}_{part}_2.fastq.gz", part=PARTS),
    resources:
        mem_mb=750,
    shell:
        """
        fastqsplitter -i {input.fq1} {params.split1} &&
        fastqsplitter -i {input.fq2} {params.split2}
        """

# Trim one chunk ({part}) of a paired-end sample with trim_galore and rename
# the results to the file names declared under `output`.
rule trimgalore_paired:
input:
split1="seqnado_output/fastq_split/{sample}_{part}_1.fastq.gz",
split2="seqnado_output/fastq_split/{sample}_{part}_2.fastq.gz",
output:
# Declared temp(): Snakemake may delete these once all consuming jobs finish.
trimmed1=temp("seqnado_output/trimmed/{sample}_{part}_1_trimmed.fq.gz"),
trimmed2=temp("seqnado_output/trimmed/{sample}_{part}_2_trimmed.fq.gz"),
threads: 4
resources:
mem_mb=750,
params:
# User-supplied trim_galore options from the config, passed through utils.check_options.
options=utils.check_options(config['trim_galore']['options']),
trim_dir="seqnado_output/trimmed"
log:"seqnado_output/logs/trimming/{sample}_{part}.log",
# The mv steps expect trim_galore to have written
# <trim_dir>/<sample>_<part>_val_1.fq.gz and ..._val_2.fq.gz (the names that
# follow from the --basename given here) and move them to the declared outputs.
shell:"""
trim_galore --cores {threads} {params.options} --basename {wildcards.sample}_{wildcards.part} --paired --output_dir {params.trim_dir} {input.split1} {input.split2} >> {log} 2>&1 &&
mv {params.trim_dir}/{wildcards.sample}_{wildcards.part}_val_1.fq.gz {output.trimmed1} &&
mv {params.trim_dir}/{wildcards.sample}_{wildcards.part}_val_2.fq.gz {output.trimmed2}
"""

# Align one trimmed chunk ({part}) of a paired-end sample with bowtie2,
# convert the output to BAM, sort it, and replace the unsorted BAM with the
# sorted one so that {output.bam} ends up sorted.
rule align_split:
input:
fq1="seqnado_output/trimmed/{sample}_{part}_1_trimmed.fq.gz",
fq2="seqnado_output/trimmed/{sample}_{part}_2_trimmed.fq.gz",
output:
# Per-chunk BAM, declared temp().
bam=temp("seqnado_output/aligned/split/{sample}_{part}.bam"),
params:
index=config["genome"]["indicies"],
options=utils.check_options(config["bowtie2"]["options"]),
threads: config["bowtie2"]["threads"]
resources:
# Integer division of 4000 by the thread count.
# NOTE(review): presumably a per-CPU figure (the cluster profile passes
# resources.mem_mb to --mem-per-cpu) -- confirm.
mem_mb=4000 // int(config["bowtie2"]["threads"])
log:"seqnado_output/logs/aligned/split/{sample}_part{part}.log",
# bowtie2's stderr starts the log (2>), samtools sort appends to it (>>);
# samtools view's stderr is not redirected.
shell:"""
bowtie2 -p {threads} -x {params.index} -1 {input.fq1} -2 {input.fq2} {params.options} 2> {log} |
samtools view -bS - > {output.bam} &&
samtools sort -@ {threads} -o {output.bam}_sorted {output.bam} >> {log} 2>&1 &&
mv {output.bam}_sorted {output.bam}
"""

# Merge the per-chunk BAMs of one sample (one per entry in PARTS) back into a
# single BAM at the path the downstream rules read from.
rule merge_bams:
    input:
        expand("seqnado_output/aligned/split/{{sample}}_{part}.bam", part=PARTS),
    output:
        bam=temp("seqnado_output/aligned/raw/{sample}.bam"),
    threads: 4
    log:
        "seqnado_output/logs/merge/{sample}.log",
    # No `-h` here: `samtools merge -h` takes a FILE argument, so writing
    # `-h {input}` made the first chunk BAM the header template and left its
    # reads out of the merge. With `-o`, every remaining positional argument
    # is an input BAM and the header is built from the inputs themselves.
    shell:
        """
        samtools merge -o {output.bam} -@ {threads} {input} >> {log} 2>&1
        """
65 changes: 42 additions & 23 deletions seqnado/workflow/rules/qc.smk
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ rule fastqc_trimmed:

rule samtools_stats:
input:
bam="seqnado_output/aligned/sorted/{sample}.bam",
bam="seqnado_output/aligned/raw/{sample}.bam",
output:
stats="seqnado_output/qc/alignment_raw/{sample}.txt",
threads: 1
Expand All @@ -58,26 +58,45 @@ use rule samtools_stats as samtools_stats_filtered with:
output:
stats="seqnado_output/qc/alignment_filtered/{sample}.txt",

if config["split_fastq"] == "no":
rule multiqc:
input:
expand(
"seqnado_output/qc/fastqc_raw/{sample}_{read}_fastqc.html",
sample=SAMPLE_NAMES,
read=[1, 2],
),
expand(
"seqnado_output/qc/fastqc_trimmed/{sample}_{read}_fastqc.html",
sample=SAMPLE_NAMES,
read=[1, 2],
),
expand("seqnado_output/qc/alignment_raw/{sample}.txt", sample=SAMPLE_NAMES),
expand("seqnado_output/qc/alignment_filtered/{sample}.txt", sample=SAMPLE_NAMES),
output:
"seqnado_output/qc/full_qc_report.html",
log:
"seqnado_output/logs/multiqc.log",
resources:
mem_mb=1000,
shell:
"multiqc -o seqnado_output/qc seqnado_output/qc -n full_qc_report.html --force > {log} 2>&1"

rule multiqc:
input:
expand(
"seqnado_output/qc/fastqc_raw/{sample}_{read}_fastqc.html",
sample=SAMPLE_NAMES,
read=[1, 2],
),
expand(
"seqnado_output/qc/fastqc_trimmed/{sample}_{read}_fastqc.html",
sample=SAMPLE_NAMES,
read=[1, 2],
),
expand("seqnado_output/qc/alignment_raw/{sample}.txt", sample=SAMPLE_NAMES),
expand("seqnado_output/qc/alignment_filtered/{sample}.txt", sample=SAMPLE_NAMES),
output:
"seqnado_output/qc/full_qc_report.html",
log:
"seqnado_output/logs/multiqc.log",
resources:
mem_mb=1000,
shell:
"multiqc -o seqnado_output/qc seqnado_output/qc -n full_qc_report.html --force > {log} 2>&1"
else:
rule multiqc:
input:
expand(
"seqnado_output/qc/fastqc_raw/{sample}_{read}_fastqc.html",
sample=SAMPLE_NAMES,
read=[1, 2],
),
expand("seqnado_output/qc/alignment_raw/{sample}.txt", sample=SAMPLE_NAMES),
expand("seqnado_output/qc/alignment_filtered/{sample}.txt", sample=SAMPLE_NAMES),
output:
"seqnado_output/qc/full_qc_report.html",
log:
"seqnado_output/logs/multiqc.log",
resources:
mem_mb=1000,
shell:
"multiqc -o seqnado_output/qc seqnado_output/qc -n full_qc_report.html --force > {log} 2>&1"
11 changes: 7 additions & 4 deletions seqnado/workflow/snakefile_snp
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,15 @@ else:
DESIGN = FASTQ_SAMPLES.design
SAMPLE_NAMES = FASTQ_SAMPLES.sample_names_all


include: "rules/qc.smk"
include: "rules/fastq_trim.smk"
include: "rules/align.smk"
include: "rules/alignment_post_processing.smk"
include: "rules/qc.smk"
include: "rules/variant.smk"
if config["split_fastq"] == "yes":
include: "rules/fastq_split.smk"
else:
include: "rules/fastq_trim.smk"
include: "rules/align.smk"


# Define output files
ANALYSIS_OUTPUT = [
Expand Down
1 change: 1 addition & 0 deletions tests/test_atac.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ def set_up(
"indicies": genome_indicies,
"design": "design.csv",
"read_type": "paired",
"split_fastq": "no",
"remove_pcr_duplicates_method": "picard",
"shift_atac_reads": "yes",
"remove_blacklist": "yes",
Expand Down
Loading

0 comments on commit 510f227

Please sign in to comment.