Skip to content

Commit 42feb56

Browse files
committed
first commit
1 parent 35d2cc2 commit 42feb56

File tree

8 files changed

+277
-4
lines changed

8 files changed

+277
-4
lines changed

config/config.yml

+18
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
genome: "hg38" # UCSC-style genome name: hg19/hg38 (human) or mm9/mm10 (mouse)
2+
ensembl_genome_build: "110"
3+
resources: # computing resources
4+
account: XXX
5+
partition: cclake
6+
max_jobs: 300
7+
trim:
8+
cpu: 8
9+
time: 60
10+
fastqc:
11+
cpu: 4
12+
time: 60
13+
damid:
14+
cpu: 8
15+
time: 120
16+
plotting:
17+
cpu: 2
18+
time: 20

workflow/Snakefile

+47-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,47 @@
1-
# Main entrypoint of the workflow.
2-
# Please follow the best practices:
3-
# https://snakemake.readthedocs.io/en/stable/snakefiles/best_practices.html,
4-
# in particular regarding the standardized folder structure mentioned there.
1+
import os
2+
from scripts.resources import Resources
3+
from scripts import general_functions as gf
4+
from snakemake.utils import min_version
5+
6+
# set minimum snakemake version
# NOTE(review): rules in rules/resources.smk use ensure() and `retries`;
# confirm that 6.4.1 is still a sufficient minimum version for those.
min_version("6.4.1")

# Load the workflow configuration. The config file shipped with this
# workflow is config/config.yml, so the path has to use the .yml extension.
configfile: "config/config.yml"

# load genome resources (URLs, checksums, local file names) to be used in rules
resources = Resources(config["genome"], config["ensembl_genome_build"])

# get sample names
SAMPLES = gf.import_samples()

# import rules
include: "rules/fastqc.smk"
include: "rules/trimming.smk"
include: "rules/resources.smk"
include: "rules/damid.smk"
include: "rules/bedgraph_processing.smk"
include: "rules/plotting.smk"
25+
26+
# target rule
27+
# Default target: request every final plot of the workflow.
rule all:
    input:
        expand(
            "results/plots/{plot}.pdf",
            plot=["mapping_rates", "pca", "sample_distance"],
        ),
32+
33+
34+
# Keep a copy of Snakemake's terminal output at a fixed location.
snake_log = "logs/snakemake/snakemake.log"
os.makedirs(os.path.dirname(snake_log), exist_ok=True)


onsuccess:
    # store the run log, then compress the genome FASTA with pigz
    shell("cp -v {log} {snake_log}")
    shell(f"pigz {resources.fasta}")
    print("Analysis finished successfully!")


onerror:
    # store the run log and tell the user where to find it
    shell("cp -v {log} {snake_log}")
    print(f"Analysis (partly) failed...\nCheck {snake_log} for details")
46+
47+

workflow/envs/damid.yml

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
name: damid
2+
channels:
3+
- conda-forge
4+
- bioconda
5+
- defaults
6+
dependencies:
7+
- damidseq_pipeline=1.5.3

workflow/rules/damid.smk

+57
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# Build the Bowtie2 index files for the reference genome FASTA.
rule bowtie2_build:
    input:
        ref=resources.fasta,
    output:
        multiext(
            f"resources/bowtie2_index/{resources.genome}",
            ".1.bt2",
            ".2.bt2",
            ".3.bt2",
            ".4.bt2",
            ".rev.1.bt2",
            ".rev.2.bt2",
        ),
    log:
        "logs/bowtie2_build/build.log",
    params:
        extra="",  # optional parameters
    # CPU counts are configured as config["resources"][<step>]["cpu"];
    # there is no top-level "damid" section with a "threads" key.
    threads: config["resources"]["damid"]["cpu"]
    wrapper:
        "v2.6.0/bio/bowtie2/build"
21+
22+
23+
# Run gatc.track.maker.pl on the reference genome FASTA.
# NOTE(review): the command line does not mention the declared output path;
# confirm that the script really writes
# resources/gatc_fragment_file_<genome>.gff.gz, or move/compress its result.
rule create_gatc_fragments:
    input:
        resources.fasta,
    output:
        f"resources/gatc_fragment_file_{resources.genome}.gff.gz",
    conda:
        # env paths are resolved relative to this rule file (workflow/rules/)
        "../envs/damid.yml"
    threads: config["resources"]["damid"]["cpu"]
    log:
        "logs/create_gatc_fragment_file/gatc.log",
    shell:
        # Only the --name part is an f-string. {input} and {log} must stay
        # literal placeholders so that Snakemake fills them in at run time.
        "perl gatc.track.maker.pl "
        f"--name={resources.genome} "
        "{input} 2> {log}"
37+
38+
39+
# Run damidseq_pipeline in paired-end mode from inside the reads/ directory.
# NOTE(review): no rule visible here produces input.b2dir as a path of its
# own (bowtie2_build only declares the individual *.bt2 files), and the
# command does not reference results/bedgraph — confirm how both are meant
# to be created.
rule damidseq_pipeline:
    input:
        gatc=f"resources/gatc_fragment_file_{resources.genome}.gff.gz",
        b2dir=f"resources/bowtie2_index/{resources.genome}",
    output:
        directory("results/bedgraph"),
    conda:
        # env paths are resolved relative to this rule file (workflow/rules/)
        "../envs/damid.yml"
    threads: config["resources"]["damid"]["cpu"]
    log:
        "logs/damidseq_pipeline/damidseq_pipeline.log",
    shell:
        # The command runs inside reads/, so every workflow-relative path is
        # prefixed with ../ to keep pointing at the intended file.
        "cd reads/ && "
        "damidseq_pipeline "
        "--paired "
        "--gatc_frag_file=../{input.gatc} "
        "--bowtie2_genome_dir=../{input.b2dir} 2> ../{log} && "
        "cd .."
57+

workflow/rules/fastqc.smk

+28
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Run FastQC on a single fastq.gz file from reads/.
rule fastqc:
    input:
        "reads/{sample}{end}.fastq.gz",
    output:
        html="results/qc/fastqc/{sample}{end}.html",
        zip="results/qc/fastqc/{sample}{end}_fastqc.zip",
    wildcard_constraints:
        # {sample} and {end} are adjacent, so without a constraint the split
        # between them is arbitrary; pin {end} to the read-pair suffixes that
        # the multiqc rule requests.
        end="_R[12]",
    params:
        extra="--quiet",
    log:
        "logs/fastqc/{sample}{end}.log",
    threads: config["resources"]["fastqc"]["cpu"]
    wrapper:
        "v2.0.0/bio/fastqc"
14+
15+
16+
# Aggregate the FastQC archives of both read ends of every sample.
rule multiqc:
    input:
        expand(
            "results/qc/fastqc/{sample}{end}_fastqc.zip",
            sample=SAMPLES,
            end=["_R1", "_R2"],
        ),
    output:
        "results/qc/multiqc.html",
        "results/qc/multiqc_data/multiqc_general_stats.txt",
    log:
        "logs/multiqc/multiqc.log",
    params:
        extra="",  # Optional: extra parameters for multiqc
    threads: config["resources"]["fastqc"]["cpu"]
    wrapper:
        "v2.6.0/bio/multiqc"

workflow/rules/resources.smk

+42
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Download the genome FASTA, unzip it and verify its sha256 checksum.
rule get_genome_fasta:
    output:
        # Resources exposes fasta / fasta_sha256 / fasta_url
        ensure(resources.fasta, sha256=resources.fasta_sha256),
    retries: 3
    params:
        url=resources.fasta_url,
    log:
        "logs/resources/get_gencode_fasta.log",
    conda:
        "../envs/mapping.yml"
    shell:
        "wget -q {params.url} -O {output}.gz && gunzip -f {output}.gz 2> {log}"
13+
14+
15+
# Download the transcriptome FASTA, unzip it and verify its sha256 checksum.
# NOTE(review): the Resources class in scripts/resources.py does not appear
# to define the gencode_trx_* attributes used here — confirm whether this
# rule is still needed, or add the attributes to Resources.
rule get_transcriptome_fasta:
    output:
        ensure(resources.gencode_trx_fasta, sha256=resources.gencode_trx_fa_sha256),
    retries: 3
    params:
        # the URL is an attribute of the resources object, not a global name
        url=resources.gencode_trx_fa_url,
    log:
        "logs/resources/get_transcriptome_fasta.log",
    conda:
        "../envs/mapping.yml"
    shell:
        "wget -q {params.url} -O {output}.gz && gunzip -f {output}.gz 2> {log}"
27+
28+
29+
# Download the GTF annotation, unzip it and verify its sha256 checksum.
rule get_gencode_gtf:
    output:
        # Resources exposes gtf / gtf_sha256 / gtf_url
        ensure(resources.gtf, sha256=resources.gtf_sha256),
    retries: 3
    params:
        url=resources.gtf_url,
    log:
        "logs/resources/get_gencode_gtf.log",
    conda:
        "../envs/mapping.yml"
    shell:
        "wget -q {params.url} -O {output}.gz && gunzip -f {output}.gz 2> {log}"
41+
42+

workflow/rules/trimming.smk

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Trim paired-end reads with the trim_galore wrapper.
rule trim_galore_pe:
    input:
        ["reads/{sample}_R1.fastq.gz", "reads/{sample}_R2.fastq.gz"],
    output:
        fasta_fwd="results/trimmed/{sample}_R1.fq.gz",
        report_fwd="results/trimmed/reports/{sample}_R1_trimming_report.txt",
        fasta_rev="results/trimmed/{sample}_R2.fq.gz",
        report_rev="results/trimmed/reports/{sample}_R2_trimming_report.txt",
    # use the CPU count configured for trimming instead of a fixed value,
    # in line with how the fastqc rules read their setting
    threads: config["resources"]["trim"]["cpu"]
    params:
        extra="--illumina -q 20",
    log:
        "logs/trim_galore/{sample}.log",
    wrapper:
        "v2.6.0/bio/trim_galore/pe"
16+
17+

workflow/scripts/resources.py

+61
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
import os
2+
3+
class Resources:
    """Resolve download URLs and local file names of genome FASTA and GTF files.

    Args:
        genome: UCSC-style genome name; must be a key of ``_ASSEMBLIES``.
        build: Ensembl release number, used in the URLs and the GTF file name.

    Attributes:
        genome, build: the constructor arguments.
        fasta_url, gtf_url: Ensembl FTP URLs of the gzipped files.
        fasta_sha256, gtf_sha256: expected sha256sums of the unzipped files.
        fasta, gtf: paths of the unzipped files below ``resources/``.

    Raises:
        ValueError: if ``genome`` is not a supported genome name.
    """

    # create genome directory
    os.makedirs("resources/", exist_ok=True)

    # UCSC-style genome name -> (Ensembl species directory, assembly name)
    _ASSEMBLIES = {
        "hg19": ("homo_sapiens", "GRCh37"),
        "hg38": ("homo_sapiens", "GRCh38"),
        "mm9": ("mus_musculus", "NCBIM37"),
        "mm10": ("mus_musculus", "GRCm38"),
        "mm39": ("mus_musculus", "GRCm39"),
    }

    # sha256sums for unzipped genome files: species -> (fasta, gtf)
    # NOTE(review): the checksums are fixed per species and do not vary with
    # assembly or release; presumably they belong to one specific release —
    # confirm before using another genome/build combination.
    _SHA256 = {
        "homo_sapiens": (
            "1e74081a49ceb9739cc14c812fbb8b3db978eb80ba8e5350beb80d8ad8dfef3b",
            "12582b0db02ebe19c29c5733c6edaa62599fe934af593cb7f24423a14db3186c",
        ),
        "mus_musculus": (
            "14571f7559e292baf0a40f9d155c41ede19a04d80fdeb59a0c2dfe566db90552",
            "6efbe1fdbd41d4321daf6d550db240656473b41a107648d6faaf9d61cfdb6c4d",
        ),
    }

    def __init__(self, genome, build):
        # Fail early with a clear message instead of an unbound local or a
        # missing attribute further down.
        try:
            species, name = self._ASSEMBLIES[genome]
        except KeyError:
            supported = ", ".join(sorted(self._ASSEMBLIES))
            raise ValueError(
                f"Unsupported genome {genome!r}; choose one of: {supported}"
            ) from None

        self.genome = genome
        self.build = build

        # base URLs
        base_url_ens = f"https://ftp.ensembl.org/pub/release-{build}/"

        # Ensembl file names start with the capitalised species directory
        # name, e.g. "homo_sapiens" -> "Homo_sapiens"
        prefix = species.capitalize()

        # create URLs for genome files
        self.fasta_url = (
            f"{base_url_ens}fasta/{species}/dna/"
            f"{prefix}.{name}.dna.primary_assembly.fa.gz"
        )
        self.gtf_url = f"{base_url_ens}gtf/{species}/{prefix}.{name}.{build}.gtf.gz"

        # set sha256sums for unzipped genome files
        self.fasta_sha256, self.gtf_sha256 = self._SHA256[species]

        # downloaded unzipped file names
        self.fasta = self._file_from_url(self.fasta_url)
        self.gtf = self._file_from_url(self.gtf_url)

    def _file_from_url(self, url):
        """Returns file path for unzipped downloaded file
        """
        file_name = os.path.basename(url)
        # strip only a trailing .gz, not every occurrence of ".gz" in the name
        if file_name.endswith(".gz"):
            file_name = file_name[: -len(".gz")]
        return f"resources/{file_name}"
54+
55+
56+
57+
58+
59+
60+
61+

0 commit comments

Comments
 (0)