first commit

niekwit · niekwit · commit 42feb56219d9 · 2023-10-22T22:01:32.000+01:00
diff --git a/config/config.yml b/config/config.yml
@@ -0,0 +1,18 @@
+genome: "hg38" # genome assembly name, e.g. hg19/hg38 (human) or mm9/mm10 (mouse)
+ensembl_genome_build: "110"
+resources: # computing resources
+  account: XXX 
+  partition: cclake
+  max_jobs: 300
+  trim:
+    cpu: 8
+    time: 60
+  fastqc:
+    cpu: 4
+    time: 60
+  damid:
+    cpu: 8
+    time: 120
+  plotting:
+    cpu: 2
+    time: 20
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -1,4 +1,47 @@
-# Main entrypoint of the workflow. 
-# Please follow the best practices: 
-# https://snakemake.readthedocs.io/en/stable/snakefiles/best_practices.html,
-# in particular regarding the standardized folder structure mentioned there. 
+import os
+from scripts.resources import Resources
+from scripts import general_functions as gf
+from snakemake.utils import min_version
+
+# set minimum snakemake version
+# NOTE(review): rules/resources.smk uses ensure() and the retries directive;
+# both appear to have been introduced in Snakemake 7.7.0 -- confirm against
+# the changelog before relying on this exact bound.
+min_version("7.7.0")
+
+# load config file (the workflow ships it as config/config.yml)
+configfile: "config/config.yml"
+
+# load genome resources to be used in rules
+# (file names, download URLs and checksums for the configured genome/build)
+resources = Resources(config["genome"], config["ensembl_genome_build"])
+
+# get sample names
+SAMPLES = gf.import_samples()
+
+# import rules (paths are relative to this Snakefile)
+include: "rules/fastqc.smk"
+include: "rules/trimming.smk"
+include: "rules/resources.smk"
+include: "rules/damid.smk"
+include: "rules/bedgraph_processing.smk"
+include: "rules/plotting.smk"
+
+# target rule: request the three summary plots
+rule all:
+    input:
+        expand(
+            "results/plots/{plot}.pdf",
+            plot=["mapping_rates", "pca", "sample_distance"],
+        ),
+
+
+# save snakemake terminal output to log file
+# snake_log is the fixed destination the run's log file is copied to by both
+# handlers below; its directory is created up front so the cp cannot fail on
+# a missing folder.
+snake_log = "logs/snakemake/snakemake.log"
+os.makedirs("logs/snakemake", exist_ok=True)
+
+# Runs once after the workflow finished without errors.
+# NOTE(review): pigz replaces resources.fasta with a .gz file, so the
+# uncompressed FASTA that bowtie2_build and create_gatc_fragments declare as
+# input no longer exists after a successful run -- confirm this is intended.
+onsuccess: 
+    shell("cp -v {log} {snake_log}")
+    shell(f"pigz {resources.fasta}") # compress genome files
+    print("Analysis finished successfully!")
+
+# Runs once if any job failed; only the log is preserved.
+onerror:
+    shell("cp -v {log} {snake_log}")
+    print(f"Analysis (partly) failed...\nCheck {snake_log} for details")
+
+
diff --git a/workflow/envs/damid.yml b/workflow/envs/damid.yml
@@ -0,0 +1,7 @@
+name: damid
+channels: 
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - damidseq_pipeline=1.5.3
diff --git a/workflow/rules/damid.smk b/workflow/rules/damid.smk
@@ -0,0 +1,57 @@
+# Build the bowtie2 index for the reference genome FASTA.
+# The thread count is read from resources -> damid -> cpu in the config file;
+# the config has no top-level "damid" section.
+rule bowtie2_build:
+    input:
+        ref=resources.fasta,
+    output:
+        multiext(
+            f"resources/bowtie2_index/{resources.genome}",
+            ".1.bt2",
+            ".2.bt2",
+            ".3.bt2",
+            ".4.bt2",
+            ".rev.1.bt2",
+            ".rev.2.bt2",
+        ),
+    log:
+        "logs/bowtie2_build/build.log",
+    params:
+        extra="",  # optional parameters
+    threads: config["resources"]["damid"]["cpu"]
+    wrapper:
+        "v2.6.0/bio/bowtie2/build"
+
+
+# Create the GATC fragment file for the reference genome with
+# gatc.track.maker.pl.
+#
+# - The conda env path is given relative to this rule file (workflow/rules/),
+#   matching how rules/resources.smk references its env.
+# - The thread count is read from resources -> damid -> cpu in the config.
+# - {input} and {log} are left for Snakemake to fill in; only the genome name
+#   is interpolated by Python when the rule is parsed.
+#
+# NOTE(review): the command never references {output} and calls the Perl
+# script by bare file name -- confirm that the script is found from the
+# working directory and that it really writes the declared output path.
+rule create_gatc_fragments:
+    input:
+        resources.fasta
+    output:
+        f"resources/gatc_fragment_file_{resources.genome}.gff.gz"
+    conda:
+        "../envs/damid.yml"
+    threads: config["resources"]["damid"]["cpu"]
+    log:
+        "logs/create_gatc_fragment_file/gatc.log"
+    shell:
+        "perl gatc.track.maker.pl "
+        f"--name={resources.genome} "
+        "{input} 2> {log}"
+
+
+# Run damidseq_pipeline on the paired-end reads in reads/.
+#
+# - The rule depends on the index files that bowtie2_build actually produces;
+#   the bare index prefix is passed to the tool via params because no rule
+#   creates a file with that name.
+# - The command changes into reads/ first, so every workflow-relative path
+#   (GATC file, index prefix, log) is prefixed with ../ to stay valid.
+# - The conda env path is relative to this rule file (workflow/rules/).
+# - The thread count comes from resources -> damid -> cpu in the config and
+#   is forwarded to the tool.
+#
+# NOTE(review): the command never references {output}; confirm that the
+# pipeline's results actually end up in results/bedgraph.
+rule damidseq_pipeline:
+    input:
+        gatc=f"resources/gatc_fragment_file_{resources.genome}.gff.gz",
+        idx=multiext(
+            f"resources/bowtie2_index/{resources.genome}",
+            ".1.bt2",
+            ".2.bt2",
+            ".3.bt2",
+            ".4.bt2",
+            ".rev.1.bt2",
+            ".rev.2.bt2",
+        ),
+    output:
+        directory("results/bedgraph"),
+    params:
+        b2dir=f"resources/bowtie2_index/{resources.genome}",
+    conda:
+        "../envs/damid.yml"
+    threads: config["resources"]["damid"]["cpu"]
+    log:
+        "logs/damidseq_pipeline/damidseq_pipeline.log"
+    shell:
+        "cd reads/ && "
+        "damidseq_pipeline "
+        "--paired "
+        "--threads={threads} "
+        "--gatc_frag_file=../{input.gatc} "
+        "--bowtie2_genome_dir=../{params.b2dir} 2> ../{log}"
+
diff --git a/workflow/rules/fastqc.smk b/workflow/rules/fastqc.smk
@@ -0,0 +1,28 @@
+# FastQC report for one read file, reads/{sample}{end}.fastq.gz.
+rule fastqc:
+    input:
+        "reads/{sample}{end}.fastq.gz",
+    output:
+        html="results/qc/fastqc/{sample}{end}.html",
+        zip="results/qc/fastqc/{sample}{end}_fastqc.zip",
+    log:
+        "logs/fastqc/{sample}{end}.log",
+    threads: config["resources"]["fastqc"]["cpu"]
+    params:
+        extra="--quiet",
+    wrapper:
+        "v2.0.0/bio/fastqc"
+
+
+# Aggregate the FastQC archives of both mates (_R1/_R2) of every sample.
+rule multiqc:
+    input:
+        expand(
+            "results/qc/fastqc/{sample}{end}_fastqc.zip",
+            sample=SAMPLES,
+            end=["_R1", "_R2"],
+        ),
+    output:
+        "results/qc/multiqc.html",
+        "results/qc/multiqc_data/multiqc_general_stats.txt",
+    log:
+        "logs/multiqc/multiqc.log",
+    threads: config["resources"]["fastqc"]["cpu"]
+    params:
+        extra="",  # Optional: extra parameters for multiqc
+    wrapper:
+        "v2.6.0/bio/multiqc"
diff --git a/workflow/rules/resources.smk b/workflow/rules/resources.smk
@@ -0,0 +1,42 @@
+# Download the genome FASTA and decompress it.
+# The unzipped file is checked against the sha256 stored on the Resources
+# object, and the job is retried up to 3 times on failure.
+# Attribute names follow scripts/resources.py (fasta, fasta_url,
+# fasta_sha256); it defines no gencode_* attributes.
+rule get_genome_fasta:
+    output:
+        ensure(resources.fasta, sha256=resources.fasta_sha256)
+    retries: 3
+    params:
+        url=resources.fasta_url,
+    log:
+        "logs/resources/get_gencode_fasta.log"
+    conda:
+        "../envs/mapping.yml"
+    shell:
+        "wget -q {params.url} -O {output}.gz && gunzip -f {output}.gz 2> {log}"
+
+
+# Download the transcriptome FASTA and decompress it; the unzipped file is
+# checked against a sha256 and the job is retried up to 3 times.
+#
+# NOTE(review): the Resources class in scripts/resources.py, as shown in this
+# commit, defines none of gencode_trx_fasta, gencode_trx_fa_sha256 or
+# gencode_trx_fa_url, so this rule still fails when the workflow is parsed
+# until those attributes are added (or the rule is removed).
+rule get_transcriptome_fasta:
+    output:
+        ensure(resources.gencode_trx_fasta, sha256=resources.gencode_trx_fa_sha256)
+    retries: 3
+    params:
+        # the URL is an attribute of the resources object, not a global name
+        url=resources.gencode_trx_fa_url,
+    log:
+        "logs/resources/get_transcriptome_fasta.log"
+    conda:
+        "../envs/mapping.yml"
+    shell:
+        "wget -q {params.url} -O {output}.gz && gunzip -f {output}.gz 2> {log}"
+
+
+# Download the GTF annotation and decompress it.
+# The unzipped file is checked against the sha256 stored on the Resources
+# object, and the job is retried up to 3 times on failure.
+# Attribute names follow scripts/resources.py (gtf, gtf_url, gtf_sha256);
+# it defines no gencode_* attributes.
+rule get_gencode_gtf:
+    output:
+        ensure(resources.gtf, sha256=resources.gtf_sha256)
+    retries: 3
+    params:
+        url=resources.gtf_url,
+    log:
+        "logs/resources/get_gencode_gtf.log"
+    conda:
+        "../envs/mapping.yml"
+    shell:
+        "wget -q {params.url} -O {output}.gz && gunzip -f {output}.gz 2> {log}"
+
+
diff --git a/workflow/rules/trimming.smk b/workflow/rules/trimming.smk
@@ -0,0 +1,17 @@
+# Adapter/quality trimming of one paired-end sample with Trim Galore.
+# The thread count is read from resources -> trim -> cpu in the config file,
+# in the same way the fastqc rules read theirs, instead of being fixed at 1.
+rule trim_galore_pe:
+    input:
+        ["reads/{sample}_R1.fastq.gz", "reads/{sample}_R2.fastq.gz"],
+    output:
+        fasta_fwd="results/trimmed/{sample}_R1.fq.gz",
+        report_fwd="results/trimmed/reports/{sample}_R1_trimming_report.txt",
+        fasta_rev="results/trimmed/{sample}_R2.fq.gz",
+        report_rev="results/trimmed/reports/{sample}_R2_trimming_report.txt",
+    threads: config["resources"]["trim"]["cpu"]
+    params:
+        extra="--illumina -q 20",
+    log:
+        "logs/trim_galore/{sample}.log",
+    wrapper:
+        "v2.6.0/bio/trim_galore/pe"
+
+        
diff --git a/workflow/scripts/resources.py b/workflow/scripts/resources.py
@@ -0,0 +1,61 @@
+import os
+
+class Resources:
+    """Gets URLs and file names of fasta and GTF files for a given genome and build
+    """
+    
+    # create genome directory
+    os.makedirs("resources/", exist_ok=True)
+    
+    def __init__(self, genome, build):
+        self.genome = genome
+        self.build = build
+                
+        # base URLs
+        base_url_ens = f"https://ftp.ensembl.org/pub/release-{build}/"
+                
+        if "hg" in genome:
+            if genome == "hg19":
+                name = "GRCh37"
+            elif genome == "hg38":
+                name = "GRCh38"
+                
+            # create URLs for genome files
+            self.fasta_url = f"{base_url_ens}fasta/homo_sapiens/dna/Homo_sapiens.{name}.dna.primary_assembly.fa.gz"
+            self.gtf_url = f"{base_url_ens}gtf/homo_sapiens/Homo_sapiens.{name}.{build}.gtf.gz"
+                        
+            # set sha256sums for unzipped genome files
+            self.fasta_sha256 = "1e74081a49ceb9739cc14c812fbb8b3db978eb80ba8e5350beb80d8ad8dfef3b"
+            self.gtf_sha256 = "12582b0db02ebe19c29c5733c6edaa62599fe934af593cb7f24423a14db3186c"
+                      
+        elif "mm" in genome:
+            if genome == "mm9":
+                name = "GRCm38"
+            elif genome == "mm10":
+                name = "GCRm39"
+                
+            # create URLs for genome files
+            self.fasta_url = f"{base_url_ens}fasta/mus_musculus/dna/Mus_musculus.{name}.dna.primary_assembly.fa.gz"
+            self.gtf_url = f"{base_url_ens}gtf/mus_musculus/Mus_musculus.{name}.{build}.gtf.gz"
+            
+            # set sha256sums for unzipped genome files
+            self.fasta_sha256 = "14571f7559e292baf0a40f9d155c41ede19a04d80fdeb59a0c2dfe566db90552"
+            self.gtf_sha256 = "6efbe1fdbd41d4321daf6d550db240656473b41a107648d6faaf9d61cfdb6c4d"
+            
+        # downloaded unzipped file names
+        self.fasta = self._file_from_url(self.fasta_url)
+        self.gtf = self._file_from_url(self.gtf_url)
+        
+    def _file_from_url(self, url):
+        """Returns file path for unzipped downloaded file
+        """
+        
+        return f"resources/{os.path.basename(url).replace('.gz','')}"
+    
+    
+  
+        
+        
+            
+            
+