Initial commit

niekwit · niekwit · commit ea4a446f6ada · 2024-01-21T17:06:17.000Z
diff --git a/.test/config/config.yaml b/.test/config/config.yaml
@@ -0,0 +1,18 @@
+# Test configuration for the DamID workflow.
+genome: "hg38" # genome build (human or mouse)
+ensembl_genome_build: "110"
+paired_end: False # test reads are single-end ({sample}.fastq.gz)
+resources: # computing resources (cpu / time per rule group)
+  account: XXX
+  partition: cclake
+  max_jobs: 300
+  trim:
+    cpu: 8
+    time: 60
+  fastqc:
+    cpu: 4
+    time: 60
+  damid:
+    cpu: 8
+    time: 120
+  index: # read by rule bowtie2_build_index
+    cpu: 8
+    time: 60
+  deeptools: # read by rule average_bigwigs
+    cpu: 4
+    time: 60
+  plotting:
+    cpu: 2
+    time: 20
diff --git a/.test/reads/README.md b/.test/reads/README.md
diff --git a/.test/reads/exp1/dam.fastq.gz b/.test/reads/exp1/dam.fastq.gz
diff --git a/.test/reads/exp1/polII.fastq.gz b/.test/reads/exp1/polII.fastq.gz
diff --git a/.test/reads/exp2/dam.fastq.gz b/.test/reads/exp2/dam.fastq.gz
diff --git a/.test/reads/exp2/polII.fastq.gz b/.test/reads/exp2/polII.fastq.gz
diff --git a/config/config.yaml b/config/config.yaml
@@ -0,0 +1,19 @@
+# Main configuration for the DamID workflow.
+genome: "hg38" # genome build (human or mouse)
+ensembl_genome_build: "110"
+paired_end: True # paired-end or single-end
+resources: # computing resources (cpu / time per rule group)
+  trim:
+    cpu: 8
+    time: 60
+  fastqc:
+    cpu: 4
+    time: 60
+  damid:
+    cpu: 8
+    time: 120
+  index:
+    cpu: 36
+    time: 60
+  deeptools: # read by rule average_bigwigs
+    cpu: 8
+    time: 60
+  plotting:
+    cpu: 2
+    time: 20
diff --git a/workflow/envs/damid.yaml b/workflow/envs/damid.yaml
@@ -0,0 +1,9 @@
+# Conda environment referenced by rule make_gatc_tracks (workflow/rules/setup.smk).
+# All tool versions are pinned for reproducibility.
+name: damid
+channels: 
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - damidseq_pipeline=1.5.3
+  - trim-galore=0.6.10
+  - perl-inline-c=0.81
diff --git a/workflow/envs/deeptools.yaml b/workflow/envs/deeptools.yaml
@@ -0,0 +1,9 @@
+# Conda environment referenced by rules bedgraph2bigwig and average_bigwigs
+# (workflow/rules/bedgraph_processing.smk), which call bedGraphToBigWig and
+# bigwigAverage respectively.
+name: deeptools
+channels: 
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - deeptools=3.5.4
+  - python=3.10
+  - ucsc-bedgraphtobigwig=445
diff --git a/workflow/rules/bedgraph_processing.smk b/workflow/rules/bedgraph_processing.smk
@@ -0,0 +1,37 @@
+# Convert one "<sample>-vs-Dam" GATC bedgraph into a bigWig track.
+# Inputs: chromosome sizes file and the bedgraph of one replicate directory.
+# Output: results/bigwig/{dir}/{sample}.bw
+rule bedgraph2bigwig:
+    input:
+        cs=f"resources/{resources.genome}_chrom.sizes",
+        bg="results/bedgraph/{dir}/{sample}-vs-Dam.gatc.bedgraph", # exclude dam sample from sample wildcard here!!!!
+    output:
+        bw="results/bigwig/{dir}/{sample}.bw",
+    params:
+        extra="",
+    threads: config["resources"]["fastqc"]["cpu"]
+    resources:
+        runtime=config["resources"]["fastqc"]["time"],
+    # The shell command redirects to {log}, so the rule must declare a log
+    # file that carries the same wildcards as the output.
+    log:
+        "logs/bedgraph2bigwig/{dir}/{sample}.log",
+    conda:
+        "../envs/deeptools.yaml"
+    shell:
+        "bedGraphToBigWig "
+        "{params.extra} "
+        "{input.bg} "
+        "{input.cs} "
+        "{output.bw} > {log} 2>&1"
+
+
+# Average the bigWig files of one sample across all replicate directories.
+# Requires a `deeptools` entry (cpu, time) under `resources` in the config.
+# NOTE(review): the output path also matches the output pattern of rule
+# bedgraph2bigwig with dir="average_bw"; confirm this ambiguity is resolved
+# (e.g. ruleorder or a wildcard constraint on `dir`).
+rule average_bigwigs:
+    input:
+        # Only the replicates of the requested sample; {{sample}} keeps the
+        # wildcard unexpanded so it is filled in per job.
+        expand("results/bigwig/{dir}/{{sample}}.bw", dir=DIRS),
+    output:
+        bw="results/bigwig/average_bw/{sample}.bw",
+    params:
+        extra="",
+    threads: config["resources"]["deeptools"]["cpu"]
+    resources:
+        runtime=config["resources"]["deeptools"]["time"],
+    # Log files must use the same wildcards as the output ({sample}).
+    log:
+        "logs/deeptools/bw_average_{sample}.log",
+    conda:
+        "../envs/deeptools.yaml"
+    script:
+        "../scripts/average_bigwig.py"
diff --git a/workflow/rules/deeptools.smk b/workflow/rules/deeptools.smk
diff --git a/workflow/rules/peak_calling.smk b/workflow/rules/peak_calling.smk
@@ -0,0 +1,14 @@
+# Call peaks on all "<sample>-vs-Dam" bedgraphs.
+# TODO: output is still a placeholder; the rule is not runnable yet.
+rule peak_calling:
+    input:
+        fp="resources/find_peaks",
+        # expand() takes plain wildcard names only, so the Dam-only control is
+        # excluded by filtering the sample list instead of using a regex.
+        bg=expand(
+            "results/bedgraph/{dir}/{sample}-vs-Dam.gatc.bedgraph",
+            dir=DIRS,
+            sample=[s for s in SAMPLES if "Dam" not in s],
+        ),
+    output:
+        ""
+
+# Annotate called peaks with genes from the GTF annotation.
+# TODO: `peaks` input and output are still placeholders; not runnable yet.
+rule peaks2genes:
+    input:
+        fp="resources/find_peaks",
+        gtf=resources.gtf,
+        peaks="",
+    output:
+        "",
diff --git a/workflow/rules/setup.smk b/workflow/rules/setup.smk
@@ -0,0 +1,43 @@
+# Build the GATC fragment track (GFF) for the configured genome from its
+# fasta file using gatc.track.maker.pl.
+# NOTE(review): assumes the script writes "<name>.GATC.gff" — confirm.
+rule make_gatc_tracks:
+    input:
+        fa=resources.fasta,
+    output:
+        gatc=f"resources/{resources.genome}.GATC.gff",
+    params:
+        genome=f"resources/{resources.genome}",
+    threads: config["resources"]["fastqc"]["cpu"]
+    resources:
+        # `runtime` matches the resource name used by the other rules.
+        runtime=config["resources"]["fastqc"]["time"],
+    conda:
+        "../envs/damid.yaml"
+    # A log path cannot reference params ("{params.genome}" would be parsed
+    # as a wildcard), so the genome name is interpolated directly.
+    log:
+        f"logs/make_gatc_tracks/{resources.genome}.log",
+    shell:
+        "gatc.track.maker.pl "
+        "--name={params.genome} "
+        "{input.fa} > {log} 2>&1 "
+
+
+# Build the bowtie2 index for the configured genome with the Snakemake
+# bowtie2/build wrapper. Produces the six .bt2 files under
+# resources/bowtie2_index/<genome>/ with the prefix "index".
+rule bowtie2_build_index:
+    input:
+        ref=resources.fasta,
+    output:
+        multiext(
+            f"resources/bowtie2_index/{resources.genome}/index",
+            ".1.bt2",
+            ".2.bt2",
+            ".3.bt2",
+            ".4.bt2",
+            ".rev.1.bt2",
+            ".rev.2.bt2",
+        ),
+    params:
+        extra="",  # optional parameters
+    threads: config["resources"]["index"]["cpu"]
+    resources:
+        runtime=config["resources"]["index"]["time"],
+    log:
+        "logs/bowtie2_build_index/build.log",
+    wrapper:
+        "v3.3.3/bio/bowtie2/build"
+
diff --git a/workflow/scripts/average_bigwig.py b/workflow/scripts/average_bigwig.py
@@ -0,0 +1,22 @@
+from snakemake.shell import shell
+
+# Load Snakemake variables
+log = snakemake.log_fmt_shell(stdout=True, stderr=True)
+threads = snakemake.threads
+
+all_bw = snakemake.input
+sample = snakemake.wildcards["sample"]
+out = snakemake.output["bw"]
+
+# Get all samples in condition
+bw = [x for x in all_bw if sample in x] # use input lambda function instead? (just shell command in rule)
+
+# Create average bigwig file
+shell(
+    "bigwigAverage "
+    "--bigwigs {bw} "
+    "--outFileName {out} "
+    "--numberOfProcessors {threads} "
+    "{log}"
+    )
+
diff --git a/workflow/scripts/damidseq_pipeline.sh b/workflow/scripts/damidseq_pipeline.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+
+set -e
+
+# get the current working directory
+WORKDIR=$(pwd)
+
+# go to directory with fastq files
+SAMPLEDIR=${snakemake_wildcards[dir]}
+cd "reads/${SAMPLEDIR}"
+
+# Check if data is paired-end or single-end
+END=${snakemake_params[paired]}
+
+if [ "$END" == "True" ]; then
+    ARG="--paired"
+else
+    ARG=""
+fi
+
+# run DamID-seq pipeline
+damidseq_pipeline $ARG --gatc_frag_file=../${snakemake_input[gatc]} --bowtie2_genome_dir=../${snakemake_params[idxdir]} > ../${snakemake_log[0]} 2>&1
+
+# go back to working directory
+cd ${WORKDIR}
+
+# move output files to results/sample/bedgraph
+mkdir -p ${snakemake_output[dir]}
+mv *.bedgraph ${snakemake_output[dir]}
+#mkdir -p logs/damid_pipeline/
+#mv pipeline-*.log ${WORKDIR}/logs/damid_pipeline/
+
+
diff --git a/workflow/scripts/general_functions.smk b/workflow/scripts/general_functions.smk
@@ -0,0 +1,60 @@
+def dirs():
+    """Each dir contains one replicate sets of fastq files
+    """
+    DIRS = glob.glob("reads/*")
+    DIRS = [os.path.basename(d) for d in DIRS]
+        
+    return DIRS
+    
+
+def samples():
+    """Checks sample names/files and returns sample wildcard values for Snakemake
+    """
+    SAMPLES = csv["sample"]
+    
+    # Check if sample names contain any characters that are not alphanumeric or underscore
+    illegal = []
+    for sample in SAMPLES:
+        if not re.match("^[a-zA-Z0-9_]*$", sample):
+            illegal.append(sample)
+    if len(illegal) != 0:
+        illegal = "\n".join(illegal)
+        raise ValueError(f"ERROR: following samples contain illegal characters:\n{illegal}")
+
+    # Check if sample names match file names
+    not_found = []
+    for sample in SAMPLES:
+        for dir in DIRS:
+            if config["paired_end"]:
+                r1= f"reads/{dir}/{sample}_R1_001.fastq.gz"
+                r2= f"reads/{dir}/{sample}_R2_001.fastq.gz"
+                if not os.path.isfile(r1):
+                    not_found.append(r1)
+                if not os.path.isfile(r2):
+                    not_found.append(r2)
+            else:
+                r1= f"reads/{dir}/{sample}.fastq.gz"
+                if not os.path.isfile(r1):
+                    not_found.append(r1)
+    if len(not_found) != 0:
+        not_found = "\n".join(not_found)
+        raise ValueError(f"ERROR: following files not found:\n{not_found}")
+
+    return SAMPLES
+
+
+def targets():
+    """Returns file targets for rule all
+    """
+    TARGETS = [
+        expand("results/bedgraph/{dir}", dir=DIRS),
+    ]
+
+    return TARGETS
+
+
+def dam_control():
+    """Check if Dam only control is present.
+
+    Not implemented yet: the function currently does nothing and returns None.
+    """
+    # TODO: implement the check for the Dam-only control sample
+    pass
+    
diff --git a/workflow/scripts/get_resource.sh b/workflow/scripts/get_resource.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+# Download a gzipped resource from snakemake_params[url] and decompress it to
+# snakemake_output[0]; stderr of both steps goes to snakemake_log[0].
+
+# Stop at the first failing command so a failed download is not passed on to
+# pigz and reported as a decompression result.
+set -eo pipefail
+
+LOG=${snakemake_log[0]}
+URL=${snakemake_params["url"]}
+OUTPUT=${snakemake_output[0]}
+
+wget -q "$URL" -O "$OUTPUT.gz" 2> "$LOG"
+pigz -df "$OUTPUT.gz" 2>> "$LOG"
diff --git a/workflow/scripts/trim_galore.sh b/workflow/scripts/trim_galore.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+
+set -e
+
+
+#TEMP_DIR=${snakemake_output[temp_dir]}
+#TEMP_DIR=temp/${snakemake_wildcards[dir]}/${snakemake_wildcards[sample]}
+DEST_DIR=results/trimmed/${snakemake_wildcards[dir]}
+BASENAME=${snakemake_wildcards[sample]}
+
+#mkdir -p $TEMP_DIR
+mkdir -p $DEST_DIR
+
+if [ ${snakemake_params[paired]} == "YES" ]
+    then
+    INPUT="${snakemake_input[r1]} ${snakemake_input[r2]}"
+else 
+    INPUT=${snakemake_input[r1]}
+fi
+
+trim_galore ${snakemake_params[extra]} --cores ${snakemake[threads]} --output_dir $DEST_DIR --basename $BASENAME $INPUT > ${snakemake_log[0]} 2>&1
+
+#mv $TEMP_DIR/*_trimmed.fq.gz $DEST_DIR
+#mv $TEMP_DIR/*trimming_report.txt $DEST_DIR
+
+#rm -r $TEMP_DIR
+