From 959df16cd7fea14bb85b5731410ff726f3714962 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20K=C3=B6ster?= <johannes.koester@tu-dortmund.de>
Date: Fri, 6 Oct 2017 15:39:03 +0200
Subject: [PATCH] Initial import.

---
 .gitignore                 | 19 ++++++++++++++
 LICENSE                    | 21 +++++++++++++++
 README.md                  | 44 ++++++++++++++++++++++++++++++++
 Snakefile                  | 17 +++++++++++++
 config.yaml                | 18 +++++++++++++
 envs/deseq2.yaml           |  6 +++++
 rules/align.smk            | 26 +++++++++++++++++++
 rules/diffexp.smk          | 52 ++++++++++++++++++++++++++++++++++++++
 rules/trim.smk             | 31 +++++++++++++++++++++++
 samples.tsv                |  1 +
 scripts/common/__init__.py |  1 +
 scripts/count-matrix.py    |  5 ++++
 scripts/deseq2-init.R      | 14 ++++++++++
 scripts/deseq2.R           | 19 ++++++++++++++
 scripts/pca.R              | 10 ++++++++
 15 files changed, 284 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 LICENSE
 create mode 100644 README.md
 create mode 100644 Snakefile
 create mode 100644 config.yaml
 create mode 100644 envs/deseq2.yaml
 create mode 100644 rules/align.smk
 create mode 100644 rules/diffexp.smk
 create mode 100644 rules/trim.smk
 create mode 100644 samples.tsv
 create mode 100644 scripts/common/__init__.py
 create mode 100644 scripts/count-matrix.py
 create mode 100644 scripts/deseq2-init.R
 create mode 100644 scripts/deseq2.R
 create mode 100644 scripts/pca.R

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e467e8f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,19 @@
+*
+!scripts
+!scripts/*
+!scripts/common
+!scripts/common/*
+scripts/.snakemake*
+!Snakefile
+!config.yaml
+!samples.tsv
+!resources
+!resources/*
+!envs
+!envs/*
+!environment.yaml
+!LICENSE
+!README.md
+!rules
+!rules/*
+!.gitignore
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..ff1037d
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017, Johannes Köster
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..3477230
--- /dev/null
+++ b/README.md
@@ -0,0 +1,44 @@
+# Snakemake workflow: rna-seq-star-deseq2
+
+[![Snakemake](https://img.shields.io/badge/snakemake-≥4.1.0-brightgreen.svg)](https://snakemake.bitbucket.io)
+[![Build Status](https://travis-ci.org/snakemake-workflows/rna-seq-star-deseq2.svg?branch=master)](https://travis-ci.org/snakemake-workflows/rna-seq-star-deseq2)
+
+This workflow performs a differential expression analysis with STAR and DESeq2.
+It is currently under development. No stable release is available yet.
+
+## Authors
+
+* Johannes Köster (@johanneskoester)
+
+## Usage
+
+### Step 1: Install workflow
+
+If you simply want to use this workflow, download and extract the [latest release](https://github.com/snakemake-workflows/rna-seq-star-deseq2/releases).
+If you intend to modify and further develop this workflow, fork this repository. Please consider providing any generally applicable modifications via a pull request.
+
+In any case, if you use this workflow in a paper, don't forget to give credit to the authors by citing the URL of this repository and, if available, its DOI (see above).
+
+### Step 2: Configure workflow
+
+Configure the workflow according to your needs via editing the file `config.yaml`.
+
+### Step 3: Execute workflow
+
+Test your configuration by performing a dry-run via
+
+    snakemake -n
+
+Execute the workflow locally via
+
+    snakemake --cores $N
+
+using `$N` cores or run it in a cluster environment via
+
+    snakemake --cluster qsub --jobs 100
+
+or
+
+    snakemake --drmaa --jobs 100
+
+See the [Snakemake documentation](https://snakemake.readthedocs.io) for further details.
diff --git a/Snakefile b/Snakefile
new file mode 100644
index 0000000..55c4070
--- /dev/null
+++ b/Snakefile
@@ -0,0 +1,17 @@
+import pandas as pd
+
+
+configfile: "config.yaml"  # fills the global `config` dict used by the rules
+samples = pd.read_table("samples.tsv", index_col=0)  # sample sheet, indexed by its first column
+
+
+rule all:  # default target: one result table per configured contrast, plus the PCA plot
+    input:
+        expand("results/diffexp/{contrast}.diffexp.tsv",
+               contrast=config["diffexp"]["contrasts"]),
+        "results/pca.pdf"
+
+
+include: "rules/trim.smk"
+include: "rules/align.smk"
+include: "rules/diffexp.smk"
diff --git a/config.yaml b/config.yaml
new file mode 100644
index 0000000..106d668
--- /dev/null
+++ b/config.yaml
@@ -0,0 +1,18 @@
+# the sequencing adapter
+adapter: ACGGATCGATCGATCGATCGAT
+
+star:
+  # the STAR index
+  index: "path/to/star/index"
+
+pca:
+  labels:
+    # columns of sample sheet to use for PCA
+    - condition
+
+diffexp:
+  # contrasts for the DESeq2 results() method, as <name>: [numerator level, denominator level] of the "condition" column
+  contrasts:
+    treated-vs-untreated:
+      - treated
+      - untreated
diff --git a/envs/deseq2.yaml b/envs/deseq2.yaml
new file mode 100644
index 0000000..b1c2344
--- /dev/null
+++ b/envs/deseq2.yaml
@@ -0,0 +1,6 @@
+channels:
+  - bioconda
+  - conda-forge
+  - defaults
+dependencies:
+  - bioconductor-deseq2 =1.16.1
diff --git a/rules/align.smk b/rules/align.smk
new file mode 100644
index 0000000..0530f81
--- /dev/null
+++ b/rules/align.smk
@@ -0,0 +1,26 @@
+def get_trimmed(wildcards):
+    if pd.notnull(samples.loc[wildcards.sample, "fq2"]):
+        # paired-end sample (a missing fq2 is read as NaN, which is truthy, hence the explicit null check)
+        return expand("trimmed/{sample}.{group}.fastq.gz",
+                      sample=wildcards.sample, group=[1, 2])
+    # single-end sample; results of input functions are not wildcard-expanded, so format here
+    return "trimmed/{sample}.fastq.gz".format(sample=wildcards.sample)
+
+
+rule align:  # runs the STAR align wrapper on the trimmed reads of one sample
+    input:
+        sample=get_trimmed  # one (single-end) or two (paired-end) trimmed FASTQ files
+    output:
+        # see STAR manual for additional output files
+        "star/{sample}/Aligned.out.bam",
+        "star/{sample}/ReadsPerGene.out.tab"  # consumed by rule count_matrix
+    log:
+        "logs/star/{sample}.log"
+    params:
+        # path to STAR reference genome index (taken from config.yaml)
+        index=config["star"]["index"],
+        # extra STAR options; GeneCounts presumably yields ReadsPerGene.out.tab (confirm in STAR manual)
+        extra="--quantMode GeneCounts"
+    threads: 8
+    wrapper:
+        "0.17.4/bio/star/align"
diff --git a/rules/diffexp.smk b/rules/diffexp.smk
new file mode 100644
index 0000000..3f3790d
--- /dev/null
+++ b/rules/diffexp.smk
@@ -0,0 +1,52 @@
+rule count_matrix:  # merges the per-sample STAR gene counts into one table
+    input:
+        expand("star/{sample}/ReadsPerGene.out.tab", sample=samples.index)
+    output:
+        "counts/all.tsv"
+    params:
+        samples=samples.index  # sample names, in the same order as the input files
+    script:
+        "../scripts/count-matrix.py"
+
+
+rule deseq2_init:  # runs scripts/deseq2-init.R on the count matrix and the sample sheet
+    input:
+        counts="counts/all.tsv",  # output of rule count_matrix
+        samples="samples.tsv"
+    output:
+        "deseq2/all.RData"  # input of rules pca and deseq2
+    conda:
+        "../envs/deseq2.yaml"
+    script:
+        "../scripts/deseq2-init.R"
+
+
+rule pca:  # runs scripts/pca.R to draw the PCA plot
+    input:
+        "deseq2/all.RData"  # output of rule deseq2_init
+    output:
+        "results/pca.pdf"
+    params:
+        pca_labels=config["pca"]["labels"]  # sample sheet columns, passed to the script as `intgroup`
+    conda:
+        "../envs/deseq2.yaml"
+    script:
+        "../scripts/pca.R"
+
+
+def get_contrast(wildcards):  # params function: config entry of the contrast named by the wildcard
+    return config["diffexp"]["contrasts"][wildcards.contrast]
+
+
+rule deseq2:  # runs scripts/deseq2.R once per contrast
+    input:
+        "deseq2/all.RData"  # output of rule deseq2_init
+    output:
+        table="results/diffexp/{contrast}.diffexp.tsv",
+        ma_plot="results/diffexp/{contrast}.ma-plot.pdf",
+    params:
+        contrast=get_contrast  # list configured under diffexp/contrasts/<contrast> in config.yaml
+    conda:
+        "../envs/deseq2.yaml"
+    script:
+        "../scripts/deseq2.R"
diff --git a/rules/trim.smk b/rules/trim.smk
new file mode 100644
index 0000000..80f6ce7
--- /dev/null
+++ b/rules/trim.smk
@@ -0,0 +1,31 @@
+def get_fastq(wildcards):  # input function: the sample's fq1/fq2 entries, with missing ones dropped
+    return samples.loc[wildcards.sample, ["fq1", "fq2"]].dropna()
+
+
+rule cutadapt_pe:  # adapter trimming of paired-end samples via the cutadapt/pe wrapper
+    input:
+        get_fastq
+    output:
+        fastq1="trimmed/{sample}.1.fastq.gz",
+        fastq2="trimmed/{sample}.2.fastq.gz",
+        qc="trimmed/{sample}.qc.txt"
+    params:  # config.yaml defines `adapter` but no `cutadapt` section. NOTE(review): -a only covers read 1
+        "-a {}".format(config["adapter"])
+    log:
+        "logs/cutadapt/{sample}.log"
+    wrapper:
+        "0.17.4/bio/cutadapt/pe"
+
+
+rule cutadapt:  # adapter trimming of single-end samples via the cutadapt/se wrapper
+    input:
+        get_fastq
+    output:
+        fastq="trimmed/{sample}.fastq.gz",
+        qc="trimmed/{sample}.qc.txt"
+    params:  # config.yaml defines `adapter` but no `cutadapt` section, so build the option from it
+        "-a {}".format(config["adapter"])
+    log:
+        "logs/cutadapt/{sample}.log"
+    wrapper:
+        "0.17.4/bio/cutadapt/se"
diff --git a/samples.tsv b/samples.tsv
new file mode 100644
index 0000000..346fd48
--- /dev/null
+++ b/samples.tsv
@@ -0,0 +1 @@
+sample	condition	fq1	fq2
diff --git a/scripts/common/__init__.py b/scripts/common/__init__.py
new file mode 100644
index 0000000..76b38be
--- /dev/null
+++ b/scripts/common/__init__.py
@@ -0,0 +1 @@
+# Any Python script in the scripts folder will be able to import from this module and beyond.
diff --git a/scripts/count-matrix.py b/scripts/count-matrix.py
new file mode 100644
index 0000000..1e04c8b
--- /dev/null
+++ b/scripts/count-matrix.py
@@ -0,0 +1,5 @@
+import pandas as pd
+# ReadsPerGene.out.tab: no header, 4 leading N_* summary rows, column 1 = unstranded counts (see STAR manual)
+matrix = pd.concat([pd.read_table(f, index_col=0, header=None, skiprows=4)[1]
+                    for f in snakemake.input], axis=1, keys=snakemake.params.samples)
+matrix.rename_axis("gene").to_csv(snakemake.output[0], sep="\t")
diff --git a/scripts/deseq2-init.R b/scripts/deseq2-init.R
new file mode 100644
index 0000000..001a8d9
--- /dev/null
+++ b/scripts/deseq2-init.R
@@ -0,0 +1,14 @@
+library("DESeq2")
+
+# colData and countData must have the same sample order, but this is ensured
+# by the way we create the count matrix
+cts <- read.table(snakemake@input[["counts"]], header=TRUE, sep="\t", row.names=1, check.names=FALSE)
+coldata <- read.table(snakemake@input[["samples"]], header=TRUE, sep="\t", row.names=1, check.names=FALSE)
+dds <- DESeqDataSetFromMatrix(countData=cts, colData=coldata, design=~ condition)
+
+# remove uninformative rows (genes with at most one read over all samples)
+dds <- dds[ rowSums(counts(dds)) > 1, ]
+# TODO optionally allow to collapse technical replicates
+dds <- DESeq(dds)
+
+save(dds, file=snakemake@output[[1]])
diff --git a/scripts/deseq2.R b/scripts/deseq2.R
new file mode 100644
index 0000000..e6929ce
--- /dev/null
+++ b/scripts/deseq2.R
@@ -0,0 +1,19 @@
+library("DESeq2")
+# Computes the DESeq2 results for one contrast; writes an MA plot and a result table.
+# load() restores the saved `dds` object; its return value is only the object name
+load(snakemake@input[[1]])
+contrast <- c("condition", snakemake@params[["contrast"]])
+res <- results(dds, contrast=contrast)
+# shrink fold changes for lowly expressed genes
+res <- lfcShrink(dds, contrast=contrast, res=res)
+# sort by adjusted p-value
+res <- res[order(res$padj),]
+# TODO explore IHW usage
+
+
+# store results
+pdf(snakemake@output[["ma_plot"]])
+plotMA(res, ylim=c(-2,2))
+dev.off()
+# tab-separated to match the .tsv name; col.names=NA adds a header cell for the row names
+write.table(as.data.frame(res), file=snakemake@output[["table"]], sep="\t", quote=FALSE, col.names=NA)
diff --git a/scripts/pca.R b/scripts/pca.R
new file mode 100644
index 0000000..71cb976
--- /dev/null
+++ b/scripts/pca.R
@@ -0,0 +1,10 @@
+library("DESeq2")
+
+# load deseq2 data: load() restores the saved `dds` object (its return value is only the name)
+load(snakemake@input[[1]])
+
+# obtain normalized, log-transformed counts
+ntd <- normTransform(dds)
+pdf(snakemake@output[[1]])
+print(plotPCA(ntd, intgroup=snakemake@params[["pca_labels"]]))  # print explicitly so the plot is always drawn
+dev.off()