From 959df16cd7fea14bb85b5731410ff726f3714962 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20K=C3=B6ster?=
Date: Fri, 6 Oct 2017 15:39:03 +0200
Subject: [PATCH] Initial import.

---
 .gitignore                 | 19 ++++++++++++++
 LICENSE                    | 21 +++++++++++++++
 README.md                  | 44 +++++++++++++++++++++++++++++++
 Snakefile                  | 20 ++++++++++++++
 config.yaml                | 18 +++++++++++++
 envs/deseq2.yaml           |  6 +++++
 rules/align.smk            | 32 ++++++++++++++++++++++
 rules/diffexp.smk          | 54 ++++++++++++++++++++++++++++++++++++++
 rules/trim.smk             | 34 ++++++++++++++++++++++++
 samples.tsv                |  1 +
 scripts/common/__init__.py |  1 +
 scripts/count-matrix.py    | 11 ++++++++
 scripts/deseq2-init.R      | 20 ++++++++++++++
 scripts/deseq2.R           | 21 +++++++++++++++
 scripts/pca.R              | 11 ++++++++
 15 files changed, 313 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 LICENSE
 create mode 100644 README.md
 create mode 100644 Snakefile
 create mode 100644 config.yaml
 create mode 100644 envs/deseq2.yaml
 create mode 100644 rules/align.smk
 create mode 100644 rules/diffexp.smk
 create mode 100644 rules/trim.smk
 create mode 100644 samples.tsv
 create mode 100644 scripts/common/__init__.py
 create mode 100644 scripts/count-matrix.py
 create mode 100644 scripts/deseq2-init.R
 create mode 100644 scripts/deseq2.R
 create mode 100644 scripts/pca.R

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e467e8f
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,19 @@
+*
+!scripts
+!scripts/*
+!scripts/common
+!scripts/common/*
+scripts/.snakemake*
+!Snakefile
+!config.yaml
+!samples.tsv
+!resources
+!resources/*
+!envs
+!envs/*
+!environment.yaml
+!LICENSE
+!README.md
+!rules
+!rules/*
+!.gitignore
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..ff1037d
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2017, Johannes Köster
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..3477230
--- /dev/null
+++ b/README.md
@@ -0,0 +1,44 @@
+# Snakemake workflow: rna-seq-star-deseq2
+
+[![Snakemake](https://img.shields.io/badge/snakemake-≥4.1.0-brightgreen.svg)](https://snakemake.bitbucket.io)
+[![Build Status](https://travis-ci.org/snakemake-workflows/rna-seq-spew.svg?branch=master)](https://travis-ci.org/snakemake-workflows/rna-seq-spew)
+
+This workflow performs a differential expression analysis with STAR and DESeq2.
+It is currently under development. No stable release is available yet.
+
+## Authors
+
+* Johannes Köster (@johanneskoester)
+
+## Usage
+
+### Step 1: Install workflow
+
+If you simply want to use this workflow, download and extract the [latest release](https://github.com/snakemake-workflows/rna-seq-spew/releases).
+If you intend to modify and further develop this workflow, fork this repository. Please consider providing any generally applicable modifications via a pull request.
+
+In any case, if you use this workflow in a paper, don't forget to give credit to the authors by citing the URL of this repository and, if available, its DOI (see above).
+
+### Step 2: Configure workflow
+
+Configure the workflow according to your needs by editing the file `config.yaml` and the sample sheet `samples.tsv`.
+
+### Step 3: Execute workflow
+
+Test your configuration by performing a dry-run via
+
+    snakemake -n
+
+Execute the workflow locally via
+
+    snakemake --cores $N
+
+using `$N` cores or run it in a cluster environment via
+
+    snakemake --cluster qsub --jobs 100
+
+or
+
+    snakemake --drmaa --jobs 100
+
+See the [Snakemake documentation](https://snakemake.readthedocs.io) for further details.
diff --git a/Snakefile b/Snakefile
new file mode 100644
index 0000000..55c4070
--- /dev/null
+++ b/Snakefile
@@ -0,0 +1,20 @@
+import pandas as pd
+
+
+configfile: "config.yaml"
+
+# sample sheet indexed by sample name; fq2 is empty for single-end samples
+samples = pd.read_table("samples.tsv", index_col=0)
+
+
+rule all:
+    input:
+        # must match the output pattern of rule deseq2 (rules/diffexp.smk)
+        expand("results/diffexp/{contrast}.diffexp.tsv",
+               contrast=config["diffexp"]["contrasts"]),
+        "results/pca.pdf"
+
+
+include: "rules/trim.smk"
+include: "rules/align.smk"
+include: "rules/diffexp.smk"
diff --git a/config.yaml b/config.yaml
new file mode 100644
index 0000000..106d668
--- /dev/null
+++ b/config.yaml
@@ -0,0 +1,18 @@
+# the sequencing adapter
+adapter: ACGGATCGATCGATCGATCGAT
+
+star:
+  # the STAR index
+  index: "path/to/star/index"
+
+pca:
+  labels:
+    # columns of sample sheet to use for PCA
+    - condition
+
+diffexp:
+  # contrasts for the deseq2 results method
+  contrasts:
+    treated-vs-untreated:
+      - treated
+      - untreated
diff --git a/envs/deseq2.yaml b/envs/deseq2.yaml
new file mode 100644
index 0000000..b1c2344
--- /dev/null
+++ b/envs/deseq2.yaml
@@ -0,0 +1,6 @@
+channels:
+  - bioconda
+  - conda-forge
+  - defaults
+dependencies:
+  - bioconductor-deseq2 =1.16.1
diff --git a/rules/align.smk b/rules/align.smk
new file mode 100644
index 0000000..0530f81
--- /dev/null
+++ b/rules/align.smk
@@ -0,0 +1,32 @@
+def get_trimmed(wildcards):
+    """Return the trimmed FASTQ file(s) of the sample given by the wildcards.
+
+    Samples with an entry in the fq2 column of the sample sheet are treated
+    as paired-end and yield both mates, all others yield a single file.
+    """
+    if not pd.isnull(samples.loc[wildcards.sample, "fq2"]):
+        # paired-end sample
+        return expand("trimmed/{sample}.{group}.fastq.gz",
+                      sample=wildcards.sample, group=[1, 2])
+    # single-end sample; Snakemake does not substitute wildcards in the
+    # return value of an input function, so format the path explicitly
+    return "trimmed/{sample}.fastq.gz".format(sample=wildcards.sample)
+
+
+rule align:
+    input:
+        sample=get_trimmed
+    output:
+        # see STAR manual for additional output files
+        "star/{sample}/Aligned.out.bam",
+        "star/{sample}/ReadsPerGene.out.tab"
+    log:
+        "logs/star/{sample}.log"
+    params:
+        # path to STAR reference genome index
+        index=config["star"]["index"],
+        # optional parameters
+        extra="--quantMode GeneCounts"
+    threads: 8
+    wrapper:
+        "0.17.4/bio/star/align"
diff --git a/rules/diffexp.smk b/rules/diffexp.smk
new file mode 100644
index 0000000..3f3790d
--- /dev/null
+++ b/rules/diffexp.smk
@@ -0,0 +1,54 @@
+rule count_matrix:
+    input:
+        expand("star/{sample}/ReadsPerGene.out.tab", sample=samples.index)
+    output:
+        "counts/all.tsv"
+    params:
+        # plain list of sample names, in the same order as the input files
+        samples=samples.index.tolist()
+    script:
+        "../scripts/count-matrix.py"
+
+
+rule deseq2_init:
+    input:
+        counts="counts/all.tsv",
+        samples="samples.tsv"
+    output:
+        "deseq2/all.RData"
+    conda:
+        "../envs/deseq2.yaml"
+    script:
+        "../scripts/deseq2-init.R"
+
+
+rule pca:
+    input:
+        "deseq2/all.RData"
+    output:
+        "results/pca.pdf"
+    params:
+        pca_labels=config["pca"]["labels"]
+    conda:
+        "../envs/deseq2.yaml"
+    script:
+        "../scripts/pca.R"
+
+
+def get_contrast(wildcards):
+    """Return the pair of condition levels configured for the contrast."""
+    return config["diffexp"]["contrasts"][wildcards.contrast]
+
+
+rule deseq2:
+    input:
+        "deseq2/all.RData"
+    output:
+        table="results/diffexp/{contrast}.diffexp.tsv",
+        ma_plot="results/diffexp/{contrast}.ma-plot.pdf",
+    params:
+        contrast=get_contrast
+    conda:
+        "../envs/deseq2.yaml"
+    script:
+        "../scripts/deseq2.R"
diff --git a/rules/trim.smk b/rules/trim.smk
new file mode 100644
index 0000000..80f6ce7
--- /dev/null
+++ b/rules/trim.smk
@@ -0,0 +1,34 @@
+def get_fastq(wildcards):
+    """Return the raw FASTQ path(s) of a sample (one entry if single-end)."""
+    return samples.loc[wildcards.sample, ["fq1", "fq2"]].dropna().tolist()
+
+
+rule cutadapt_pe:
+    input:
+        get_fastq
+    output:
+        fastq1="trimmed/{sample}.1.fastq.gz",
+        fastq2="trimmed/{sample}.2.fastq.gz",
+        qc="trimmed/{sample}.qc.txt"
+    params:
+        # adapter trimming option built from the adapter set in config.yaml
+        "-a {}".format(config["adapter"])
+    log:
+        "logs/cutadapt/{sample}.log"
+    wrapper:
+        "0.17.4/bio/cutadapt/pe"
+
+
+rule cutadapt:
+    input:
+        get_fastq
+    output:
+        fastq="trimmed/{sample}.fastq.gz",
+        qc="trimmed/{sample}.qc.txt"
+    params:
+        # adapter trimming option built from the adapter set in config.yaml
+        "-a {}".format(config["adapter"])
+    log:
+        "logs/cutadapt/{sample}.log"
+    wrapper:
+        "0.17.4/bio/cutadapt/se"
diff --git a/samples.tsv b/samples.tsv
new file mode 100644
index 0000000..346fd48
--- /dev/null
+++ b/samples.tsv
@@ -0,0 +1 @@
+sample	condition	fq1	fq2
diff --git a/scripts/common/__init__.py b/scripts/common/__init__.py
new file mode 100644
index 0000000..76b38be
--- /dev/null
+++ b/scripts/common/__init__.py
@@ -0,0 +1 @@
+# Any Python script in the scripts folder will be able to import from this module and beyond.
diff --git a/scripts/count-matrix.py b/scripts/count-matrix.py
new file mode 100644
index 0000000..1e04c8b
--- /dev/null
+++ b/scripts/count-matrix.py
@@ -0,0 +1,11 @@
+import pandas as pd
+
+# STAR's ReadsPerGene.out.tab has no header: column 0 holds the gene ID and
+# column 1 the unstranded read count; the first four rows are summary
+# statistics (N_unmapped, ...) rather than genes and are skipped
+counts = [pd.read_table(f, index_col=0, header=None, skiprows=4)[1]
+          for f in snakemake.input]
+# input files are listed in the order of params.samples (rule count_matrix)
+matrix = pd.concat(counts, axis=1, keys=snakemake.params.samples)
+matrix.index.name = "gene"
+matrix.to_csv(snakemake.output[0], sep="\t")
diff --git a/scripts/deseq2-init.R b/scripts/deseq2-init.R
new file mode 100644
index 0000000..001a8d9
--- /dev/null
+++ b/scripts/deseq2-init.R
@@ -0,0 +1,20 @@
+library("DESeq2")
+
+# DESeqDataSetFromMatrix expects the data itself rather than file names
+cts <- read.table(snakemake@input[["counts"]], header=TRUE, row.names=1,
+                  sep="\t", check.names=FALSE)
+coldata <- read.table(snakemake@input[["samples"]], header=TRUE, row.names=1,
+                      sep="\t", check.names=FALSE)
+
+# colData and countData must have the same sample order, but this is ensured
+# by the way we create the count matrix
+dds <- DESeqDataSetFromMatrix(countData=cts,
+                              colData=coldata,
+                              design=~ condition)
+
+# remove uninformative rows (genes with at most one read over all samples)
+dds <- dds[ rowSums(counts(dds)) > 1, ]
+# TODO optionally allow to collapse technical replicates
+dds <- DESeq(dds)
+
+save(dds, file=snakemake@output[[1]])
diff --git a/scripts/deseq2.R b/scripts/deseq2.R
new file mode 100644
index 0000000..e6929ce
--- /dev/null
+++ b/scripts/deseq2.R
@@ -0,0 +1,21 @@
+library("DESeq2")
+
+# load() restores the saved object under its original name ("dds") and only
+# returns that name, so its return value must not be assigned to dds
+load(snakemake@input[[1]])
+
+contrast <- c("condition", snakemake@params[["contrast"]])
+res <- results(dds, contrast=contrast)
+# shrink fold changes for lowly expressed genes
+res <- lfcShrink(dds, contrast=contrast, res=res)
+# sort by adjusted p-value
+res <- res[order(res$padj),]
+# TODO explore IHW usage
+
+
+# store results
+pdf(snakemake@output[["ma_plot"]])
+plotMA(res, ylim=c(-2,2))
+dev.off()
+
+write.table(as.data.frame(res), file=snakemake@output[["table"]], sep="\t")
diff --git a/scripts/pca.R b/scripts/pca.R
new file mode 100644
index 0000000..71cb976
--- /dev/null
+++ b/scripts/pca.R
@@ -0,0 +1,11 @@
+library("DESeq2")
+
+# load deseq2 data; load() restores the object as "dds" and returns only its
+# name, so the return value must not be assigned
+load(snakemake@input[[1]])
+
+# obtain normalized counts
+ntd <- normTransform(dds)
+pdf(snakemake@output[[1]])
+plotPCA(ntd, intgroup=snakemake@params[["pca_labels"]])
+dev.off()