Merge pull request #7 from IARCbioinfo/dev

Dev
IARCbioinfo · Jul 10, 2017 · e1795c4 · e1795c4
2 parents ae889d3 + ce432b9
commit e1795c4
Show file tree

Hide file tree

Showing 18 changed files with 2,391 additions and 759 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,125 @@
+# Set the base image to Debian (tag pinned to release 9.0)
+FROM debian:9.0
+
+# File author / maintainer, stored as an image label (the MAINTAINER instruction is deprecated)
+LABEL maintainer="**nalcala** <**[email protected]**>"
+
+# Create the apt archive cache directory and its lock file, refresh the package
+# index and install gnupg2 (presumably for the apt-key call in the next RUN).
+RUN mkdir -p /var/cache/apt/archives/partial \
+ && touch /var/cache/apt/archives/lock \
+ && chmod 640 /var/cache/apt/archives/lock \
+ && apt-get update -y \
+ && apt-get install -y gnupg2
+
+# Install every pipeline tool in a single RUN instruction: build-time packages,
+# downloaded archives and package caches are removed in the same layer that
+# created them. Section headers are comment lines (Docker strips them from the
+# command); empty continuation lines are avoided because Docker deprecates them.
+RUN apt-key adv --keyserver keyserver.ubuntu.com --recv-keys F76221572C52609D && \
+  apt-get clean && \
+  apt-get update -y && \
+  # Install dependencies
+  DEBIAN_FRONTEND=noninteractive apt-get install --no-install-recommends -y \
+  make \
+  g++ \
+  perl \
+  default-jre \
+  zlib1g-dev \
+  libncurses5-dev \
+  libncurses5 \
+  git \
+  wget \
+  ca-certificates \
+  python-dev \
+  python-pip \
+  bzip2 \
+  libbz2-dev \
+  liblzma-dev \
+  libcurl4-openssl-dev \
+  libfreetype6-dev \
+  libpng-dev \
+  unzip \
+  r-base \
+  r-cran-ggplot2 \
+  r-cran-gplots \
+  r-cran-reshape && \
+  # Copy the freetype2 headers directly into /usr/include
+  cp /usr/include/freetype2/*.h /usr/include/. && \
+  # Install the R package gsalib from CRAN
+  Rscript -e 'install.packages("gsalib",repos="http://cran.us.r-project.org")' && \
+  # Install samtools specific version manually
+  wget https://github.com/samtools/samtools/releases/download/1.3.1/samtools-1.3.1.tar.bz2 && \
+  tar -jxf samtools-1.3.1.tar.bz2 && \
+  cd samtools-1.3.1 && \
+  make && \
+  make install && \
+  cd .. && \
+  rm -rf samtools-1.3.1 samtools-1.3.1.tar.bz2 && \
+  # Install FastQC
+  wget http://www.bioinformatics.babraham.ac.uk/projects/fastqc/fastqc_v0.11.5.zip && \
+  unzip fastqc_v0.11.5.zip && \
+  chmod 755 FastQC/fastqc && \
+  cp -r FastQC /usr/local/bin/. && \
+  ln -s /usr/local/bin/FastQC/fastqc /usr/local/bin/ && \
+  rm -rf fastqc_v0.11.5.zip FastQC && \
+  # Install cutadapt (--no-cache-dir keeps pip's download cache out of the image)
+  pip install --no-cache-dir cutadapt && \
+  # Install trim_galore
+  wget https://github.com/FelixKrueger/TrimGalore/archive/0.4.3.tar.gz && \
+  tar xvzf 0.4.3.tar.gz && \
+  mv TrimGalore-0.4.3/trim_galore /usr/bin && \
+  rm -rf TrimGalore-0.4.3 0.4.3.tar.gz && \
+  # Install htseq (numpy and setuptools are installed first, in separate steps)
+  pip install --no-cache-dir numpy && \
+  pip install --no-cache-dir setuptools && \
+  pip install --no-cache-dir HTSeq && \
+  # Install multiqc
+  pip install --no-cache-dir --upgrade --force-reinstall git+https://github.com/nalcala/MultiQC.git && \
+  # Install STAR specific version manually
+  wget https://github.com/alexdobin/STAR/archive/2.5.3a.tar.gz && \
+  tar -xzf 2.5.3a.tar.gz && \
+  cp STAR-2.5.3a/bin/Linux_x86_64_static/STAR /usr/local/bin/. && \
+  rm -rf 2.5.3a.tar.gz STAR-2.5.3a && \
+  # Install hisat2
+  wget ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/downloads/hisat2-2.1.0-Linux_x86_64.zip && \
+  unzip hisat2-2.1.0-Linux_x86_64.zip && \
+  cp -r hisat2-2.1.0/. /usr/local/bin/. && \
+  rm -rf hisat2-2.1.0-Linux_x86_64.zip hisat2-2.1.0 && \
+  # Install RSeQC
+  pip install --no-cache-dir RSeQC && \
+  # Install samblaster specific version manually
+  wget https://github.com/GregoryFaust/samblaster/releases/download/v.0.1.24/samblaster-v.0.1.24.tar.gz && \
+  tar -xzf samblaster-v.0.1.24.tar.gz && \
+  cd samblaster-v.0.1.24 && \
+  make && \
+  cp samblaster /usr/local/bin/. && \
+  cd .. && \
+  rm -rf samblaster-v.0.1.24.tar.gz samblaster-v.0.1.24 && \
+  # Install sambamba specific version manually
+  wget https://github.com/lomereiter/sambamba/releases/download/v0.6.6/sambamba_v0.6.6_linux.tar.bz2 && \
+  tar -jxf sambamba_v0.6.6_linux.tar.bz2 && \
+  cp sambamba_v0.6.6 /usr/local/bin/sambamba && \
+  # Remove the extracted binary as well as the archive so no stray copy is left behind
+  rm -rf sambamba_v0.6.6_linux.tar.bz2 sambamba_v0.6.6 && \
+  # Remove unnecessary dependencies
+  DEBIAN_FRONTEND=noninteractive apt-get remove -y \
+  make \
+  g++ \
+  wget \
+  bzip2 \
+  git \
+  zlib1g-dev \
+  libncurses5-dev && \
+  # Clean: drop orphaned packages, the apt archive cache and the package lists
+  DEBIAN_FRONTEND=noninteractive apt-get autoremove -y && \
+  apt-get clean && \
+  rm -rf /var/lib/apt/lists/*
diff --git a/README.md b/README.md
@@ -1,28 +1,31 @@
 # RNAseq-nf
-RNAseq mapping, quality control, and reads counting nextflow pipeline
 
-## Overview of pipeline workflow
+## Nextflow pipeline for RNA seq processing
+
 ![workflow](RNAseqpipeline.png?raw=true "Scheme of alignment/realignment Workflow")
 
-## Prerequisites
+## Description
+
+Nextflow pipeline for RNA sequencing mapping, quality control, reads counting, and unsupervised analysis
+
+## Dependencies
+
+1. Nextflow : for common installation procedures see the [IARC-nf](https://github.com/IARCbioinfo/IARC-nf) repository.
 
-### General prerequisites
-The following programs need to be installed and in the PATH environment variable:
-- [*fastqc*](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/INSTALL.txt)
-- [*cutadapt*](http://cutadapt.readthedocs.io/en/stable/installation.html), which requires Python version > 2.7
-- [*trim_galore*](https://github.com/FelixKrueger/TrimGalore)
-- [*RESeQC*](http://rseqc.sourceforge.net/)
-- [*multiQC*](http://multiqc.info/docs/)
-- [*STAR*](https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf)
-- [*htseq*](http://www-huber.embl.de/HTSeq/doc/install.html#install); the python script htseq-count must also be in the PATH
-- [*nextflow*](https://www.nextflow.io/docs/latest/getstarted.html)
+2. [*fastqc*](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/INSTALL.txt)
+3. [*cutadapt*](http://cutadapt.readthedocs.io/en/stable/installation.html), which requires Python version > 2.7
+4. [*trim_galore*](https://github.com/FelixKrueger/TrimGalore)
+5. [*RSeQC*](http://rseqc.sourceforge.net/)
+6. [*multiQC*](http://multiqc.info/docs/)
+7. [*STAR*](https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf)
+8. [*htseq*](http://www-huber.embl.de/HTSeq/doc/install.html#install); the python script htseq-count must also be in the PATH
 
 In addition, STAR requires genome indices that can be generated from a genome fasta file ref.fa and a splice junction annotation file ref.gtf using the following command:
 ```bash
 STAR --runThreadN n --runMode genomeGenerate --genomeDir ref --genomeFastaFiles ref.fa --sjdbGTFfile ref.gtf --sjdbOverhang 99
 ```
 
-### Prerequisites for alignment with hisat2
+### Alignment with hisat2
 In order to perform the optional alignment with hisat2, hisat2 must be installed:
 - [*hisat2*](https://ccb.jhu.edu/software/hisat2/index.shtml)
 
@@ -33,7 +36,7 @@ extract_exons.py reference.gtf > genome.exon
 hisat2-build reference.fa --ss genome.ss --exon genome.exon genome_tran
 ```
 
-### Prerequisites for reads trimming at splice junctions
+### Reads trimming at splice junctions
 In order to perform the optional reads trimming at splice junctions, GATK must be installed:
 - GATK [*GenomeAnalysisTK.jar*](https://software.broadinstitute.org/gatk/guide/quickstart)
 
@@ -43,58 +46,136 @@ samtools faidx ref.fa
 java -jar picard.jar CreateSequenceDictionary R= ref.fa O= ref.dict
 ```
 
-### Prerequisites for base quality score recalibration
+### Base quality score recalibration
+In order to perform the optional base quality score recalibration, several files are required:
 - GATK [*GenomeAnalysisTK.jar*](https://software.broadinstitute.org/gatk/guide/quickstart)
 - [GATK bundle](https://software.broadinstitute.org/gatk/download/bundle) VCF files with lists of indels and SNVs (recommended: 1000 genomes indels, Mills gold standard indels VCFs, dbsnp VCF)
 - bed file with intervals to be considered
 
+### Clustering
+In order to perform the optional unsupervised analysis of read counts (PCA and consensus clustering), you need:
+- the unsupervised analysis R script [*RNAseq_unsupervised.R*](https://github.com/IARCbioinfo/RNAseq_analysis_scripts); this script must be in a folder listed in the PATH environment variable (e.g., in /usr/bin/)
+- [R and Rscript](https://cran.r-project.org) with packages ConsensusClusterPlus, ade4, DESeq2, fpc, and cluster
+
+## Input 
+ | Type      | Description     |
+  |-----------|---------------|
+  | --input_folder    | a folder with fastq files or bam files |
+
+
+## Parameters
+
+* #### Mandatory
+| Name | Example value | Description |
+|-----------|--------------:|-------------| 
+| --input_folder | . | input folder |
+|--ref_folder | ref | reference genome folder |
+|--gtf   |  Homo_sapiens.GRCh38.79.gtf | annotation GTF file |
+|--bed   |  gene.bed | bed file with genes for RSeQC | 
+
+
+* #### Optional
+
+| Name | Default value | Description |
+|-----------|--------------|-------------| 
+|--cpu          | 4 | number of CPUs |
+|--mem         | 50 | memory for mapping|
+|--mem_QC     | 2 | memory for QC and counting|
+|--fastq_ext    | fq.gz | extension of fastq files|
+|--suffix1      | \_1 | suffix for first element of read files pair|
+|--suffix2      | \_2 | suffix for second element of read files pair|
+|--output_folder   | . | output folder for aligned BAMs|
+|--ref |    ref.fa | reference genome fasta file for GATK |
+|--GATK_jar |  GenomeAnalysisTK.jar | path to jar file GenomeAnalysisTK.jar |
+|--GATK_bundle |  GATK_bundle | folder with files for BQSR |
+|--RG          |  PL:ILLUMINA | string to be added to read group information in BAM file |
+|--stranded   |  no | Strand information for counting with htseq [no, yes, reverse] | 
+|--hisat2_idx   |  genome_tran | index filename prefix for hisat2 | 
+|--clustering_n | 500 | number of genes to use for clustering |
+|--clustering_t | "vst" | count transformation method; 'rld', 'vst', or 'auto' |
+|--clustering_c | "hc" | clustering algorithm to be passed to ConsensusClusterPlus |
+|--clustering_l | "complete" | method for hierarchical clustering to be passed to ConsensusClusterPlus |
+|--htseq_maxreads| null | maximum number of reads in the htseq buffer; if null, uses the default htseq value 30,000,000 |
+
+* #### Flags
+
+| Name  | Description |
+|-----------|-------------| 
+|--help | print usage and optional parameters |
+|--sjtrim   | enable reads trimming at splice junctions | 
+|--hisat2   | use hisat2 instead of STAR for mapping | 
+|--recalibration  | perform quality score recalibration (GATK)|
+|--clustering  | perform unsupervised analyses of read counts data|
+
+
 ## Usage
-To run the pipeline on a series of paired-end fastq files (with suffixes *_1* and *_2*) in folder *fastq*, and a reference genome with indexes in folder *ref_genome*, one can type:
+To run the pipeline on a series of paired-end fastq files (with suffixes *_1* and *_2*) in folder *fastq*, a reference genome with indexes in folder *ref_genome*, an annotation file ref.gtf, and a bed file ref.bed, one can type:
 ```bash
-nextflow run iarcbioinfo/RNAseq-nf --input_folder fastq --gendir ref_genome --suffix1 _1 --suffix2 _2
+nextflow run iarcbioinfo/RNAseq-nf --input_folder fastq --ref_folder ref_genome --gtf ref.gtf --bed ref.bed
 ``` 
 ### Use hisat2 for mapping
-To use the reads trimming at splice junctions step, you must add the ***--hisat2* option**, specify the path to the folder containing the hisat2 index files, as well as satisfy the requirements above mentionned. For example:
+To use hisat2 instead of STAR for the reads mapping, you must add the ***--hisat2* option**, specify the path to the folder containing the hisat2 index files (genome_tran.1.ht2 to genome_tran.8.ht2), as well as satisfy the requirements mentioned above. For example:
 ```bash
-nextflow run iarcbioinfo/RNAseq-nf --input_folder fastq --suffix1 _1 --suffix2 _2 --hisat2 --hisat2_idx /home/user/reference/genome_tran 
+nextflow run iarcbioinfo/RNAseq-nf --input_folder fastq --ref_folder ref_genome --gtf ref.gtf --bed ref.bed --hisat2 --hisat2_idx genome_tran 
 ```
+Note that parameter '--hisat2_idx' is the prefix of the index files, not the entire path to .ht2 files. 
+
 ### Enable reads trimming at splice junctions
 To use the reads trimming at splice junctions step, you must add the ***--sjtrim* option**, specify the path to the GenomeAnalysisTK jar file, as well as satisfy the requirements mentioned above. For example:
 ```bash
-nextflow run iarcbioinfo/RNAseq-nf --input_folder fastq --gendir ref_genome --suffix1 _1 --suffix2 _2 --sjtrim --GATK_folder /home/user/GATK 
+nextflow run iarcbioinfo/RNAseq-nf --input_folder fastq --ref_folder ref_genome --gtf ref.gtf --bed ref.bed --sjtrim --GATK_jar /home/user/GATK/GenomeAnalysisTK.jar
 ```
 
 ### Enable Base Quality Score Recalibration
 To use the base quality score recalibration step, you must add the ***--recalibration* option**, specify the path to the GenomeAnalysisTK jar file, the path to the GATK bundle folder for your reference genome, specify the path to the bed file with intervals to be considered, as well as satisfy the requirements mentioned above. For example:
 ```bash
-nextflow run iarcbioinfo/RNAseq-nf --input_folder fastq --gendir ref_genome --suffix1 _1 --suffix2 _2 --bqsr --GATK_folder /home/user/GATK --GATK_bundle /home/user/GATKbundle --intervals intervals.bed
+nextflow run iarcbioinfo/RNAseq-nf --input_folder fastq --ref_folder ref_genome --gtf ref.gtf --bed ref.bed --recalibration --GATK_jar /home/user/GATK/GenomeAnalysisTK.jar --GATK_bundle /home/user/GATKbundle
 ```
 
-## All parameters
-| **PARAMETER** | **DEFAULT** | **DESCRIPTION** |
-|-----------|--------------:|-------------| 
-| *--help* | null | print usage and optional parameters |
-*--input_folder* | . | input folder |
-*--output_folder* |   . | output folder |
-*--gendir* | ref | reference genome folder |
-*--cpu*          | 4 | number of CPUs |
-*--mem*         | 50 | memory for mapping|
-*--memOther*     | 2 | memory for QC and counting|
-*--fastq_ext*    | fq.gz | extension of fastq files|
-*--suffix1*      | \_1 | suffix for second element of read files pair|
-*--suffix2*      | \_2 | suffix for second element of read files pair|
-*--output_folder*   | . | output folder for aligned BAMs|
-*--annot_gtf*   |  Homo_sapiens.GRCh38.79.gtf | annotation GTF file |
-*--annot_gff*   |  Homo_sapiens.GRCh38.79.gff | annotation GFF file |
-*--fasta_ref* |    ref.fa | reference genome fasta file for GATK |
-*--GATK_folder* |  GATK | folder with jar file GenomeAnalysisTK.jar |
-*--GATK_bundle* |  GATK_bundle | folder with files for BQSR |
-*--intervals*   |  intervals.bed | bed file with intervals for BQSR | 
-*--RG*          |  PL:ILLUMINA | string to be added to read group information in BAM file |
-*--sjtrim*      |  false | enable reads trimming at splice junctions | 
-*--bqsr*        |  false | enable base quality score recalibration |
-*--gene_bed*   |  gene.bed | bed file with genes for RESeQC | 
-*--stranded*   |  no | Strand information for counting with htseq [no, yes, reverse] | 
-*--stranded*   |  no | Strand information for counting with htseq [no, yes, reverse] | 
-*--hisat2*   |  false | use hisat2 instead of STAR for mapping | 
-*--hisat2_idx*   |  genome_tran | index filename prefix for hisat2 | 
+### Perform unsupervised analysis
+To use the unsupervised analysis step, you must add the ***--clustering* option**, and satisfy the requirements mentioned above. For example:
+```bash
+nextflow run iarcbioinfo/RNAseq-nf --input_folder fastq --ref_folder ref_genome --gtf ref.gtf --bed ref.bed --clustering
+```
+You can also specify options n, t, c, and l (see [*RNAseq_unsupervised.R*](https://github.com/IARCbioinfo/RNAseq_analysis_scripts)) of script RNAseq_unsupervised.R using options '--clustering_n', '--clustering_t', '--clustering_c', and '--clustering_l'.
+
+
+## Output 
+  | Type      | Description     |
+  |-----------|---------------|
+  | file.bam    | BAM files of alignments or realignments |
+  | file.bam.bai    | BAI files of alignments or realignments |
+  | file_{12}.fq.gz_trimming_report.txt | trim_galore report | 
+  |multiqc_pretrim_report.html  | multiqc report before trimming | 
+  |multiqc_pretrim_report_data            | folder with data used to compute multiqc report before trimming |
+  |multiqc_posttrim_report.html      |     multiqc report after trimming | 
+  |multiqc_posttrim_report_data      |  folder with data used to compute multiqc report after trimming |
+  |STAR.file.Log.final.out| STAR log |
+  |file_readdist.txt                | RSeQC report |
+  |file_count.txt                   | htseq-count output file  |
+  | file_target_intervals.list    | list of intervals used  |
+  | file_recal.table | table of scores before recalibration   |
+  | file_post_recal.table   | table of scores after recalibration |
+  | file_recalibration_plots.pdf   |  before/after recalibration plots   |
+          
+
+## Directed Acyclic Graph
+
+### With default options
+[![DAG STAR](dag_STAR.png)](http://htmlpreview.github.io/?https://github.com/IARCbioinfo/RNAseq-nf/blob/dev/dag_STAR.html)
+
+### With option --hisat2
+[![DAG hisat2](dag_hisat2.png)](http://htmlpreview.github.io/?https://github.com/IARCbioinfo/RNAseq-nf/blob/dev/dag_hisat2.html)
+
+### With options --sjtrim and --recalibration
+[![DAG STAR_sjtrim_recal](dag_STAR_sjtrim_recal.png)](http://htmlpreview.github.io/?https://github.com/IARCbioinfo/RNAseq-nf/blob/dev/dag_STAR_sjtrim_recal.html)
+
+## Contributions
+
+  | Name      | Email | Description     |
+  |-----------|---------------|-----------------| 
+  | Nicolas Alcala*    | [email protected]    | Developer to contact for support |
+  | Noemie Leblay | [email protected] | Tester |
+  | Alexis Robitaille | [email protected] | Tester |
+
+