diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 0e7fa487..bb2bc110 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -1,3 +1,6 @@ +v0.2.0 (2014-03-29) +- Switch to DESeq +- Renaming from TRAPL to READemption (yes, another name change) v0.1.9 (2014-02-17) - Fix isssue with path for paired-end reads - Fix lib name sorting for gene quanti overview diff --git a/Makefile b/Makefile index 49cd1b02..92061566 100644 --- a/Makefile +++ b/Makefile @@ -27,4 +27,4 @@ readme_clean: rm -f README.tex README.html README.rst pylint: - pylint bin/trapl trapllib/* tests/* + pylint bin/reademption reademptionlib/* tests/* diff --git a/README.md b/README.md index 9645c9a8..988c115c 100644 --- a/README.md +++ b/README.md @@ -1,26 +1,25 @@ About ----- -The RNA-Seq Analysis PipeLine (TRAPL) is - as the name implies - a -pipeline for the computational evaluation of RNA-Seq data. It was -originally developed at the IMIB/ZINF to process dRNA-Seq reads (as -introduced by Sharma et al., Nature, 2010 originating from bacterial -samples. Meanwhile is has been extended to process data generated in -different experimental setups and originating from all domains of life -and is under active development. The subcommands which are provided -by command-line interface cover read processing and aligning, coverage -plot generation, gene expression quantification as well as -differential gene expression analysis. TRAPL was applied to analyze -numerous data sets. In order to set up analyses quickly TRAPL follows -the principal of "convention over configuration": Once the input files -are copied into defined folders no further parameters have to be -given. Still, TRAPL's behavior can be adapted to specific needs of the -user. +READemption is a pipeline for the computational evaluation of RNA-Seq +data. It was originally developed at the IMIB/ZINF to process dRNA-Seq +reads (as introduced by Sharma et al., Nature, 2010 originating from +bacterial samples. 
Meanwhile it has been extended to process data +generated in different experimental setups and originating from all +domains of life and is under active development. The subcommands which +are provided by command-line interface cover read processing and +aligning, coverage plot generation, gene expression quantification as +well as differential gene expression analysis. READemption was applied +to analyze numerous data sets. In order to set up analyses quickly +READemption follows the principle of "convention over configuration": +Once the input files are copied into defined folders no further +parameters have to be given. Still, READemption's behavior can be +adapted to specific needs of the user. License ------- -[ICS](https://en.wikipedia.org/wiki/ISC_license) - see LICENSE.txt +[ISCL](https://en.wikipedia.org/wiki/ISC_license) - see LICENSE.txt Development ----------- diff --git a/bin/trapl b/bin/reademption similarity index 98% rename from bin/trapl rename to bin/reademption index 612b0529..e8b2b0fa 100755 --- a/bin/trapl +++ b/bin/reademption @@ -1,15 +1,15 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """RAPL - A RNA-seq Analysis PipeLine""" import argparse -from trapllib.controller import Controller +from reademptionlib.controller import Controller __author__ = "Konrad Foerstner " __copyright__ = "2011-2013 by Konrad Foerstner " __license__ = "ISC license" __email__ = "konrad@foerstner.org" -__version__ = "0.1.9" +__version__ = "0.2.0" def main(): parser = argparse.ArgumentParser() @@ -178,7 +178,7 @@ def main(): help="Comma separated list of condition in the same order as " "their corresponding libraries.") deseq_parser.add_argument( - "--no_replicates", "-r", default=False, action="store_true") + "--cooks_cutoff_off", "-k", default=False, action="store_true") deseq_parser.set_defaults(func=run_deseq) # Parameters for viz_align diff --git a/docs/source/conf.py b/docs/source/conf.py index 99de2c7f..1a27e211 100644 --- a/docs/source/conf.py +++
b/docs/source/conf.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# TRAPL documentation build configuration file, created by +# READemption documentation build configuration file, created by # sphinx-quickstart on Thu May 2 08:33:05 2013. # # This file is execfile()d with the current directory set to its containing dir. @@ -40,8 +40,8 @@ master_doc = 'index' # General information about the project. -project = u'TRAPL' -copyright = u'2013, Konrad U. Förstner' +project = u'READemption' +copyright = u'2014, Konrad U. Förstner' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -164,7 +164,7 @@ #html_file_suffix = None # Output file base name for HTML help builder. -htmlhelp_basename = 'TRAPLdoc' +htmlhelp_basename = 'READdemptiondoc' # -- Options for LaTeX output -------------------------------------------------- @@ -183,8 +183,8 @@ # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ - ('index', 'TRAPL.tex', u'TRAPL Documentation', - u'Konrad Förstner', 'manual'), + ('index', 'READemption.tex', u'READemption Documentation', + u'Konrad U. Förstner', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of @@ -213,8 +213,8 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'rapl', u'TRAPL Documentation', - [u'Konrad Förstner'], 1) + ('index', 'rapl', u'READemption Documentation', + [u'Konrad U. Förstner'], 1) ] # If true, show URL addresses after external links. 
@@ -227,8 +227,8 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - ('index', 'TRAPL', u'TRAPL Documentation', - u'Konrad Förstner', 'TRAPL', 'One line description of project.', + ('index', 'READemption', u'READemption Documentation', + u'Konrad U. Förstner', 'READemption', 'One line description of project.', 'Miscellaneous'), ] diff --git a/docs/source/example_analysis.rst b/docs/source/example_analysis.rst index 366128c9..e8254a2c 100644 --- a/docs/source/example_analysis.rst +++ b/docs/source/example_analysis.rst @@ -9,7 +9,7 @@ Generating a project Creating a new project:: - $ trapl create my_rna_seq_analysis + $ reademption create my_rna_seq_analysis Created folder "my_rna_seq_analysis" and required subfolders. Please copy read files into folder "my_rna_seq_analysis/input/reads" and reference sequences files into folder "my_rna_seq_analysis/input/reference_sequences". diff --git a/trapllib/__init__.py b/docs/source/example_analysis.rst: similarity index 100% rename from trapllib/__init__.py rename to docs/source/example_analysis.rst: diff --git a/docs/source/index.rst b/docs/source/index.rst index c00a2458..66027ecd 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,5 +1,5 @@ -TRAPL - The RNA-Seq Analysis PipeLine -************************************* +READemption - A RNA-Seq Analysis Pipeline +***************************************** Table of content ================ @@ -13,51 +13,50 @@ Table of content license versions -TRAPL in a nutshell -=================== +READemption in a nutshell +========================= -*The RNA-Seq Analysis PipeLine* (*TRAPL*) is - as the name implies - a -pipeline for the computational evaluation of RNA-Seq data. It was -originally developed at the `IMIB/ZINF +*READemption* is a pipeline for the computational evaluation of +RNA-Seq data. 
It was originally developed at the `IMIB/ZINF `_ to process dRNA-Seq reads (as introduced by Sharma *et al.*, Nature, 2010 (`Pubmed `_)) originating from bacterial samples. Meanwhile is has been extended to process data generated in different experimental setups and originating from all domains of life and is under `active development -`_. The `subcommands +`_. The `subcommands `_ which are provided by command-line interface cover read processing and aligning, coverage plot generation, gene expression quantification as well as differential gene expression -analysis. TRAPL was applied to analyze numerous data sets. In order to -set up analyses quickly TRAPL follows the principal of *convention +analysis. READemption was applied to analyze numerous data sets. In order to +set up analyses quickly READemption follows the principle of *convention over configuration*: Once the input files are copied into defined -folders no further parameters have to be given. Still, TRAPL's +folders no further parameters have to be given. Still, READemption's behavior can be adapted to specific needs of the user. This tools is -available as open source under the `ICS `_ -open source license. +available as open source under the open source license `ISCL +`_ . Download ======== -TRAPL can be download from `its PyPI page -`_. Please read the +READemption can be downloaded from `its PyPI page +`_. Please read the `installation instructions `_. Source code =========== -The source code of TRAPL can be found at https://github.com/konrad/trapl. +The source code of READemption can be found at https://github.com/konrad/READemption. Cite ==== -If you apply TRALP in you data analysis please cite the following -reference: *The RNA-Seq Analysis PipeLine (TRAPL) – A tool for the -computational analysis of deep-sequencing based transcriptome data*. -Konrad U. Förstner, Jörg Vogel, Cynthia M. Sharma; (in preparation). A -`pre-preprint version `_ of the -manuscript is hosted at bioRxiv.
+If you apply READemption in your data analysis please cite the +following reference: *READemption – A tool for the computational +analysis of deep-sequencing-based transcriptome data*. +Konrad U. Förstner, Jörg Vogel, Cynthia M. Sharma; (submitted). A +`preprint version `_ of +the manuscript is hosted at bioRxiv. Contact ======= diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 69ac24b3..af8c5422 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -4,21 +4,21 @@ Installation Requirements ------------ -TRAPL was developed using Python 3.3 and for best performance the user -is advised to run TRAPL with this version, too. Any other Python 3 -version should work as well. Also Python 2.7 can be used if the -library `futures `_ is -installed. In any case, the third party modules `pysam -`_ as well as `setuptool -`_ and `pip -`_ in order to make the installation -easy by retrieving are required. TRAPL uses the short read mapper -`segemehl `_ for -the mapping and this software needs to be installed. The subcommand +READemption was developed using Python 3.3 and for best performance +the user is advised to run READemption with this or a higher version, +too. Also Python 2.7 can be used if the library `futures +`_ is installed. In any case, +the third party modules `pysam `_ as +well as `setuptools `_ and +`pip `_ in order to make the +installation easy by retrieving are required. READemption uses the +short read mapper `segemehl +`_ for the +mapping and this software needs to be installed. The subcommand `viz_align`, `viz_gene_quanti`, `viz_deseq` require the Python library `Matplotlib `_. `R -`_ and the bioconductor package `DESeq -`_ are +`_ and the bioconductor package `DESeq2 +`_ are necessary for the subcommand `deseq` which performs differential gene expression analysis. Don't worry - in the following the installation of all these requirements will be covered.
@@ -58,10 +58,10 @@ If PIP is not yet install you should get this, too.:: curl https://raw.github.com/pypa/pip/master/contrib/get-pip.py > get-pip.py sudo python3.3 get-pip.py -Now you can use PIP to install pysam and TRAPL:: +Now you can use PIP to install pysam and READemption:: pip-3.3 install pysam - pip-3.3 install trapl + pip-3.3 install READemption Install make and ncurses dev library.:: @@ -91,9 +91,9 @@ and libxml2 which is required for the installation of some R-packages.:: sudo apt-get install libxml2-dev -Install DESeq in :: +Install DESeq2 in :: - echo 'source("http://bioconductor.org/biocLite.R");biocLite("DESeq")' | Rscript - + echo 'source("http://bioconductor.org/biocLite.R");biocLite("DESeq2")' | Rscript - .. diff --git a/docs/source/license.rst b/docs/source/license.rst index e94a3766..145130c8 100644 --- a/docs/source/license.rst +++ b/docs/source/license.rst @@ -1,7 +1,7 @@ License ======= -TRAPL is open source software and available under the ISC license. +READemption is open source software and available under the ISC license. 
Copyright (c) 2011-2014, Konrad Förstner diff --git a/docs/source/subcommands.rst b/docs/source/subcommands.rst index b4a4e62e..f161edc5 100644 --- a/docs/source/subcommands.rst +++ b/docs/source/subcommands.rst @@ -1,5 +1,5 @@ -TRAPL's subcommands -=================== +READemption's subcommands +========================= create ------ diff --git a/docs/source/versions.rst b/docs/source/versions.rst index ae9a4904..fc8f3bb5 100644 --- a/docs/source/versions.rst +++ b/docs/source/versions.rst @@ -1,2 +1,4 @@ Versions/Change log =================== + +See https://github.com/konrad/READemption/blob/master/CHANGELOG.txt diff --git a/reademptionlib/__init__.py b/reademptionlib/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/trapllib/bammerger.py b/reademptionlib/bammerger.py similarity index 100% rename from trapllib/bammerger.py rename to reademptionlib/bammerger.py diff --git a/trapllib/controller.py b/reademptionlib/controller.py similarity index 96% rename from trapllib/controller.py rename to reademptionlib/controller.py index 722cfd53..6b4c0edd 100644 --- a/trapllib/controller.py +++ b/reademptionlib/controller.py @@ -3,21 +3,21 @@ import sys import json import pysam -from trapllib.bammerger import BamMerger -from trapllib.coveragecalculator import CoverageCalculator -from trapllib.deseq import DESeqRunner -from trapllib.fasta import FastaParser -from trapllib.genewisequanti import GeneWiseQuantification, GeneWiseOverview -from trapllib.paths import Paths -from trapllib.projectcreator import ProjectCreator -from trapllib.rawstatdata import RawStatDataWriter, RawStatDataReader -from trapllib.readaligner import ReadAligner -from trapllib.readalignerstats import ReadAlignerStats -from trapllib.readrealigner import ReadRealigner -from trapllib.readalignerstatstable import ReadAlignerStatsTable -from trapllib.readprocessor import ReadProcessor -from trapllib.sambamconverter import SamToBamConverter -from trapllib.wiggle import WiggleWriter +from 
reademptionlib.bammerger import BamMerger +from reademptionlib.coveragecalculator import CoverageCalculator +from reademptionlib.deseq import DESeqRunner +from reademptionlib.fasta import FastaParser +from reademptionlib.genewisequanti import GeneWiseQuantification, GeneWiseOverview +from reademptionlib.paths import Paths +from reademptionlib.projectcreator import ProjectCreator +from reademptionlib.rawstatdata import RawStatDataWriter, RawStatDataReader +from reademptionlib.readaligner import ReadAligner +from reademptionlib.readalignerstats import ReadAlignerStats +from reademptionlib.readrealigner import ReadRealigner +from reademptionlib.readalignerstatstable import ReadAlignerStatsTable +from reademptionlib.readprocessor import ReadProcessor +from reademptionlib.sambamconverter import SamToBamConverter +from reademptionlib.wiggle import WiggleWriter class Controller(object): @@ -627,7 +627,7 @@ def compare_with_deseq(self): self._paths.gene_wise_quanti_combined_path, self._paths.deseq_tmp_session_info_script, self._paths.deseq_session_info, - no_replicates=self._args.no_replicates) + self._args.cooks_cutoff_off) deseq_runner.create_deseq_script_file() deseq_runner.write_session_info_file() deseq_runner.run_deseq() @@ -665,7 +665,7 @@ def _write_err_msg_and_quit(self, msg): def viz_align(self): """Generate plots based on the read processing and mapping""" - from trapllib.vizalign import AlignViz + from reademptionlib.vizalign import AlignViz align_viz = AlignViz( self._paths.get_lib_names_single_end(), self._paths.read_processing_stats_path, @@ -678,7 +678,7 @@ def viz_align(self): def viz_gene_quanti(self): """Generate plots based on the gene-wise read countings""" - from trapllib.vizgenequanti import GeneQuantiViz + from reademptionlib.vizgenequanti import GeneQuantiViz gene_quanti_viz = GeneQuantiViz( self._paths.gene_wise_quanti_combined_path, self._paths.get_lib_names_single_end()) @@ -690,7 +690,7 @@ def viz_gene_quanti(self): def viz_deseq(self): 
"""Generate plots based on the DESeq analysis""" - from trapllib.vizdeseq import DESeqViz + from reademptionlib.vizdeseq import DESeqViz deseq_path_template = ( self._paths.deseq_raw_folder + "/deseq_comp_%s_vs_%s.csv") deseq_viz = DESeqViz(self._paths.deseq_script_path, deseq_path_template) diff --git a/trapllib/coveragecalculator.py b/reademptionlib/coveragecalculator.py similarity index 100% rename from trapllib/coveragecalculator.py rename to reademptionlib/coveragecalculator.py diff --git a/trapllib/deseq.py b/reademptionlib/deseq.py similarity index 75% rename from trapllib/deseq.py rename to reademptionlib/deseq.py index 6ac94367..ed91ce7e 100644 --- a/trapllib/deseq.py +++ b/reademptionlib/deseq.py @@ -9,7 +9,7 @@ def __init__( self, libs, conditions, deseq_raw_folder, deseq_extended_folder, deseq_script_path, gene_wise_quanti_combined_path, deseq_tmp_session_info_script, deseq_session_info, - no_replicates=False): + cooks_cutoff_off=False): self._libs = libs self._conditions = conditions self._deseq_raw_folder = deseq_raw_folder @@ -18,12 +18,12 @@ def __init__( self._gene_wise_quanti_combined_path = gene_wise_quanti_combined_path self._deseq_tmp_session_info_script = deseq_tmp_session_info_script self._deseq_session_info = deseq_session_info - self._no_replicastes = no_replicates + self._cooks_cutoff_off = cooks_cutoff_off self._first_data_column = 11 def write_session_info_file(self): with open(self._deseq_tmp_session_info_script, "w") as tmp_r_script_fh: - tmp_r_script_fh.write("library('DESeq')\nsessionInfo()\n") + tmp_r_script_fh.write("library('DESeq2')\nsessionInfo()\n") with open(self._deseq_session_info, "w") as session_info_fh: with open(os.devnull, "w") as devnull: call(["Rscript", self._deseq_tmp_session_info_script,], @@ -37,14 +37,12 @@ def create_deseq_script_file(self): head_row = open( self._gene_wise_quanti_combined_path).readline()[:-1].split("\t") libs = head_row[self._first_data_column-1:] + libs_str = ",".join(["'%s'" % lib for lib in 
libs]) conditions = [libs_to_conditions[lib] for lib in libs] condition_str = ", ".join(["'%s'" % cond for cond in conditions]) - dispersion_params = "" - if self._no_replicastes is True: - dispersion_params = ", method='blind', sharingMode='fit-only'" file_content = self._deseq_script_template() % ( self._gene_wise_quanti_combined_path, self._first_data_column, - condition_str, dispersion_params) + libs_str, condition_str) file_content += self._comparison_call_strings(conditions) deseq_fh = open(self._deseq_script_path, "w") deseq_fh.write(file_content) @@ -71,13 +69,15 @@ def merge_counting_files_with_results(self): csv.reader(open( self._gene_wise_quanti_combined_path), delimiter="\t"), csv.reader(deseq_result_fh, delimiter="\t")): - if comparison_file_row[0] == "id": + if comparison_file_row[0] == "baseMean": + # Add another column to the header comparison_file_row = [""] + comparison_file_row - # Add condition name to the headline - comparison_file_row[3] += " (%s)" % combo[0] - comparison_file_row[4] += " (%s)" % combo[1] + # Extend column description + counting_file_row[self._first_data_column:] = [ + "%s raw countings" % lib_name for lib_name in + counting_file_row[self._first_data_column:]] output_fh.write("\t".join( - counting_file_row + comparison_file_row[2:]) + "\n") + counting_file_row + comparison_file_row[1:]) + "\n") output_fh.close() def _condition_combos(self, conditions): @@ -90,9 +90,14 @@ def _comparison_call_strings(self, conditions): call_string = "" condition_combos = self._condition_combos(conditions) self._comparison_files_and_combos = [] + cooks_cutoff_str = "" + if self._cooks_cutoff_off: + cooks_cutoff_str = ", cooksCutoff=FALSE" for index, condition_combo in enumerate(condition_combos): - call_string += "comp%s <- nbinomTest(cds, '%s', '%s')\n" % ( - index, condition_combo[0], condition_combo[1]) + call_string += ("comp%s <- results(dds, contrast=" + "c('condition','%s', '%s')%s)\n" % ( + index, condition_combo[0], condition_combo[1], 
+ cooks_cutoff_str)) comparison_file = "deseq_comp_%s_vs_%s.csv" % ( condition_combo[0], condition_combo[1]) self._comparison_files_and_combos.append( @@ -105,11 +110,14 @@ def _comparison_call_strings(self, conditions): def _deseq_script_template(self): return ( - "library('DESeq')\n" + "library('DESeq2')\n" "rawCountTable <- read.table('%s', skip=1, sep='\\t', " "quote='', comment.char='')\n" - "countTable <- round(rawCountTable[,%s:length(names(rawCountTable))])\n" + "countTable <- round(rawCountTable[,%s:length(names(" + "rawCountTable))])\n" + "libs <- c(%s)\n" "conds <- c(%s)\n" - "cds <- newCountDataSet(countTable, conds)\n" - "cds <- estimateSizeFactors(cds)\n" - "cds <- estimateDispersions(cds%s)\n") + "samples <- data.frame(row.names=libs, condition=conds)\n" + "dds <- DESeqDataSetFromMatrix(countData=countTable, " + "colData=samples, design=~condition)\n" + "dds <- DESeq(dds)\n") diff --git a/trapllib/fasta.py b/reademptionlib/fasta.py similarity index 100% rename from trapllib/fasta.py rename to reademptionlib/fasta.py diff --git a/trapllib/genewisequanti.py b/reademptionlib/genewisequanti.py similarity index 99% rename from trapllib/genewisequanti.py rename to reademptionlib/genewisequanti.py index 8e7c3520..499d7187 100644 --- a/trapllib/genewisequanti.py +++ b/reademptionlib/genewisequanti.py @@ -1,6 +1,6 @@ import csv import os.path -from trapllib.gff3 import Gff3Parser +from reademptionlib.gff3 import Gff3Parser import pysam class GeneWiseQuantification(object): diff --git a/trapllib/gff3.py b/reademptionlib/gff3.py similarity index 100% rename from trapllib/gff3.py rename to reademptionlib/gff3.py diff --git a/trapllib/lack.py b/reademptionlib/lack.py similarity index 100% rename from trapllib/lack.py rename to reademptionlib/lack.py diff --git a/trapllib/parameterlog.py b/reademptionlib/parameterlog.py similarity index 100% rename from trapllib/parameterlog.py rename to reademptionlib/parameterlog.py diff --git a/trapllib/paths.py 
b/reademptionlib/paths.py similarity index 99% rename from trapllib/paths.py rename to reademptionlib/paths.py index 88138b53..45feadf0 100644 --- a/trapllib/paths.py +++ b/reademptionlib/paths.py @@ -96,7 +96,7 @@ def _set_viz_deseq_folder_names(self): self.viz_deseq_base_folder = ( "%s/viz_deseq" % self.output_folder) self.viz_deseq_scatter_plot_path = ( - "%s/base_mean_scatterplots.pdf" % + "%s/MA_plots.pdf" % self.viz_deseq_base_folder) self.viz_deseq_volcano_plot_path = ( "%s/volcano_plots_log2_fold_change_vs_p-value.pdf" % @@ -127,7 +127,7 @@ def _set_static_files(self): self.deseq_tmp_session_info_script = "%s/tmp.R" % self.deseq_raw_folder self.deseq_session_info = "%s/R_session_info.txt" % ( self.deseq_raw_folder) - self.version_path = "%s/used_trapl_version.txt" % (self.align_report_folder) + self.version_path = "%s/used_reademption_version.txt" % (self.align_report_folder) def _get_sorted_folder_content(self, folder): """Return the sorted file list of a folder""" diff --git a/trapllib/polyaclipper.py b/reademptionlib/polyaclipper.py similarity index 100% rename from trapllib/polyaclipper.py rename to reademptionlib/polyaclipper.py diff --git a/trapllib/projectcreator.py b/reademptionlib/projectcreator.py similarity index 94% rename from trapllib/projectcreator.py rename to reademptionlib/projectcreator.py index 325a0caa..2c0d2a7e 100644 --- a/trapllib/projectcreator.py +++ b/reademptionlib/projectcreator.py @@ -31,4 +31,4 @@ def create_subfolders(self, subfolders): def create_version_file(self, version_file_path, version): with open(version_file_path, "w") as fh: - fh.write("TRAPL version %s" % version) + fh.write("READemption version %s" % version) diff --git a/trapllib/rawstatdata.py b/reademptionlib/rawstatdata.py similarity index 100% rename from trapllib/rawstatdata.py rename to reademptionlib/rawstatdata.py diff --git a/trapllib/readaligner.py b/reademptionlib/readaligner.py similarity index 94% rename from trapllib/readaligner.py rename to 
reademptionlib/readaligner.py index fa8ce141..dc90de51 100644 --- a/trapllib/readaligner.py +++ b/reademptionlib/readaligner.py @@ -1,5 +1,5 @@ import sys -from trapllib.segemehl import Segemehl +from reademptionlib.segemehl import Segemehl class ReadAligner(object): """An abstraction layer for different short read aligners.""" diff --git a/trapllib/readalignerstats.py b/reademptionlib/readalignerstats.py similarity index 99% rename from trapllib/readalignerstats.py rename to reademptionlib/readalignerstats.py index 2c037ca7..1f297e95 100644 --- a/trapllib/readalignerstats.py +++ b/reademptionlib/readalignerstats.py @@ -2,7 +2,7 @@ import sys from collections import defaultdict from functools import reduce -from trapllib.fasta import FastaParser +from reademptionlib.fasta import FastaParser import pysam class ReadAlignerStats(object): diff --git a/trapllib/readalignerstatstable.py b/reademptionlib/readalignerstatstable.py similarity index 100% rename from trapllib/readalignerstatstable.py rename to reademptionlib/readalignerstatstable.py diff --git a/trapllib/readprocessor.py b/reademptionlib/readprocessor.py similarity index 98% rename from trapllib/readprocessor.py rename to reademptionlib/readprocessor.py index d976abb9..d3c5d9b9 100644 --- a/trapllib/readprocessor.py +++ b/reademptionlib/readprocessor.py @@ -1,8 +1,8 @@ import gzip import bz2 from collections import defaultdict -from trapllib.fasta import FastaParser -from trapllib.polyaclipper import PolyAClipper +from reademptionlib.fasta import FastaParser +from reademptionlib.polyaclipper import PolyAClipper class ReadProcessor(object): diff --git a/trapllib/readrealigner.py b/reademptionlib/readrealigner.py similarity index 95% rename from trapllib/readrealigner.py rename to reademptionlib/readrealigner.py index e9981a6e..f463cc4b 100644 --- a/trapllib/readrealigner.py +++ b/reademptionlib/readrealigner.py @@ -1,5 +1,5 @@ import sys -from trapllib.lack import Lack +from reademptionlib.lack import Lack 
class ReadRealigner(object): """An abstraction layer for different short read realigners.""" diff --git a/trapllib/sambamconverter.py b/reademptionlib/sambamconverter.py similarity index 100% rename from trapllib/sambamconverter.py rename to reademptionlib/sambamconverter.py diff --git a/trapllib/segemehl.py b/reademptionlib/segemehl.py similarity index 100% rename from trapllib/segemehl.py rename to reademptionlib/segemehl.py diff --git a/trapllib/vizalign.py b/reademptionlib/vizalign.py similarity index 100% rename from trapllib/vizalign.py rename to reademptionlib/vizalign.py diff --git a/trapllib/vizdeseq.py b/reademptionlib/vizdeseq.py similarity index 59% rename from trapllib/vizdeseq.py rename to reademptionlib/vizdeseq.py index 843ba232..22ab6525 100644 --- a/trapllib/vizdeseq.py +++ b/reademptionlib/vizdeseq.py @@ -11,11 +11,10 @@ def __init__(self, deseq_script_path, deseq_path_template, use_antisene=True): self._deseq_script_path = deseq_script_path self._deseq_path_template = deseq_path_template self._use_antisene = use_antisene - self._basemean_lib_1_column = 3 - self._basemean_lib_2_column = 4 - self._log_2_fold_chance_column = 6 - self._p_value_column = 7 - self._p_adj_value_column = 8 + self._basemean_column = 1 + self._log_2_fold_chance_column = 2 + self._p_value_column = 5 + self._adj_p_value_column = 6 self._p_value_significance_limit = 0.05 self._log_fold_change_limit = 2 self._log_2_fold_chance_limit = 1 @@ -32,34 +31,32 @@ def create_scatter_plots(self, plot_path): def _create_scatter_plots(self, condition_1, condition_2): deseq_path = self._deseq_path_template % (condition_1, condition_2) - (basemean_lib_1, basemean_lib_2, log2_fold_changes, p_values, + (basemeans, log2_fold_changes, p_values, p_adj_values) = self._parse_deseq_file(deseq_path) fig = plt.figure() ax = fig.add_subplot(111) - ax.set_aspect(1) - ax.set_yscale('log') ax.set_xscale('log') - axis_min = 1 - axis_max = max( - [max(counting) - for counting in [basemean_lib_1, 
basemean_lib_2]]) - # Draw lines - plt.plot([axis_min, axis_max], [axis_min, axis_max], - linestyle="solid", color="green", alpha=0.4) - plt.plot([axis_min, axis_max], [self._log_fold_change_limit, axis_max*self._log_fold_change_limit], - linestyle="dashed", color="green", alpha=0.4) - plt.plot([axis_min, axis_max], [1/self._log_fold_change_limit, axis_max*1/self._log_fold_change_limit], - linestyle="dashed", color="green", alpha=0.4) - # Calculate the Pearson correlation coefficient - corr_coeff = np.corrcoef(basemean_lib_1, basemean_lib_2)[0][1] + cleaned_basemeans = [] + cleaned_log2_fold_changes = [] + for basemean, log2_fc in zip(basemeans, log2_fold_changes): + if log2_fc != "NA" and basemean != "NA": + cleaned_basemeans.append(basemean) + cleaned_log2_fold_changes.append(log2_fc) + y_max = max([abs(log2_fc) for log2_fc in cleaned_log2_fold_changes]) + y_min = -1*y_max + x_min = min(cleaned_basemeans) + x_max = max(cleaned_basemeans) + # # Draw lines + plt.plot([x_min, x_max], [0, 0], color="k", alpha=0.2) # Set axis ranges - plt.axis([axis_min, axis_max, - axis_min, axis_max]) - plt.title("%s vs. %s\n(r = %s)" % ( - condition_1, condition_2, corr_coeff)) - plt.plot(basemean_lib_1, basemean_lib_2, "k.", alpha=0.2) - plt.xlabel("Expression %s" % condition_1) - plt.ylabel("Expression %s" % condition_2) + plt.axis([x_min, x_max, y_max, y_min]) + plt.title("%s vs. 
%s" % (condition_1, condition_2)) + sig_basemeans = [] + sig_log2_fold_changes = [] + plt.plot(sig_basemeans, sig_log2_fold_changes, "r.", alpha=0.2) + plt.plot(cleaned_basemeans, cleaned_log2_fold_changes, "k.", alpha=0.2) + plt.xlabel("Mean of normalized counts ") + plt.ylabel("Log2 fold change") self._pp_scatterplots.savefig() def create_volcano_plots(self, volcano_plot_path, volcano_plot_adj_path): @@ -84,14 +81,27 @@ def _extract_condition_names(self): def _create_volcano_plots(self, condition_1, condition_2): deseq_path = self._deseq_path_template % (condition_1, condition_2) - (basemean_lib_1, basemean_lib_2, log2_fold_changes, p_values, - p_adj_values) = self._parse_deseq_file(deseq_path) + (basemean, log2_fold_changes, p_values, + adj_p_values) = self._parse_deseq_file(deseq_path) + cleaned_p_values = [] + cleaned_log2_fold_changes = [] + for p_value, log2_fold_change in zip(p_values, log2_fold_changes): + if p_value != "NA" and log2_fold_change != "NA": + cleaned_p_values.append(p_value) + cleaned_log2_fold_changes.append(log2_fold_change) self._create_volcano_plot( - log2_fold_changes, p_values, condition_1, condition_2, - self._pp_raw) + cleaned_log2_fold_changes, cleaned_p_values, condition_1, + condition_2, self._pp_raw) + cleaned_adj_p_values = [] + cleaned_log2_fold_changes = [] + for p_value, log2_fold_change in zip(adj_p_values, log2_fold_changes): + if p_value != "NA" and log2_fold_change != "NA": + cleaned_adj_p_values.append(p_value) + cleaned_log2_fold_changes.append(log2_fold_change) self._create_volcano_plot( - log2_fold_changes, p_adj_values, condition_1, condition_2, - self._pp_adj, pvalue_string_mod="(adjusted)") + cleaned_log2_fold_changes, cleaned_adj_p_values, + condition_1, condition_2, self._pp_adj, + pvalue_string_mod="(adjusted)") def _create_volcano_plot( self, log2_fold_changes, p_values, condition_1, condition_2, pp, @@ -126,27 +136,20 @@ def _create_volcano_plot( pp.savefig() def _parse_deseq_file(self, deseq_path): - 
basemean_lib_1 = [] - basemean_lib_2 = [] + basemeans = [] log_2_fold_changes = [] - p_value = [] - p_adj_value = [] + p_values = [] + adj_p_values = [] for row in csv.reader(open(deseq_path), delimiter="\t"): - if row[0].startswith("id"): - continue - skip = False - for column_no in [ - self._log_2_fold_chance_column, self._p_value_column, - self._p_adj_value_column]: - if row[column_no] in ["NA", "-Inf", "Inf"]: - skip = True - break - if skip is True: + if row[0].startswith("baseMean"): continue - basemean_lib_1.append(float(row[self._basemean_lib_1_column])) - basemean_lib_2.append(float(row[self._basemean_lib_2_column])) - log_2_fold_changes.append(float(row[self._log_2_fold_chance_column])) - p_value.append(float(row[self._p_value_column])) - p_adj_value.append(float(row[self._p_adj_value_column])) - return (basemean_lib_1, basemean_lib_2, log_2_fold_changes, p_value, - p_adj_value) + basemeans.append(float(row[self._basemean_column])) + for value_list, column_no in ( + (log_2_fold_changes, self._log_2_fold_chance_column), + (p_values, self._p_value_column), + (adj_p_values, self._adj_p_value_column)): + try: + value_list.append(float(row[column_no])) + except ValueError: + value_list.append(row[column_no]) + return basemeans, log_2_fold_changes, p_values, adj_p_values diff --git a/trapllib/vizgenequanti.py b/reademptionlib/vizgenequanti.py similarity index 100% rename from trapllib/vizgenequanti.py rename to reademptionlib/vizgenequanti.py diff --git a/trapllib/wiggle.py b/reademptionlib/wiggle.py similarity index 100% rename from trapllib/wiggle.py rename to reademptionlib/wiggle.py diff --git a/setup.py b/setup.py index 8ae85bc3..9e7bc5da 100644 --- a/setup.py +++ b/setup.py @@ -4,21 +4,22 @@ from distutils.core import setup setup( - name='trapl', - version='0.1.7', - packages=['trapllib', 'tests'], + name='READemption', + version='0.2.0', + packages=['reademptionlib', 'tests'], author='Konrad U. 
Förstner', author_email='konrad@foerstner.org', - description='The RNA-Seq Analysis Pipeline', + description='READemption - A RNA-Seq Analysis Pipeline', url='', install_requires=[ - "pysam >= 0.7.6" + "pysam >= 0.7.7" ], - scripts=['bin/trapl'], + scripts=['bin/reademption'], license='LICENSE.txt', long_description=open('README.txt').read(), classifiers=[ - 'License :: OSI Approved :: ISC License', + 'License :: OSI Approved :: ISC License (ISCL)' 'Programming Language :: Python :: 3', + 'Topic :: Scientific/Engineering :: Bio-Informatics' ] ) diff --git a/tests/readaligner.py b/tests/readaligner.py index 75d5cc0b..d1cbd9b6 100644 --- a/tests/readaligner.py +++ b/tests/readaligner.py @@ -2,7 +2,7 @@ import sys from io import StringIO sys.path.append(".") -from trapllib.readmapper import ReadMapper +from reademptionlib.readmapper import ReadMapper class TestReadMapper(unittest.TestCase): diff --git a/tests/test_controller.py b/tests/test_controller.py index 40037042..5f353a35 100644 --- a/tests/test_controller.py +++ b/tests/test_controller.py @@ -3,7 +3,7 @@ import unittest import shutil sys.path.append(".") -from trapllib.controller import Controller +from reademptionlib.controller import Controller class ArgMock(object): project_path = "a_test_project" diff --git a/tests/test_coveragecalculator.py b/tests/test_coveragecalculator.py index f94b3933..5da7f7c2 100644 --- a/tests/test_coveragecalculator.py +++ b/tests/test_coveragecalculator.py @@ -2,7 +2,7 @@ import os import unittest sys.path.append(".") -from trapllib.coveragecalculator import CoverageCalculator +from reademptionlib.coveragecalculator import CoverageCalculator from io import StringIO import pysam diff --git a/tests/test_fasta.py b/tests/test_fasta.py index b4f7b8bb..bb0619f1 100644 --- a/tests/test_fasta.py +++ b/tests/test_fasta.py @@ -2,7 +2,7 @@ import unittest import sys sys.path.append(".") -from trapllib.fasta import FastaParser +from reademptionlib.fasta import FastaParser class 
TestFastaParser(unittest.TestCase): diff --git a/tests/test_genewisequanti.py b/tests/test_genewisequanti.py index fb0e50b4..0a785dd3 100644 --- a/tests/test_genewisequanti.py +++ b/tests/test_genewisequanti.py @@ -2,7 +2,7 @@ import os import sys sys.path.append(".") -from trapllib.genewisequanti import GeneWiseQuantification +from reademptionlib.genewisequanti import GeneWiseQuantification import pysam class Gff3EntryMoc(object): diff --git a/tests/test_gff3.py b/tests/test_gff3.py index 73eef798..0689e668 100644 --- a/tests/test_gff3.py +++ b/tests/test_gff3.py @@ -2,7 +2,7 @@ import unittest import sys sys.path.append(".") -from trapllib.gff3 import Gff3Parser, Gff3Entry +from reademptionlib.gff3 import Gff3Parser, Gff3Entry class TestGff3Parser(unittest.TestCase): diff --git a/tests/test_paths.py b/tests/test_paths.py index 4531e9a6..6e3b5c68 100644 --- a/tests/test_paths.py +++ b/tests/test_paths.py @@ -3,7 +3,7 @@ import shutil import sys sys.path.append(".") -from trapllib.paths import Paths +from reademptionlib.paths import Paths class TestPaths(unittest.TestCase): diff --git a/tests/test_polyaclipper.py b/tests/test_polyaclipper.py index b975a7be..05c9c6c5 100644 --- a/tests/test_polyaclipper.py +++ b/tests/test_polyaclipper.py @@ -1,7 +1,7 @@ import sys import unittest sys.path.append(".") -from trapllib.polyaclipper import PolyAClipper +from reademptionlib.polyaclipper import PolyAClipper class TestPolyAClipper(unittest.TestCase): diff --git a/tests/test_projectcreator.py b/tests/test_projectcreator.py index 100c6c31..82ae4339 100644 --- a/tests/test_projectcreator.py +++ b/tests/test_projectcreator.py @@ -3,7 +3,7 @@ import sys import shutil sys.path.append(".") -from trapllib.projectcreator import ProjectCreator +from reademptionlib.projectcreator import ProjectCreator class TestProjectCreator(unittest.TestCase): diff --git a/tests/test_segemehl.py b/tests/test_segemehl.py index d5148610..107b9f36 100644 --- a/tests/test_segemehl.py +++ 
b/tests/test_segemehl.py @@ -3,7 +3,7 @@ import sys import unittest sys.path.append(".") -from trapllib.segemehl import Segemehl +from reademptionlib.segemehl import Segemehl class TestSegemehl(unittest.TestCase): """Provide general functionalities for tha actuall testing classes."""