Merge branch 'release-0.3.1'

foerstner-lab · Jul 5, 2014 · bbac9ed · bbac9ed
2 parents 773f068 + 957223b
commit bbac9ed
Show file tree

Hide file tree

Showing 16 changed files with 263 additions and 66 deletions.
diff --git a/CHANGELOG.txt b/CHANGELOG.txt
@@ -1,3 +1,9 @@
+v0.3.1 (2014-07-05)
+- Add live / installation image (thanks anonymous reviewer #1)
+- Add support for Fastq read files (thanks anonymous reviewer #3)
+- Extend CLI documentation (thanks reviewer Matt MacManes)
+- Fix/extend online documentation (thanks for help re. OS X to Lei Li 
+  and the feedback to Petya Zhelyazkova)
 v0.3.0 (2014-05-18)
 - Improve Makefile
 - Update docs

diff --git a/Makefile b/Makefile
@@ -54,8 +54,8 @@ new_release:
 	@echo "* Test doc creation"
 	@echo "* make package_to_pypi"
 	@echo "* git add CHANGELOG.txt bin/reademption docs/source/conf.py setup.py"
-	@echo "* Commit changes e.g. 'git commit -m \"Set version to 0.2.X\"'"
-	@echo "* Tag the commit e.g. 'git tag -a v0.2.X -m \"version v0.2.X\"'"
+	@echo "* Commit changes e.g. 'git commit -m \"Set version to 0.3.X\"'"
+	@echo "* Tag the commit e.g. 'git tag -a v0.3.X -m \"version v0.3.X\"'"
 	@echo "* Merge release into dev and master"
 	@echo "* Push it to github: git push"
 	@echo "* Generate a new release based on this tag at"

diff --git a/bin/reademption b/bin/reademption
@@ -9,7 +9,7 @@ __author__ = "Konrad Foerstner <[email protected]>"
 __copyright__ = "2011-2014 by Konrad Foerstner <[email protected]>"
 __license__ = "ISC license"
 __email__ = "[email protected]"
-__version__ = "0.3.0"
+__version__ = "0.3.1"
 
 def main():
     parser = argparse.ArgumentParser()
@@ -34,10 +34,11 @@ def main():
         "directory is used.")
     read_aligning_parser.add_argument(
         "--min_read_length", "-l", default=12, type=int,
-        help="Minimal read length after clipping.")
+        help="Minimal read length after clipping (default 12). Should be "
+	"higher for eukaryotic species.")
     read_aligning_parser.add_argument(
         "--processes", "-p", default=1, type=int,
-        help="Number of processes that should be used.")
+        help="Number of processes that should be used (default 1).")
     read_aligning_parser.add_argument(
         "--segemehl_accuracy", "-a", default=95.0, type=float,
         help="Segemehl's minimal accuracy (in %%) (default 95).")
@@ -71,6 +72,9 @@ def main():
     read_aligning_parser.add_argument(
         "--lack_bin", "-L", default="lack.x",
         help="Lack's binary path (default 'lack.x').")
+    read_aligning_parser.add_argument(
+        "--fastq", "-q", default=False, action="store_true",
+        help="Input reads are in FASTQ not FASTA format.")
     read_aligning_parser.add_argument(
         "--check_for_existing_files", "-f", default=False,
         action="store_true", help="Check for existing files (e.g. from a "
@@ -108,7 +112,7 @@ def main():
         "calculation.")
     coverage_creation_parser.add_argument(
         "--processes", "-p", default=1, type=int,
-        help="Number of processes that should be used.")
+        help="Number of processes that should be used (default 1).")
     coverage_creation_parser.add_argument(
         "--skip_read_count_splitting", "-s", default=False,
         action="store_true", help="Do not split the read counting between "
@@ -151,7 +155,7 @@ def main():
         "and anti-sense overlaps are counted and separately reported.")
     gene_wise_quanti_parser.add_argument(
         "--processes", "-p", default=1, type=int,
-        help="Number of processes that should be used.")
+        help="Number of processes that should be used (default 1).")
     gene_wise_quanti_parser.add_argument(
         "--features", "-t", dest="allowed_features", default=None,
         help="Comma separated list of features that should be considered "

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -50,7 +50,7 @@
 # The short X.Y version.
 version = '0.3'
 # The full version, including alpha/beta/rc tags.
-release = '0.3.0'
+release = '0.3.1'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/docs/source/example_analysis.rst b/docs/source/example_analysis.rst
@@ -103,7 +103,10 @@ Finally, we need the reads of the RNA-Seq libraries. To save some time
 for running this examples we will work with subsampled libraries of 1M
 reads each. This will the limit informative value of the results which
 is acceptable as we just want to understand the workflow of the
-READemption.
+READemption. Please be aware that READemption does not perform quality
+trimming or adapter clipping so far. For this purpose use the `FASTX
+toolkit <http://hannonlab.cshl.edu/fastx_toolkit/>`_, `cutadapt
+<https://code.google.com/p/cutadapt/>`_ or other tools.
 
 ::
 

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -8,6 +8,7 @@ Table of content
 
    index
    installation
+   live_and_installation_image      
    subcommands      
    example_analysis
    troubleshooting

diff --git a/docs/source/installation.rst b/docs/source/installation.rst
@@ -1,13 +1,14 @@
-Installation
-============
+Installation and updating
+=========================
 
 Requirements
 ------------
 
-READemption was developed using Python 3.3 and the user is advised to
-run READemption with this or a higher version. In any case, the third
-party packages `pysam <https://code.google.com/p/pysam>`_ as well as
-`setuptools <https://pypi.python.org/pypi/setuptools>`_ and `pip
+READemption was started to be developed using Python 3.2 and the user
+is advised to run READemption with this or a higher version. In any
+case, the third party packages `pysam
+<https://code.google.com/p/pysam>`_ as well as `setuptools
+<https://pypi.python.org/pypi/setuptools>`_ and `pip
 <http://www.pip-installer.org>`_ should be available on the system in
 order to make the installation easy. READemption uses the short read
 mapper `segemehl
@@ -21,8 +22,8 @@ necessary for the subcommand ``deseq`` which performs differential
 gene expression analysis. Don't worry - in the following the
 installation of all these requirements will be covered.
 
-Installing on a fresh Ubuntu installation
------------------------------------------
+Installing on a fresh Ubuntu system
+-----------------------------------
 
 The following installation procedure was tested on a `Amazon AWS
 t1.micro
@@ -53,22 +54,22 @@ Some comments:
 
 ::
 
-  curl http://www.bioinf.uni-leipzig.de/Software/segemehl/segemehl_0_1_7.tar.gz > segemehl_0_1_7.tar.gz
-  tar xzf segemehl_0_1_7.tar.gz
+  curl http://www.bioinf.uni-leipzig.de/Software/segemehl/segemehl_0_1_9.tar.gz > segemehl_0_1_9.tar.gz
+  tar xzf segemehl_0_1_9.tar.gz
   cd segemehl_*/segemehl/ && make && cd ../../
 
 Copying the executable to a location that is part of the ``PATH`` e.g
 ``/usr/bin/`` ...
 
 ::
 
-  sudo cp segemehl_0_1_7/segemehl/segemehl.x /usr/bin/segemehl.x
-  sudo cp segemehl_0_1_7/segemehl/lack.x /usr/bin/lack.x
+  sudo cp segemehl_0_1_9/segemehl/segemehl.x /usr/bin/segemehl.x
+  sudo cp segemehl_0_1_9/segemehl/lack.x /usr/bin/lack.x
 
 ... or the bin folder of your home directory::
 
   mkdir ~/bin
-  cp segemehl_0_1_7/segemehl/segemehl.x ~/bin
+  cp segemehl_0_1_9/segemehl/segemehl.x ~/bin
 
 3. Install DESeq2
 ~~~~~~~~~~~~~~~~~
@@ -82,7 +83,7 @@ and install the DESeq2 package inside of the interactive command line
 interface. You might be asked to confirm the installation path::
 
   source("http://bioconductor.org/biocLite.R")
-  biocLite("DESeq2").
+  biocLite("DESeq2")
 
 Leave ``R``::
 
@@ -101,12 +102,49 @@ Voilà! You should now be able to call READemption::
 
   reademption -h
 
-..
-.. Global installation
-.. -------------------
-.. 
-.. Installation in the home directory of the user
-.. ----------------------------------------------
-.. 
-.. Installation in a pyvenv
-.. ----------------------
+
+Installing on a Apple OS X
+--------------------------
+
+(Many thanks to Lei Li for contribution this part!)
+
+1. Installing all required software/packages
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To download and install Python 3 follow the instruction at this
+`download page <https://www.python.org/downloads/>`_.
+
+Download and install `xcode <https://developer.apple.com/xcode/>`_ (`download page <https://developer.apple.com/xcode/downloads/>`_) and R
+(download links are on `the frontpage <http://www.r-project.org/>`_).
+
+To install ``pip`` open a terminal and run
+
+::
+
+  curl -O https://bitbucket.org/pypa/setuptools/raw/bootstrap/ez_setup.py python3 ez_setup.py # download and install pip 
+  curl -O https://raw.github.com/pypa/pip/master/contrib/get-pip.py 
+  python3 get-pip.py
+
+Install ``matplotlib``:
+
+::
+
+  pip3 install python3-matplotlib
+
+
+1. Installing segemehl, DESeq, pysam and READemption
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The remaining installation steps are the same as descibed above. Just
+open a terminal and run the commands.
+
+
+Updating READemption
+--------------------
+
+Once you have installed READemption as described above you can easily
+update it to the newest version by running
+
+::
+
+  pip3 install READemption -U
diff --git a/docs/source/live_and_installation_image.rst b/docs/source/live_and_installation_image.rst
@@ -0,0 +1,19 @@
+Live system and installation image
+==================================
+
+For users who do not feel comfortable with the installation of
+READemption, who want to run it under Windows or who just want to test
+it without installation we offer a Ubuntu 14.4 based live system /
+installation image (~ 1.2 GB large) which can be `downloaded here
+<http://osimages.imib-zinf.net>`_. Once you have retrieved the image
+you can either use it to install or test the system on a physical
+machine or use it in a virtual machine (e.g. using the open source
+software `VirtualBox <https://www.virtualbox.org/>`_). The required
+installation/setup steps are the `same as for the official Ubuntu
+version
+<http://www.ubuntu.com/download/desktop/install-ubuntu-desktop>`_. The
+only difference is that once you have a running system READemption can
+be used out of the box without any further setup. A very detailed
+description of the setup of a virtual machine with Ubuntu under
+Windows can be found `here
+<http://www.wikihow.com/Install-Ubuntu-on-VirtualBox>`_.
diff --git a/docs/source/subcommands.rst b/docs/source/subcommands.rst
@@ -31,20 +31,25 @@ annotation files in GFF3 format have to be put into
 align
 -----
 
-``align`` performs the clipping and size filtering of the reads, as well
-as the actual aligning to the reference sequences. It also generates
-statistics about the steps (e.g. number of aligned reads, number of
-mappings). As the result of this steps are needed by the other
-subcommands it has to be run before the others. It requires reads in
-FASTA format (or counterparts compressed with ``gzip`` or ``bzip2``)
-and reference sequences in FASTA format. ``align`` generates the read
-alignments in BAM format (``*.bam``) and also index files for those
-(``*.bam.bai``). Is also stores unmapped reads so that they can be
-inspected e.g. to search for contaminations. The file
+``align`` performs the clipping and size filtering of the reads, as
+well as the actual aligning to the reference sequences. It also
+generates statistics about the steps (e.g. number of aligned reads,
+number of mappings). As the result of this steps are needed by the
+other subcommands it has to be run before the others. It requires
+reads in FASTA or FASTQ format (or counterparts compressed with
+``gzip`` or ``bzip2``) and reference sequences in FASTA
+format. ``align`` generates the read alignments in BAM format
+(``*.bam``) and also index files for those (``*.bam.bai``). Is also
+stores unmapped reads so that they can be inspected e.g. to search for
+contaminations. The file
 ``output/align/reports_and_stats/read_alignment_stats.csv`` lists
 several mapping statistics. The folder
-``output/align/reports_and_stats/stats_data_json/`` contains files with
-the original countings in JSON format.
+``output/align/reports_and_stats/stats_data_json/`` contains files
+with the original countings in JSON format. Please be aware that
+READemption does not perform quality trimming or adapter clipping so
+far. For this purpose use the `FASTX toolkit
+<http://hannonlab.cshl.edu/fastx_toolkit/>`_, `cutadapt
+<https://code.google.com/p/cutadapt/>`_ or other tools.
 
 ::
 
@@ -55,19 +60,21 @@ the original countings in JSON format.
                            [--segemehl_bin SEGEMEHL_BIN] [--paired_end]
                            [--split] [--poly_a_clipping] [--realign]
                            [--keep_original_alignments] [--lack_bin LACK_BIN]
-                           [--check_for_existing_files] [--progress]
+                           [--fastq] [--check_for_existing_files] [--progress]
+                           [--crossalign_cleaning CROSSALIGN_CLEANING_STRING]
                            [project_path]
-
+  
   positional arguments:
     project_path          Path of the project folder. If none is given the
                           current directory is used.
-
+  
   optional arguments:
     -h, --help            show this help message and exit
     --min_read_length MIN_READ_LENGTH, -l MIN_READ_LENGTH
-                          Minimal read length after clipping.
+                          Minimal read length after clipping (default 12).
+                          Should be higher for eukaryotic species.
     --processes PROCESSES, -p PROCESSES
-                          Number of processes that should be used.
+                          Number of processes that should be used (default 1).
     --segemehl_accuracy SEGEMEHL_ACCURACY, -a SEGEMEHL_ACCURACY
                           Segemehl's minimal accuracy (in %) (default 95).
     --segemehl_evalue SEGEMEHL_EVALUE, -e SEGEMEHL_EVALUE
@@ -91,6 +98,7 @@ the original countings in JSON format.
                           (lack) after merging.
     --lack_bin LACK_BIN, -L LACK_BIN
                           Lack's binary path (default 'lack.x').
+    --fastq, -q           Input reads are in FASTQ not FASTA format.
     --check_for_existing_files, -f
                           Check for existing files (e.g. from a interrupted
                           previous run) and do not overwrite them if they exits.
@@ -104,7 +112,6 @@ the original countings in JSON format.
                           org_1_repl1>,<org_1_repl2>,..,<org_1_repl_n>;<ORG_NAME
                           _2>:<org_2_repl1>,<org_2_repl2>,..,<org_2_repl_n>'
 
-
 coverage
 --------
 
@@ -159,7 +166,7 @@ positions. To turn off this behavior use
                           number of aligned reads even if only uniquely aligned
                           reads are used for the coverage calculation.
     --processes PROCESSES, -p PROCESSES
-                          Number of processes that should be used.
+                          Number of processes that should be used (default 1).
     --skip_read_count_splitting, -s
                           Do not split the read counting between different
                           alignings. Default is to do the splitting.
@@ -216,7 +223,7 @@ overlaps are counted and separately listed.
                           sense and anti-sense overlaps are counted and
                           separately reported.
     --processes PROCESSES, -p PROCESSES
-                          Number of processes that should be used.
+                          Number of processes that should be used (default 1).
     --features ALLOWED_FEATURES, -t ALLOWED_FEATURES
                           Comma separated list of features that should be
                           considered (e.g. gene, cds, region, exon). Other

diff --git a/reademptionlib/controller.py b/reademptionlib/controller.py
@@ -53,8 +53,6 @@ def create_project(self, version):
         project_creator.create_root_folder(self._args.project_path)
         project_creator.create_subfolders(self._paths.required_folders())
         project_creator.create_version_file(self._paths.version_path, version)
-        sys.stdout.write("Created folder \"%s\" and required subfolders.\n" % (
-            self._args.project_path))
         sys.stdout.write("Created folder \"%s\" and required subfolders.\n" % (
                 self._args.project_path))
         sys.stdout.write("Please copy read files into folder \"%s\" and "
@@ -253,8 +251,9 @@ def _prepare_reads_single_end(self):
                     continue
                 read_processor = ReadProcessor(
                     poly_a_clipping=self._args.poly_a_clipping,
-                    min_read_length=self._args.min_read_length)
-                read_files_and_jobs[lib_name]  = executor.submit(
+                    min_read_length=self._args.min_read_length,
+                    fastq=self._args.fastq)
+                read_files_and_jobs[lib_name] = executor.submit(
                     read_processor.process_single_end, read_path, 
                     processed_read_path)
         self._evaluet_job_and_generate_stat_file(lib_name, read_files_and_jobs)
@@ -272,7 +271,8 @@ def _prepare_reads_paired_end(self):
                         continue
                     read_processor = ReadProcessor(
                         poly_a_clipping=False, 
-                        min_read_length=self._args.min_read_length)
+                        min_read_length=self._args.min_read_length,
+                        fastq=self._args.fastq)
                 read_files_and_jobs[lib_name]  = executor.submit(
                     read_processor.process_paired_end, read_path_pair, 
                     processed_read_path_pair)
@@ -295,7 +295,7 @@ def _evaluet_job_and_generate_stat_file(
             read_files_and_stats, self._paths.read_processing_stats_path)
 
     def _align_single_end_reads(self):
-        """Manage the actual alignement of single end reads."""
+        """Manage the actual alignment of single end reads."""
         read_aligner = ReadAligner(self._args.segemehl_bin, self._args.progress)
         if self._file_needs_to_be_created(self._paths.index_path) is True:
             read_aligner.build_index(