Merge branch 'release_v0.2.8'

foerstner-lab · May 9, 2014 · 0d2b80c · 0d2b80c
2 parents ed1d5d5 + 91d911e
commit 0d2b80c
Show file tree

Hide file tree

Showing 10 changed files with 188 additions and 11 deletions.
diff --git a/CHANGELOG.txt b/CHANGELOG.txt
@@ -1,3 +1,7 @@
+v0.2.8 (2014-05-09)
+- Add removal of crossmapped reads
+- Fix bug in align subcommand (reported by Alexander Mellmann)
+- Adapt description of the DESeq2 installation
 v0.2.7 (2014-05-03)
 - Update docs
 - Improve Makefile

diff --git a/Makefile b/Makefile
@@ -1,5 +1,5 @@
 test:
-	python3.3 tests/test_all.py
+	python3 tests/test_all.py
 
 coverage:
 	coverage3 run tests/test_all.py

diff --git a/bin/reademption b/bin/reademption
@@ -9,7 +9,7 @@ __author__ = "Konrad Foerstner <[email protected]>"
 __copyright__ = "2011-2014 by Konrad Foerstner <[email protected]>"
 __license__ = "ISC license"
 __email__ = "[email protected]"
-__version__ = "0.2.6"
+__version__ = "0.2.8"
 
 def main():
     parser = argparse.ArgumentParser()
@@ -80,6 +80,14 @@ def main():
     read_aligning_parser.add_argument(
         "--progress", "-g", default=False, action="store_true",
         help="Show progress of the segemehl mapping.")
+    read_aligning_parser.add_argument(
+        "--crossalign_cleaning", "-x", default=None,
+        dest="crossalign_cleaning_str", metavar="CROSSALIGN_CLEANING_STRING",
+        help="Remove reads that are cross-mapped to replicons of different "
+        "species. To associated species and replicons give a string in the "
+        "following format: '<ORG_NAME_1>:<org_1_repl1>,<org_1_repl2>,..,"
+        "<org_1_repl_n>;<ORG_NAME_2>:<org_2_repl1>,<org_2_repl2>,..,"
+        "<org_2_repl_n>'")
     read_aligning_parser.set_defaults(func=align_reads)
 
     # Parameters for coverage file building

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -50,7 +50,7 @@
 # The short X.Y version.
 version = '0.2'
 # The full version, including alpha/beta/rc tags.
-release = '0.2.7'
+release = '0.2.8'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/docs/source/installation.rst b/docs/source/installation.rst
@@ -73,9 +73,21 @@ Copying the executable to a location that is part of the ``PATH`` e.g
 3. Install DESeq2
 ~~~~~~~~~~~~~~~~~
 
-::
+Start ``R``::
+
+  R
+
+
+and install the DESeq2 package inside of the interactive command line
+interface. You might be asked to confirm the installation path::
+
+  source("http://bioconductor.org/biocLite.R")
+  biocLite("DESeq2")
+
+Leave ``R``::
+
+  quit(save = "no")
 
-  echo 'source("http://bioconductor.org/biocLite.R");biocLite("DESeq2")' | sudo Rscript -
 
 Install pysam and READemption
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

diff --git a/reademptionlib/controller.py b/reademptionlib/controller.py
@@ -5,6 +5,7 @@
 import pysam
 from reademptionlib.bammerger import BamMerger
 from reademptionlib.coveragecalculator import CoverageCalculator
+from reademptionlib.crossalignfilter import CrossAlignFilter
 from reademptionlib.deseq import DESeqRunner
 from reademptionlib.fasta import FastaParser
 from reademptionlib.genewisequanti import GeneWiseQuantification, GeneWiseOverview
@@ -93,21 +94,81 @@ def align_reads(self):
             self._paths.primary_read_aligner_bam_prefix_paths,
             self._paths.primary_read_aligner_bam_paths)
         self._generate_read_alignment_stats(
-            self._lib_names, 
+            self._lib_names,
             self._paths.primary_read_aligner_bam_paths,
-            self._paths.unaligned_reads_paths, 
+            self._paths.unaligned_reads_paths,
             self._paths.primary_read_aligner_stats_path)
+        final_unaligned_reads_paths = self._paths.unaligned_reads_paths
         if self._args.realign is True:
             self._run_realigner_and_process_alignments()
-        if self._args.realign is True:
             self._merge_bam_files()
+            final_unaligned_reads_paths = (
+                self._paths.realigned_unaligned_reads_paths)
+        if not self._args.crossalign_cleaning_str is None:
+            self._remove_crossaligned_reads()
         self._generate_read_alignment_stats(
-            self._lib_names, 
+            self._lib_names,
             self._paths.read_alignment_bam_paths,
-            self._paths.realigned_unaligned_reads_paths, 
+            final_unaligned_reads_paths,
             self._paths.read_alignments_stats_path)
         self._write_alignment_stat_table()
 
+    def _remove_crossaligned_reads(self):
+        """Remove cross-aligned reads from the final BAM file of each library.
+
+        The species/replicon string given by the user is parsed first.
+        Afterwards one filtering job per library is submitted to a
+        process pool and the collected jobs are handed to
+        self._check_job_completeness.
+        """
+        self._string_to_species_and_sequence_ids()
+        # Per library: final BAM, BAM that keeps the cross-mappings,
+        # temporary cleaned BAM and the list of cross-mapped reads
+        paths_per_lib = zip(
+            self._paths.read_alignment_bam_paths,
+            self._paths.read_alignment_bam_with_crossmappings_paths,
+            self._paths.read_alignment_bam_cross_cleaned_tmp_paths,
+            self._paths.crossmapped_reads_paths)
+        with concurrent.futures.ProcessPoolExecutor(
+                max_workers=self._args.processes) as executor:
+            jobs = [
+                executor.submit(
+                    self._remove_crossaligned_reads_for_lib, *lib_paths)
+                for lib_paths in paths_per_lib]
+        # Evaluate the outcome of the worker processes
+        self._check_job_completeness(jobs)
+
+    def _remove_crossaligned_reads_for_lib(
+            self, bam_path, bam_with_crossmappings_path,
+            bam_cleaned_tmp_path, crossmapped_reads_path):
+        """Filter the cross-aligned reads of one library and swap the files.
+
+        Arguments:
+        bam_path -- final alignment BAM file of the library; afterwards
+                    this path holds the cleaned version
+        bam_with_crossmappings_path -- path the original BAM file (and
+                                       its index) is moved to
+        bam_cleaned_tmp_path -- temporary path the cleaned BAM file is
+                                written to before being moved
+        crossmapped_reads_path -- text file listing the removed reads
+        """
+        # Perform the removal of cross aligned reads
+        cross_align_filter = CrossAlignFilter(
+            bam_path, bam_cleaned_tmp_path, crossmapped_reads_path,
+            self._species_and_sequence_ids)
+        cross_align_filter.determine_crossmapped_reads()
+        cross_align_filter.write_crossmapping_free_bam()
+        # Rename the original mapping file that potentially
+        # contains cross aligned reads
+        os.rename(bam_path, bam_with_crossmappings_path)
+        os.rename(bam_path + ".bai", bam_with_crossmappings_path + ".bai")
+        # Move the cross aligned filtered file to the final mapping
+        # path
+        os.rename(bam_cleaned_tmp_path, bam_path)
+        os.rename(bam_cleaned_tmp_path + ".bai", bam_path + ".bai")
+
+    def _string_to_species_and_sequence_ids(self):
+        """Parse the cross-align cleaning string into a dictionary.
+
+        The string is expected to have the format
+        '<ORG_1>:<repl_1>,<repl_2>;<ORG_2>:<repl_1>,...'. The result is
+        stored in self._species_and_sequence_ids, mapping each organism
+        name to the list of its sequence IDs. Input with fewer than two
+        organisms, or an organism without any sequence ID, is reported
+        via self._write_err_msg_and_quit.
+        """
+        self._species_and_sequence_ids = {}
+        # Drop empty segments (e.g. caused by a trailing ";") so that
+        # they are counted as missing instead of crashing the parsing
+        orgs_and_seq_ids_strs = [
+            segment.strip()
+            for segment in self._args.crossalign_cleaning_str.split(";")
+            if segment.strip()]
+        if len(orgs_and_seq_ids_strs) < 2:
+            self._write_err_msg_and_quit(
+                "Error! Only one organism is defined for the cross align "
+                "removal. This does not make sense.\nYou gave the "
+                "following input:\n%s\n" % self._args.crossalign_cleaning_str)
+        for org_and_seq_ids_str in orgs_and_seq_ids_strs:
+            # partition() never raises: without a ":" the ID part is
+            # simply empty and triggers the error message below
+            org, _, seq_ids_str = org_and_seq_ids_str.partition(":")
+            org = org.strip()
+            # Discard every empty ID, not only the first one
+            seq_ids = [
+                seq_id.strip() for seq_id in seq_ids_str.split(",")
+                if seq_id.strip()]
+            if not seq_ids:
+                self._write_err_msg_and_quit(
+                    "Error! No sequence ID was given for the species '%s'. "
+                    "This does not make sense.\nYou gave the "
+                    "following input:\n%s\n" % (
+                        org, self._args.crossalign_cleaning_str))
+            self._species_and_sequence_ids[org] = seq_ids
+
     def _set_primary_aligner_paths_to_final_paths(self):
         # If no remapping is performed the paths of the final bam files
         # is the paths of the primary mapper

diff --git a/reademptionlib/crossalignfilter.py b/reademptionlib/crossalignfilter.py
@@ -0,0 +1,80 @@
+import pysam
+import sys
+
+class CrossAlignFilter(object):
+    """Remove reads that are aligned to replicons of different species.
+
+    Arguments:
+    input_bam -- path of the BAM file to be filtered
+    output_bam -- path of the BAM file without cross-mapped reads
+    output_crossmapped_reads -- path of the text file that lists the
+                                IDs of the cross-mapped reads
+    orgs_and_replicon_ids -- dictionary that maps each organism name to
+                             the list of its replicon IDs
+    """
+
+    def __init__(self, input_bam, output_bam, output_crossmapped_reads,
+                 orgs_and_replicon_ids):
+        self._input_bam = input_bam
+        self._output_bam = output_bam
+        self._output_crossmapped_reads = output_crossmapped_reads
+        self._orgs_and_replicon_ids = orgs_and_replicon_ids
+        self._crossmapped_reads = set()
+        # Stays None until determine_crossmapped_reads was run
+        self.no_of_crossmapped_reads = None
+
+    def determine_crossmapped_reads(self):
+        """Find reads that are mapped to sequences of different species.
+
+        The comparison is performed replicon wise in order to reduce
+        the memory footprint. The IDs of the cross-mapped reads are
+        written sorted, one per line, to the output list file.
+
+        Raises RepliconIdNotInBam if a replicon ID is not part of the
+        BAM header.
+        """
+        self._check_replicon_existance()
+        # Replicon pairs (as sorted tuples) that were compared already;
+        # a set gives constant time look-ups
+        done_replicon_comparison = set()
+        with pysam.Samfile(self._input_bam) as bam:
+            for org, replicon_ids in self._orgs_and_replicon_ids.items():
+                for replicon_id in replicon_ids:
+                    # First, collect the ids of the aligned reads of
+                    # this replicon
+                    self._read_ids = set(
+                        alignment.qname
+                        for alignment in bam.fetch(reference=replicon_id))
+                    # Then compare them to the alignments of each
+                    # replicon of the other organism(s)
+                    for comp_org, comp_replicon_ids in (
+                            self._orgs_and_replicon_ids.items()):
+                        # Only compare replicons of different species
+                        if org == comp_org:
+                            continue
+                        for comp_replicon_id in comp_replicon_ids:
+                            comparison = tuple(
+                                sorted([replicon_id, comp_replicon_id]))
+                            # Check if comparison of the two replicons
+                            # has been done already
+                            if comparison in done_replicon_comparison:
+                                continue
+                            done_replicon_comparison.add(comparison)
+                            # Compare all read ids of the comparison
+                            # replicon to the query replicon read ids
+                            for alignment in bam.fetch(
+                                    reference=comp_replicon_id):
+                                if alignment.qname in self._read_ids:
+                                    self._crossmapped_reads.add(
+                                        alignment.qname)
+        self.no_of_crossmapped_reads = len(self._crossmapped_reads)
+        # Write list of cross mapped read to file. Sorting makes the
+        # output reproducible; nothing is written if no read was found.
+        with open(self._output_crossmapped_reads, "w") as output_list_fh:
+            output_list_fh.writelines(
+                read_id + "\n"
+                for read_id in sorted(self._crossmapped_reads))
+
+    def _check_replicon_existance(self):
+        """Make sure that all given replicon IDs occur in the BAM header.
+
+        Every missing ID is reported on stderr before
+        RepliconIdNotInBam is raised.
+        """
+        found_all = True
+        with pysam.Samfile(self._input_bam) as bam:
+            # Read the reference names once instead of per replicon
+            references = set(bam.references)
+        for replicon_ids in self._orgs_and_replicon_ids.values():
+            for replicon_id in replicon_ids:
+                if replicon_id not in references:
+                    sys.stderr.write(
+                        "\"%s\" not found in BAM header.\n" % replicon_id)
+                    found_all = False
+        if not found_all:
+            raise RepliconIdNotInBam
+
+    def write_crossmapping_free_bam(self):
+        """Write all alignments of non-cross-mapped reads to the output BAM.
+
+        The reads found by determine_crossmapped_reads are left out and
+        the resulting BAM file is indexed.
+        """
+        with pysam.Samfile(self._input_bam) as input_bam:
+            with pysam.Samfile(self._output_bam, "wb",
+                               header=input_bam.header) as output_bam:
+                for alignment in input_bam.fetch():
+                    if alignment.qname not in self._crossmapped_reads:
+                        output_bam.write(alignment)
+        pysam.index(self._output_bam)
+
+class RepliconIdNotInBam(Exception):
+    """Raised if a given replicon ID is missing in the BAM header."""
+
diff --git a/reademptionlib/paths.py b/reademptionlib/paths.py
@@ -308,6 +308,17 @@ def _set_alignment_paths(self, lib_names):
             self.read_alignments_folder, 
             lib_names, appendix="_alignments_realigner")
         ### 
+        # For the cross-aligned cleaned
+        self.read_alignment_bam_cross_cleaned_tmp_paths = self._path_list(
+            self.read_alignments_folder, lib_names, 
+            appendix="_alignments_tmp_crossmapped_cleaned.bam")
+        self.read_alignment_bam_with_crossmappings_paths = self._path_list(
+            self.read_alignments_folder, lib_names, 
+            appendix="_alignments_potententially_with_crossmappings.bam")
+        self.crossmapped_reads_paths = self._path_list(
+            self.read_alignments_folder, lib_names, 
+            appendix="_crossmapped_reads.txt")
+        ###
         # For the final (merged) version
         self.read_alignment_bam_paths = self._path_list(
             self.read_alignments_folder, lib_names, 

diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name='READemption',
-    version='0.2.7',
+    version='0.2.8',
     packages=['reademptionlib', 'tests'],
     author='Konrad U. Förstner',
     author_email='[email protected]',

diff --git a/tests/test_controller.py b/tests/test_controller.py
@@ -19,6 +19,7 @@ class ArgMock(object):
     progress = False
     split = False
     realign = False
+    crossalign_cleaning_str = None
 
 class TestController(unittest.TestCase):