diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 1b349e23..646605d6 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -1,3 +1,7 @@ +v0.2.8 (2014-05-09) +- Add removal of crossmapped reads +- Fix bug in align subcommand (reported by Alexander Mellmann) +- Adapt description of the DESeq2 installation v0.2.7 (2014-05-03) - Update docs - Improve Makefile diff --git a/Makefile b/Makefile index 030db29a..5a5002d5 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ test: - python3.3 tests/test_all.py + python3 tests/test_all.py coverage: coverage3 run tests/test_all.py diff --git a/bin/reademption b/bin/reademption index 5698f117..d8adf68e 100755 --- a/bin/reademption +++ b/bin/reademption @@ -9,7 +9,7 @@ __author__ = "Konrad Foerstner " __copyright__ = "2011-2014 by Konrad Foerstner " __license__ = "ISC license" __email__ = "konrad@foerstner.org" -__version__ = "0.2.6" +__version__ = "0.2.8" def main(): parser = argparse.ArgumentParser() @@ -80,6 +80,14 @@ def main(): read_aligning_parser.add_argument( "--progress", "-g", default=False, action="store_true", help="Show progress of the segemehl mapping.") + read_aligning_parser.add_argument( + "--crossalign_cleaning", "-x", default=None, + dest="crossalign_cleaning_str", metavar="CROSSALIGN_CLEANING_STRING", + help="Remove reads that are cross-mapped to replicons of different " + "species. To associated species and replicons give a string in the " + "following format: ':,,..," + ";:,,..," + "'") read_aligning_parser.set_defaults(func=align_reads) # Parameters for coverage file building diff --git a/docs/source/conf.py b/docs/source/conf.py index 4cbdd1de..9ca86931 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -50,7 +50,7 @@ # The short X.Y version. version = '0.2' # The full version, including alpha/beta/rc tags. -release = '0.2.7' +release = '0.2.8' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 1945e104..91ec4a61 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -73,9 +73,21 @@ Copying the executable to a location that is part of the ``PATH`` e.g 3. Install DESeq2 ~~~~~~~~~~~~~~~~~ -:: +Start ``R``:: + + R + + +and install the DESeq2 package inside of the interactive command line +interface. You might be asked to confirm the installation path:: + + source("http://bioconductor.org/biocLite.R") + biocLite("DESeq2") + +Leave ``R``:: + + quit(save = "no") - echo 'source("http://bioconductor.org/biocLite.R");biocLite("DESeq2")' | sudo Rscript - Install pysam and READemption ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/reademptionlib/controller.py b/reademptionlib/controller.py index 526fb595..9e4bf80c 100644 --- a/reademptionlib/controller.py +++ b/reademptionlib/controller.py @@ -5,6 +5,7 @@ import pysam from reademptionlib.bammerger import BamMerger from reademptionlib.coveragecalculator import CoverageCalculator +from reademptionlib.crossalignfilter import CrossAlignFilter from reademptionlib.deseq import DESeqRunner from reademptionlib.fasta import FastaParser from reademptionlib.genewisequanti import GeneWiseQuantification, GeneWiseOverview @@ -93,21 +94,81 @@ def align_reads(self): self._paths.primary_read_aligner_bam_prefix_paths, self._paths.primary_read_aligner_bam_paths) self._generate_read_alignment_stats( - self._lib_names, + self._lib_names, self._paths.primary_read_aligner_bam_paths, - self._paths.unaligned_reads_paths, + self._paths.unaligned_reads_paths, self._paths.primary_read_aligner_stats_path) + final_unaligned_reads_paths = self._paths.unaligned_reads_paths if self._args.realign is True: self._run_realigner_and_process_alignments() - if self._args.realign is True: self._merge_bam_files() + final_unaligned_reads_paths = ( + self._paths.realigned_unaligned_reads_paths) + if not self._args.crossalign_cleaning_str is None: + 
def _remove_crossaligned_reads(self):
    """Remove cross-aligned reads from the final BAM file of every library.

    Parses the ``--crossalign_cleaning`` string into
    ``self._species_and_sequence_ids`` and then submits one
    ``_remove_crossaligned_reads_for_lib`` job per library to a process
    pool of ``self._args.processes`` workers.
    """
    self._string_to_species_and_sequence_ids()
    path_sets = zip(
        self._paths.read_alignment_bam_paths,
        self._paths.read_alignment_bam_with_crossmappings_paths,
        self._paths.read_alignment_bam_cross_cleaned_tmp_paths,
        self._paths.crossmapped_reads_paths)
    jobs = []
    with concurrent.futures.ProcessPoolExecutor(
            max_workers=self._args.processes) as executor:
        for (bam_path, bam_with_crossmappings_path,
             bam_cleaned_tmp_path, crossmapped_reads_path) in path_sets:
            jobs.append(executor.submit(
                self._remove_crossaligned_reads_for_lib, bam_path,
                bam_with_crossmappings_path, bam_cleaned_tmp_path,
                crossmapped_reads_path))
    # Evaluate the outcome of the worker processes
    self._check_job_completeness(jobs)


def _remove_crossaligned_reads_for_lib(
        self, bam_path, bam_with_crossmappings_path, bam_cleaned_tmp_path,
        crossmapped_reads_path):
    """Replace one library's BAM file by a version without cross-mappings.

    ``self`` was missing from the original signature although the method
    is submitted as a bound method and reads
    ``self._species_and_sequence_ids``; every call therefore failed with
    a ``TypeError``.

    bam_path -- final alignment file; holds the cleaned alignments
        afterwards
    bam_with_crossmappings_path -- where the unfiltered original is kept
    bam_cleaned_tmp_path -- temporary output of the filter
    crossmapped_reads_path -- text file listing the removed read ids
    """
    # Perform the removal of cross aligned reads
    cross_align_filter = CrossAlignFilter(
        bam_path, bam_cleaned_tmp_path, crossmapped_reads_path,
        self._species_and_sequence_ids)
    cross_align_filter.determine_crossmapped_reads()
    cross_align_filter.write_crossmapping_free_bam()
    # Rename the original mapping file that potentially
    # contains cross aligned reads
    os.rename(bam_path, bam_with_crossmappings_path)
    os.rename(bam_path + ".bai", bam_with_crossmappings_path + ".bai")
    # Move the cross aligned filtered file to the final mapping
    # path
    os.rename(bam_cleaned_tmp_path, bam_path)
    os.rename(bam_cleaned_tmp_path + ".bai", bam_path + ".bai")


def _string_to_species_and_sequence_ids(self):
    """Parse the cross align cleaning string into a species -> ids dict.

    The expected form is ``species_1:id,id,..;species_2:id,id,..``. The
    result is stored in ``self._species_and_sequence_ids``. Problems are
    reported via ``self._write_err_msg_and_quit`` (presumably terminates
    the program -- its definition is not part of this block).
    """
    self._species_and_sequence_ids = {}
    cleaning_str = self._args.crossalign_cleaning_str
    orgs_and_seq_ids_strs = cleaning_str.split(";")
    if len(orgs_and_seq_ids_strs) < 2:
        self._write_err_msg_and_quit(
            "Error! Only one organism is defined for the cross align "
            "removal. This does not make sense.\nYou gave the "
            "following input:\n%s\n" % self._args.crossalign_cleaning_str)
    for org_and_seq_ids_str in orgs_and_seq_ids_strs:
        org, separator, seq_ids_str = (
            org_and_seq_ids_str.strip().partition(":"))
        # Exactly one ":" is required per entry. Unpacking a plain
        # split(":") raised an uncaught ValueError for e.g. a trailing
        # ";" or an entry without sequence ids.
        if not separator or ":" in seq_ids_str:
            self._write_err_msg_and_quit(
                "Error! The entry '%s' of the cross align cleaning string "
                "is not of the form 'species:id_1,id_2,..'.\nYou gave the "
                "following input:\n%s\n" % (
                    org_and_seq_ids_str, cleaning_str))
            continue
        # Drop *all* empty ids (list.remove only dropped the first one)
        seq_ids = [seq_id.strip() for seq_id in seq_ids_str.split(",")
                   if seq_id.strip()]
        if len(seq_ids) < 1:
            self._write_err_msg_and_quit(
                "Error! No sequence ID was given for the species '%s'. "
                "This does not make sense.\nYou gave the "
                "following input:\n%s\n" % (
                    org, self._args.crossalign_cleaning_str))
        self._species_and_sequence_ids[org] = seq_ids
class CrossAlignFilter(object):
    """Find and remove reads that are aligned to more than one species.

    input_bam -- path of the BAM file to inspect (fetched per replicon,
        so it presumably has to be indexed)
    output_bam -- path of the BAM file without cross-mapped reads
    output_crossmapped_reads -- path of the text file that lists the
        ids of the cross-mapped reads
    orgs_and_replicon_ids -- dict: species name -> list of replicon ids
    """

    def __init__(self, input_bam, output_bam, output_crossmapped_reads,
                 orgs_and_replicon_ids):
        self._input_bam = input_bam
        self._output_bam = output_bam
        self._output_crossmapped_reads = output_crossmapped_reads
        self._orgs_and_replicon_ids = orgs_and_replicon_ids
        self._crossmapped_reads = set()
        # Stays None until determine_crossmapped_reads() has run
        self.no_of_crossmapped_reads = None

    def determine_crossmapped_reads(self):
        """Find reads that are mapped to sequences of different species.

        The comparison is performed replicon wise in order to reduce
        the memory footprint. The ids found are kept for
        write_crossmapping_free_bam() and written, one per line, to the
        cross-mapped reads file.

        Raises RepliconIdNotInBam if a replicon id is not a reference
        of the input BAM file.
        """
        self._check_replicon_existance()
        with pysam.Samfile(self._input_bam) as bam:
            self._collect_crossmapped_reads(bam)
        self.no_of_crossmapped_reads = len(self._crossmapped_reads)
        # Write list of cross mapped read to file (sorted, so that the
        # output does not depend on the set iteration order)
        with open(self._output_crossmapped_reads, "w") as output_list_fh:
            output_list_fh.write(
                "\n".join(sorted(self._crossmapped_reads)) + "\n")

    def _collect_crossmapped_reads(self, bam):
        """Add the ids of all cross-mapped reads to the internal set.

        bam -- open alignment file; only ``fetch(reference=...)`` and
            the ``qname`` of the returned alignments are used.
        """
        # Unordered replicon pairs that were compared already. A set of
        # tuples gives constant time look-ups.
        compared_pairs = set()
        for org, replicon_ids in self._orgs_and_replicon_ids.items():
            for replicon_id in replicon_ids:
                # First, collect the ids of the aligned reads of
                # this replicon
                self._read_ids = {
                    alignment.qname
                    for alignment in bam.fetch(reference=replicon_id)}
                # Then compare them to the alignments of each
                # replicon of the other organism(s)
                for comp_org, comp_replicon_ids in (
                        self._orgs_and_replicon_ids.items()):
                    # Only compare replicons of different species
                    if comp_org == org:
                        continue
                    for comp_replicon_id in comp_replicon_ids:
                        pair = tuple(sorted((replicon_id, comp_replicon_id)))
                        if pair in compared_pairs:
                            continue
                        compared_pairs.add(pair)
                        # Reads of the comparison replicon that were
                        # also seen on the query replicon
                        self._crossmapped_reads.update(
                            alignment.qname
                            for alignment in bam.fetch(
                                reference=comp_replicon_id)
                            if alignment.qname in self._read_ids)

    def _check_replicon_existance(self):
        """Raise RepliconIdNotInBam if a replicon id is unknown to the BAM.

        Every missing id is additionally reported on stderr.
        """
        with pysam.Samfile(self._input_bam) as bam:
            # Evaluate the property once instead of once per id
            references = set(bam.references)
        missing_ids = [
            replicon_id
            for replicon_ids in self._orgs_and_replicon_ids.values()
            for replicon_id in replicon_ids
            if replicon_id not in references]
        for replicon_id in missing_ids:
            sys.stderr.write(
                "\"%s\" not found in BAM header.\n" % replicon_id)
        if missing_ids:
            raise RepliconIdNotInBam(
                "Replicon ID(s) not found in BAM header: %s" % (
                    ", ".join(missing_ids)))

    def write_crossmapping_free_bam(self):
        """Write all alignments of non cross-mapped reads and index them.

        Uses the read ids collected by determine_crossmapped_reads().
        """
        with pysam.Samfile(self._input_bam) as input_bam:
            with pysam.Samfile(self._output_bam, "wb",
                               header=input_bam.header) as output_bam:
                for alignment in input_bam.fetch():
                    if alignment.qname not in self._crossmapped_reads:
                        output_bam.write(alignment)
        pysam.index(self._output_bam)


class RepliconIdNotInBam(Exception):
    """A requested replicon id is not a reference of the BAM file.

    Derives from Exception (not BaseException) so that generic
    ``except Exception`` handlers treat it like any other error.
    """