Skip to content

Commit

Permalink
Merge branch 'release_v0.2.8'
Browse files Browse the repository at this point in the history
  • Loading branch information
konrad committed May 9, 2014
2 parents ed1d5d5 + 91d911e commit 0d2b80c
Show file tree
Hide file tree
Showing 10 changed files with 188 additions and 11 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
v0.2.8 (2014-05-09)
- Add removal of crossmapped reads
- Fix bug in align subcommand (reported by Alexander Mellmann)
- Adapt description of the DESeq2 installation
v0.2.7 (2014-05-03)
- Update docs
- Improve Makefile
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
test:
python3.3 tests/test_all.py
python3 tests/test_all.py

coverage:
coverage3 run tests/test_all.py
Expand Down
10 changes: 9 additions & 1 deletion bin/reademption
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ __author__ = "Konrad Foerstner <[email protected]>"
__copyright__ = "2011-2014 by Konrad Foerstner <[email protected]>"
__license__ = "ISC license"
__email__ = "[email protected]"
__version__ = "0.2.6"
__version__ = "0.2.8"

def main():
parser = argparse.ArgumentParser()
Expand Down Expand Up @@ -80,6 +80,14 @@ def main():
read_aligning_parser.add_argument(
"--progress", "-g", default=False, action="store_true",
help="Show progress of the segemehl mapping.")
read_aligning_parser.add_argument(
"--crossalign_cleaning", "-x", default=None,
dest="crossalign_cleaning_str", metavar="CROSSALIGN_CLEANING_STRING",
help="Remove reads that are cross-mapped to replicons of different "
"species. To associated species and replicons give a string in the "
"following format: '<ORG_NAME_1>:<org_1_repl1>,<org_1_repl2>,..,"
"<org_1_repl_n>;<ORG_NAME_2>:<org_2_repl1>,<org_2_repl2>,..,"
"<org_2_repl_n>'")
read_aligning_parser.set_defaults(func=align_reads)

# Parameters for coverage file building
Expand Down
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
# The short X.Y version.
version = '0.2'
# The full version, including alpha/beta/rc tags.
release = '0.2.7'
release = '0.2.8'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
16 changes: 14 additions & 2 deletions docs/source/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,21 @@ Copying the executable to a location that is part of the ``PATH`` e.g
3. Install DESeq2
~~~~~~~~~~~~~~~~~

::
Start ``R``::

R


and install the DESeq2 package inside of the interactive command line
interface. You might be asked to confirm the installation path::

source("http://bioconductor.org/biocLite.R")
biocLite("DESeq2")

Leave ``R``::

quit(save = "no")

echo 'source("http://bioconductor.org/biocLite.R");biocLite("DESeq2")' | sudo Rscript -

Install pysam and READemption
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down
71 changes: 66 additions & 5 deletions reademptionlib/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pysam
from reademptionlib.bammerger import BamMerger
from reademptionlib.coveragecalculator import CoverageCalculator
from reademptionlib.crossalignfilter import CrossAlignFilter
from reademptionlib.deseq import DESeqRunner
from reademptionlib.fasta import FastaParser
from reademptionlib.genewisequanti import GeneWiseQuantification, GeneWiseOverview
Expand Down Expand Up @@ -93,21 +94,81 @@ def align_reads(self):
self._paths.primary_read_aligner_bam_prefix_paths,
self._paths.primary_read_aligner_bam_paths)
self._generate_read_alignment_stats(
self._lib_names,
self._lib_names,
self._paths.primary_read_aligner_bam_paths,
self._paths.unaligned_reads_paths,
self._paths.unaligned_reads_paths,
self._paths.primary_read_aligner_stats_path)
final_unaligned_reads_paths = self._paths.unaligned_reads_paths
if self._args.realign is True:
self._run_realigner_and_process_alignments()
if self._args.realign is True:
self._merge_bam_files()
final_unaligned_reads_paths = (
self._paths.realigned_unaligned_reads_paths)
if not self._args.crossalign_cleaning_str is None:
self._remove_crossaligned_reads()
self._generate_read_alignment_stats(
self._lib_names,
self._lib_names,
self._paths.read_alignment_bam_paths,
self._paths.realigned_unaligned_reads_paths,
final_unaligned_reads_paths,
self._paths.read_alignments_stats_path)
self._write_alignment_stat_table()

def _remove_crossaligned_reads(self):
    """Remove cross-aligned reads from the final BAM file of each library.

    First calls ``_string_to_species_and_sequence_ids`` to populate
    ``self._species_and_sequence_ids``. Then one
    ``_remove_crossaligned_reads_for_lib`` job per library is submitted
    to a process pool with at most ``self._args.processes`` workers, and
    the resulting futures are handed to ``_check_job_completeness``.
    """
    self._string_to_species_and_sequence_ids()
    with concurrent.futures.ProcessPoolExecutor(
            max_workers=self._args.processes) as executor:
        # The four path lists are walked in lockstep: one tuple of
        # (final BAM, backup BAM, temporary cleaned BAM, read list)
        # per library.
        per_lib_paths = zip(
            self._paths.read_alignment_bam_paths,
            self._paths.read_alignment_bam_with_crossmappings_paths,
            self._paths.read_alignment_bam_cross_cleaned_tmp_paths,
            self._paths.crossmapped_reads_paths)
        jobs = [
            executor.submit(
                self._remove_crossaligned_reads_for_lib, *lib_paths)
            for lib_paths in per_lib_paths]
    # Evaluate the outcome of the worker processes
    self._check_job_completeness(jobs)

def _remove_crossaligned_reads_for_lib(
        self, bam_path, bam_with_crossmappings_path, bam_cleaned_tmp_path,
        crossmapped_reads_path):
    """Replace one library's BAM file by a version without cross-mappings.

    The signature now includes ``self``: the body reads
    ``self._species_and_sequence_ids`` and the method is submitted as a
    bound method with four path arguments.

    Args:
        bam_path: final alignment BAM file of the library. After the
            call this path holds the cleaned alignments.
        bam_with_crossmappings_path: path the original (unfiltered)
            BAM file is moved to.
        bam_cleaned_tmp_path: temporary path the filtered BAM file is
            written to before being moved to ``bam_path``.
        crossmapped_reads_path: path handed to ``CrossAlignFilter`` as
            the output file for the list of cross-mapped reads.
    """
    # Perform the removal of cross aligned reads
    cross_align_filter = CrossAlignFilter(
        bam_path, bam_cleaned_tmp_path, crossmapped_reads_path,
        self._species_and_sequence_ids)
    cross_align_filter.determine_crossmapped_reads()
    cross_align_filter.write_crossmapping_free_bam()
    # Rename the original mapping file that potentially
    # contains cross aligned reads (together with its index)
    os.rename(bam_path, bam_with_crossmappings_path)
    os.rename(bam_path + ".bai", bam_with_crossmappings_path + ".bai")
    # Move the cross aligned filtered file (and its index) to the
    # final mapping path
    os.rename(bam_cleaned_tmp_path, bam_path)
    os.rename(bam_cleaned_tmp_path + ".bai", bam_path + ".bai")

def _string_to_species_and_sequence_ids(self):
self._species_and_sequence_ids = {}
orgs_and_seq_ids_strs = self._args.crossalign_cleaning_str.split(";")
if len(orgs_and_seq_ids_strs) < 2:
self._write_err_msg_and_quit(
"Error! Only one organism is defined for the cross align "
"removal. This does not make sense.\nYou gave the "
"following input:\n%s\n" % self._args.crossalign_cleaning_str)
for org_and_seq_ids_str in orgs_and_seq_ids_strs:
org, seq_ids_str = org_and_seq_ids_str.strip().split(":")
seq_ids = [seq_id.strip() for seq_id in seq_ids_str.split(",")]
if "" in seq_ids: seq_ids.remove("")
if len(seq_ids) < 1:
self._write_err_msg_and_quit(
"Error! No sequence ID was given for the species '%s'. "
"This does not make sense.\nYou gave the "
"following input:\n%s\n" % (
org, self._args.crossalign_cleaning_str))
self._species_and_sequence_ids[org] = seq_ids

def _set_primary_aligner_paths_to_final_paths(self):
# If no remapping is performed the paths of the final bam files
# is the paths of the primary mapper
Expand Down
80 changes: 80 additions & 0 deletions reademptionlib/crossalignfilter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import pysam
import sys

class CrossAlignFilter(object):
    """Detect and remove reads that are aligned to several species.

    A read is treated as cross-mapped if its name occurs in alignments
    to replicons that belong to different organisms.

    Typical use: call ``determine_crossmapped_reads`` first and
    ``write_crossmapping_free_bam`` afterwards.
    """

    def __init__(self, input_bam, output_bam, output_crossmapped_reads,
                 orgs_and_replicon_ids):
        """Store the paths and the organism/replicon association.

        Args:
            input_bam: path of the BAM file to be filtered.
            output_bam: path the filtered BAM file is written to.
            output_crossmapped_reads: path of the text file that lists
                the names of the cross-mapped reads (one per line).
            orgs_and_replicon_ids: dictionary mapping an organism name
                to the list of its replicon IDs.
        """
        self._input_bam = input_bam
        self._output_bam = output_bam
        self._output_crossmapped_reads = output_crossmapped_reads
        self._orgs_and_replicon_ids = orgs_and_replicon_ids
        self._crossmapped_reads = set()
        # Stays None until determine_crossmapped_reads has been run
        self.no_of_crossmapped_reads = None

    def determine_crossmapped_reads(self):
        """Find reads that are mapped to sequences of different species.

        The comparison is performed replicon wise in order to reduce
        the memory footprint. The names of the detected reads are kept
        in ``self._crossmapped_reads``, their number is stored in
        ``self.no_of_crossmapped_reads`` and the names are written to
        the cross-mapped reads output file.

        Raises:
            RepliconIdNotInBam: if a given replicon ID is not part of
                the references of the input BAM file.
        """
        self._check_replicon_existance()
        # Unordered replicon pairs that have been compared already;
        # a set of tuples gives constant-time membership tests.
        done_replicon_comparisons = set()
        with pysam.Samfile(self._input_bam) as bam:
            for org, replicon_ids in self._orgs_and_replicon_ids.items():
                for replicon_id in replicon_ids:
                    # First, collect the ids of the aligned reads of
                    # this replicon
                    self._read_ids = set(
                        alignment.qname for alignment
                        in bam.fetch(reference=replicon_id))
                    # Then compare them to the alignments of each
                    # replicon of the other organism(s)
                    for comp_org, comp_replicon_ids in (
                            self._orgs_and_replicon_ids.items()):
                        # Only compare replicons of different species
                        if org == comp_org:
                            continue
                        for comp_replicon_id in comp_replicon_ids:
                            comparison = tuple(
                                sorted([replicon_id, comp_replicon_id]))
                            # Check if comparison of the two replicons
                            # has been done already
                            if comparison in done_replicon_comparisons:
                                continue
                            done_replicon_comparisons.add(comparison)
                            # Compare all read ids of the comparison
                            # replicon to the query replicon read ids
                            self._crossmapped_reads.update(
                                alignment.qname for alignment
                                in bam.fetch(reference=comp_replicon_id)
                                if alignment.qname in self._read_ids)
        self.no_of_crossmapped_reads = len(self._crossmapped_reads)
        self._write_crossmapped_reads()

    def _write_crossmapped_reads(self):
        """Write the names of the cross-mapped reads, one per line.

        The names are sorted so that the output is reproducible; the
        file is left empty if no cross-mapped read was found.
        """
        with open(self._output_crossmapped_reads, "w") as output_list_fh:
            output_list_fh.writelines(
                "%s\n" % read_id
                for read_id in sorted(self._crossmapped_reads))

    def _check_replicon_existance(self):
        """Make sure every replicon ID is a reference of the input BAM.

        Each missing ID is reported on stderr before the exception is
        raised, so that all of them are shown at once.

        Raises:
            RepliconIdNotInBam: if at least one replicon ID is missing.
        """
        found_all = True
        with pysam.Samfile(self._input_bam) as bam:
            # Build the lookup set once instead of scanning the
            # reference names for every replicon ID.
            references = set(bam.references)
            for replicon_ids in self._orgs_and_replicon_ids.values():
                for replicon_id in replicon_ids:
                    if replicon_id not in references:
                        sys.stderr.write(
                            "\"%s\" not found in BAM header.\n" % replicon_id)
                        found_all = False
        if not found_all:
            raise RepliconIdNotInBam

    def write_crossmapping_free_bam(self):
        """Write the alignments of all non-cross-mapped reads and index them.

        Copies every alignment returned by ``fetch()`` on the input BAM
        file whose read name is not in ``self._crossmapped_reads`` to
        the output BAM file (using the header of the input file) and
        builds an index for the output file afterwards.
        """
        with pysam.Samfile(self._input_bam) as input_bam:
            with pysam.Samfile(self._output_bam, "wb",
                               header=input_bam.header) as output_bam:
                for alignment in input_bam.fetch():
                    if alignment.qname not in self._crossmapped_reads:
                        output_bam.write(alignment)
        # Index only after the output file has been closed
        pysam.index(self._output_bam)

class RepliconIdNotInBam(Exception):
    """Raised if a replicon ID is not among the references of a BAM file.

    Derives from ``Exception`` (not ``BaseException``) so that generic
    ``except Exception`` handlers treat it like any other error.
    """

11 changes: 11 additions & 0 deletions reademptionlib/paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,17 @@ def _set_alignment_paths(self, lib_names):
self.read_alignments_folder,
lib_names, appendix="_alignments_realigner")
###
# For the cross-aligned cleaned
self.read_alignment_bam_cross_cleaned_tmp_paths = self._path_list(
self.read_alignments_folder, lib_names,
appendix="_alignments_tmp_crossmapped_cleaned.bam")
self.read_alignment_bam_with_crossmappings_paths = self._path_list(
self.read_alignments_folder, lib_names,
appendix="_alignments_potententially_with_crossmappings.bam")
self.crossmapped_reads_paths = self._path_list(
self.read_alignments_folder, lib_names,
appendix="_crossmapped_reads.txt")
###
# For the final (merged) version
self.read_alignment_bam_paths = self._path_list(
self.read_alignments_folder, lib_names,
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setup(
name='READemption',
version='0.2.7',
version='0.2.8',
packages=['reademptionlib', 'tests'],
author='Konrad U. Förstner',
author_email='[email protected]',
Expand Down
1 change: 1 addition & 0 deletions tests/test_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ class ArgMock(object):
progress = False
split = False
realign = False
crossalign_cleaning_str = None

class TestController(unittest.TestCase):

Expand Down

0 comments on commit 0d2b80c

Please sign in to comment.