From b0b83252321191f3d54f340c4901accddc8d5ade Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Sun, 11 Jan 2015 12:47:37 +0100 Subject: [PATCH 01/22] Add non-strand specific coverage calculation --- bin/reademption | 5 +++++ docs/source/subcommands.rst | 4 ++++ reademptionlib/controller.py | 9 ++++++--- reademptionlib/coveragecalculator.py | 12 +++++++++++- 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/bin/reademption b/bin/reademption index 0ad27c24..8e2ae4d2 100755 --- a/bin/reademption +++ b/bin/reademption @@ -126,6 +126,11 @@ def main(): "--skip_read_count_splitting", "-s", default=False, action="store_true", help="Do not split the read counting between " "different alignings. Default is to do the splitting.") + coverage_creation_parser.add_argument( + "--non_strand_specific", "-d", default=False, + action="store_true", help="Do not distict between the coverage of the " + "forward and reverse strand but sum them to a single value for each " + "base.") coverage_creation_parser.add_argument( "--first_base_only", "-b", default=False, action="store_true", help="Only the first bases 5' base of each read " diff --git a/docs/source/subcommands.rst b/docs/source/subcommands.rst index 681ba119..2c47cc2e 100644 --- a/docs/source/subcommands.rst +++ b/docs/source/subcommands.rst @@ -179,6 +179,10 @@ positions. To turn off this behavior use --skip_read_count_splitting, -s Do not split the read counting between different alignings. Default is to do the splitting. + --non_strand_specific, -d + Do not distict between the coverage of the forward and + reverse strand but sum them to a single value for each + base. --first_base_only, -b Only the first bases 5' base of each read aligning is taken into account. diff --git a/reademptionlib/controller.py b/reademptionlib/controller.py index 7c805452..75e99553 100644 --- a/reademptionlib/controller.py +++ b/reademptionlib/controller.py @@ -510,7 +510,10 @@ def _all_coverage_file_exist( def _create_coverage_files_for_lib( self, lib_name, bam_path, no_of_aligned_reads, min_no_of_aligned_reads): """Perform the coverage calculation for a given library.""" - strands = ["forward", "reverse"] + if not self._args.non_strand_specific: + strands = ["forward", "reverse"] + else: + strands = ["forward_and_reverse"] if self._all_coverage_file_exist( lib_name, strands, no_of_aligned_reads, min_no_of_aligned_reads): return @@ -520,7 +523,8 @@ def _create_coverage_files_for_lib( coverage_calculator = CoverageCalculator( read_count_splitting=read_count_splitting, uniqueley_aligned_only=self._args.unique_only, - first_base_only=self._args.first_base_only) + first_base_only=self._args.first_base_only, + non_strand_specific=self._args.non_strand_specific) (coverage_writers_raw, coverage_writers_tnoar_min_norm, coverage_writers_tnoar_mil_norm) = self._wiggle_writers( lib_name, strands, no_of_aligned_reads, min_no_of_aligned_reads) @@ -777,4 +781,3 @@ def viz_deseq(self): deseq_viz.create_volcano_plots( self._paths.viz_deseq_volcano_plot_path, self._paths.viz_deseq_volcano_plot_adj_path) - diff --git a/reademptionlib/coveragecalculator.py b/reademptionlib/coveragecalculator.py index 77ddc8a7..d259780d 100644 --- a/reademptionlib/coveragecalculator.py +++ b/reademptionlib/coveragecalculator.py @@ -3,20 +3,30 @@ class CoverageCalculator(object): def __init__(self, read_count_splitting=True, uniqueley_aligned_only=False, - first_base_only=False): + first_base_only=False, non_strand_specific=False): self._read_count_splitting = read_count_splitting self._uniqueley_aligned_only = uniqueley_aligned_only self._first_base_only = first_base_only self._coverage_add_function = self._select_coverage_add_function() self._coverages = {} + self._non_strand_specific = non_strand_specific def ref_seq_and_coverages(self, bam_path): bam = self._open_bam_file(bam_path) for ref_seq, length in zip(bam.references, bam.lengths): self._init_coverage_list(length) self._calc_coverage(ref_seq, bam) + if self._non_strand_specific: + self._sum_strand_coverages() yield(ref_seq, self._coverages) + def _sum_strand_coverages(self): + self._coverages["forward_and_reverse"] = [ + cov_for + abs(cov_rev) for cov_for, cov_rev in + zip(self._coverages["forward"], self._coverages["reverse"])] + self._coverages.pop("forward") + self._coverages.pop("reverse") + def _init_coverage_list(self, length): for strand in ["forward", "reverse"]: self._coverages[strand] = [0.0] * length From 3e4b858ca2c23ba3893ae59bd2176ccc9125dbca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Sun, 11 Jan 2015 12:53:15 +0100 Subject: [PATCH 02/22] Fix DESeq url --- docs/source/subcommands.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/subcommands.rst b/docs/source/subcommands.rst index 2c47cc2e..b2093f11 100644 --- a/docs/source/subcommands.rst +++ b/docs/source/subcommands.rst @@ -254,7 +254,7 @@ deseq ----- Differential gene expression can be performed using ``deseq`` which -will run a `DESeq2 `_ +will run a `DESeq2 `_ analyses for all possible combinations of conditions. To allocated the conditions to the libraries use the ``--libs`` and ``--conditions`` parameters (e.g. ``--libs From 5f3191c030226b7476e4751ba3a6d8ec0b0e8ef4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Sun, 11 Jan 2015 14:57:53 +0100 Subject: [PATCH 03/22] Adapt to pysam 0.8.1 API The pysam API has changed and the code of READemption was adapted accordingly. Due to this a never version (=> 0.8.1) is required. --- reademptionlib/genewisequanti.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/reademptionlib/genewisequanti.py b/reademptionlib/genewisequanti.py index 499d7187..f8dd7ae8 100644 --- a/reademptionlib/genewisequanti.py +++ b/reademptionlib/genewisequanti.py @@ -124,7 +124,7 @@ def _overlapping_alignments(self, sam, entry): # this correctly (checked in IGB, IGV and the unit testings). for alignment in sam.fetch( reference=entry.seq_id, start=entry.start-1, end=entry.end): - if alignment.overlap(entry.start-1, entry.end) < self._min_overlap: + if alignment.get_overlap(entry.start-1, entry.end) < self._min_overlap: continue if self._skip_antisense: if not self._same_strand(entry, alignment): diff --git a/setup.py b/setup.py index 6f962eef..b6196a11 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ description='A RNA-Seq Analysis Pipeline', url='', install_requires=[ - "pysam >= 0.7.8" + "pysam >= 0.8.1" ], scripts=['bin/reademption'], license='ISC License (ISCL)', From dcc0752d26aeb8910b85c938effada51b63450f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Sun, 11 Jan 2015 15:01:10 +0100 Subject: [PATCH 04/22] Add further dependcies --- setup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b6196a11..0cb116c4 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,9 @@ description='A RNA-Seq Analysis Pipeline', url='', install_requires=[ - "pysam >= 0.8.1" + "pysam >= 0.8.1", + "biopython >= 1.65", + "matplotlib >= 1.4.2", ], scripts=['bin/reademption'], license='ISC License (ISCL)', From 22ef32cda2ea9997b53f5e948f9af146b8d84ed8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Sun, 11 Jan 2015 15:01:51 +0100 Subject: [PATCH 05/22] Adapt coverage call --- Makefile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 7a0277f1..7c8c1882 100644 --- a/Makefile +++ b/Makefile @@ -2,12 +2,15 @@ test: python3 tests/test_all.py coverage: - coverage3 run tests/test_all.py - coverage3 report + python3-coverage run tests/test_all.py + python3-coverage report package: python3 setup.py sdist +build: + python3 setup.py bdist + package_to_pypi: python3 setup.py sdist upload @echo "Go to https://pypi.python.org/pypi/READemption/" From 1f82d17b0cc8db7accbffcfbeaf77c3868016b85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Sun, 11 Jan 2015 16:37:05 +0100 Subject: [PATCH 06/22] Add cleaning after sdist --- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index 7c8c1882..710a9af1 100644 --- a/Makefile +++ b/Makefile @@ -7,6 +7,8 @@ coverage: package: python3 setup.py sdist + rm -rf READemption.egg-info + ls dist/* build: python3 setup.py bdist From 755eca7b6607bbcd8833cb45b90b172a14d54c9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Sun, 11 Jan 2015 22:26:57 +0100 Subject: [PATCH 07/22] Remove unnecessary imports --- reademptionlib/controller.py | 3 +-- reademptionlib/projectcreator.py | 4 +++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/reademptionlib/controller.py b/reademptionlib/controller.py index 75e99553..aa7e70ec 100644 --- a/reademptionlib/controller.py +++ b/reademptionlib/controller.py @@ -2,13 +2,12 @@ import os import sys import json -import pysam from reademptionlib.bammerger import BamMerger from reademptionlib.coveragecalculator import CoverageCalculator from reademptionlib.crossalignfilter import CrossAlignFilter from reademptionlib.deseq import DESeqRunner from reademptionlib.fasta import FastaParser -from reademptionlib.genewisequanti import GeneWiseQuantification, GeneWiseOverview +from reademptionlib.genewisequanti import GeneWiseOverview from reademptionlib.paths import Paths from reademptionlib.projectcreator import ProjectCreator from reademptionlib.rawstatdata import RawStatDataWriter, RawStatDataReader diff --git a/reademptionlib/projectcreator.py b/reademptionlib/projectcreator.py index 2c0d2a7e..6be18b53 100644 --- a/reademptionlib/projectcreator.py +++ b/reademptionlib/projectcreator.py @@ -1,6 +1,8 @@ -import json import os import sys +import Bio +import pysam +import matplotlib class ProjectCreator(object): From 54c9c650c543bfdac06a5260d3c73cbc0b2f2403 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Sun, 11 Jan 2015 22:28:20 +0100 Subject: [PATCH 08/22] Typo fix) --- reademptionlib/parameterlog.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/reademptionlib/parameterlog.py b/reademptionlib/parameterlog.py index 43737c3e..ee555d11 100644 --- a/reademptionlib/parameterlog.py +++ b/reademptionlib/parameterlog.py @@ -6,9 +6,9 @@ def __init__(self, log_file): class ParameterLogger(ParameterLog): - def add_paramter(self, paramter, value): + def add_paramter(self, parameter, value): log_fh = open(self.log_file, "a") - log_fh.write(self.delimiter.join([paramter, value]) + "\n") + log_fh.write(self.delimiter.join([parameter, value]) + "\n") log_fh.close() class ParameterLogReader(ParameterLog): From b71f67b2139763c68237b9a4b3d75558f8224f29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Sun, 11 Jan 2015 22:29:21 +0100 Subject: [PATCH 09/22] Add further lib version to be logged --- reademptionlib/projectcreator.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/reademptionlib/projectcreator.py b/reademptionlib/projectcreator.py index 6be18b53..8fd68af3 100644 --- a/reademptionlib/projectcreator.py +++ b/reademptionlib/projectcreator.py @@ -33,4 +33,8 @@ def create_subfolders(self, subfolders): def create_version_file(self, version_file_path, version): with open(version_file_path, "w") as fh: - fh.write("READemption version %s" % version) + fh.write("READemption version: %s\n" % version) + fh.write("Python version: %s\n" % sys.version.replace("\n", " ")) + fh.write("Biopython version: %s\n" % Bio.__version__) + fh.write("pysam version: %s\n" % pysam.__version__) + fh.write("matplotlib version: %s\n" % matplotlib.__version__) From d5804d06782f708ebaa35c84dd24e716b28b516b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Sun, 11 Jan 2015 22:30:08 +0100 Subject: [PATCH 10/22] Rename version logging file --- reademptionlib/paths.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/reademptionlib/paths.py b/reademptionlib/paths.py index 0d0acab0..9e34e76b 100644 --- a/reademptionlib/paths.py +++ b/reademptionlib/paths.py @@ -127,7 +127,8 @@ def _set_static_files(self): self.deseq_tmp_session_info_script = "%s/tmp.R" % self.deseq_raw_folder self.deseq_session_info = "%s/R_session_info.txt" % ( self.deseq_raw_folder) - self.version_path = "%s/used_reademption_version.txt" % (self.align_report_folder) + self.version_path = "%s/versions_of_used_libraries.txt" % ( + self.align_report_folder) def _get_sorted_folder_content(self, folder): """Return the sorted file list of a folder""" From 4999e01596600575ee59ec31c192914714517356 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Mon, 12 Jan 2015 10:15:08 +0100 Subject: [PATCH 11/22] Style changes based on flake8 recommendations --- reademptionlib/bammerger.py | 1 + reademptionlib/controller.py | 300 ++++++++++++++------------- reademptionlib/coveragecalculator.py | 1 + 3 files changed, 160 insertions(+), 142 deletions(-) diff --git a/reademptionlib/bammerger.py b/reademptionlib/bammerger.py index e0304740..4e6fc600 100644 --- a/reademptionlib/bammerger.py +++ b/reademptionlib/bammerger.py @@ -2,6 +2,7 @@ import sys import pysam + class BamMerger(object): def merge(self, output_bam, input_bam_1, input_bam_2): diff --git a/reademptionlib/controller.py b/reademptionlib/controller.py index aa7e70ec..3a53ce11 100644 --- a/reademptionlib/controller.py +++ b/reademptionlib/controller.py @@ -7,6 +7,7 @@ from reademptionlib.crossalignfilter import CrossAlignFilter from reademptionlib.deseq import DESeqRunner from reademptionlib.fasta import FastaParser +from reademptionlib.genewisequanti import GeneWiseQuantification from reademptionlib.genewisequanti import GeneWiseOverview from reademptionlib.paths import Paths from reademptionlib.projectcreator import ProjectCreator @@ -19,6 +20,7 @@ from reademptionlib.sambamconverter import SamToBamConverter from reademptionlib.wiggle import WiggleWriter + class Controller(object): """Manage the actions of the subcommands. @@ -42,7 +44,7 @@ def create_project(self, version): " / _ \/ __/ _ | / _ \___ __ _ ___ / /_(_)__ ___\n" " / , _/ _// __ |/ // / -_) ' \/ _ \/ __/ / _ \/ _ \\\n" "/_/|_/___/_/ |_/____/\__/_/_/_/ .__/\__/_/\___/_//_/\n" - " / /\n" + " / /\n" "====================================================\n" "========================================\n" "=======================\n" @@ -53,10 +55,11 @@ def create_project(self, version): project_creator.create_subfolders(self._paths.required_folders()) project_creator.create_version_file(self._paths.version_path, version) sys.stdout.write("Created folder \"%s\" and required subfolders.\n" % ( - self._args.project_path)) + self._args.project_path)) sys.stdout.write("Please copy read files into folder \"%s\" and " "reference sequences files into folder \"%s\".\n" % ( - self._paths.read_fasta_folder, self._paths.ref_seq_folder)) + self._paths.read_fasta_folder, + self._paths.ref_seq_folder)) def align_reads(self): """Perform the alignment of the reads.""" @@ -66,13 +69,13 @@ def align_reads(self): self._ref_seq_files = self._paths.get_ref_seq_files() self._paths.set_ref_seq_paths(self._ref_seq_files) self._test_align_file_existance() - if self._args.paired_end is False: + if not self._args.paired_end: # Single end reads self._read_files = self._paths.get_read_files() self._lib_names = self._paths.get_lib_names_single_end() self._paths.set_read_files_dep_file_lists_single_end( self._read_files, self._lib_names) - if self._args.realign is False: + if not self._args.realign: self._set_primary_aligner_paths_to_final_paths() self._prepare_reads_single_end() self._align_single_end_reads() @@ -82,7 +85,7 @@ def align_reads(self): self._lib_names = self._paths.get_lib_names_paired_end() self._paths.set_read_files_dep_file_lists_paired_end( self._read_file_pairs, self._lib_names) - if self._args.realign is False: + if not self._args.realign: self._set_primary_aligner_paths_to_final_paths() self._prepare_reads_paired_end() self._align_paired_end_reads() @@ -96,12 +99,12 @@ def align_reads(self): self._paths.unaligned_reads_paths, self._paths.primary_read_aligner_stats_path) final_unaligned_reads_paths = self._paths.unaligned_reads_paths - if self._args.realign is True: + if self._args.realign: self._run_realigner_and_process_alignments() self._merge_bam_files() final_unaligned_reads_paths = ( self._paths.realigned_unaligned_reads_paths) - if not self._args.crossalign_cleaning_str is None: + if self._args.crossalign_cleaning_str is not None: self._remove_crossaligned_reads() self._generate_read_alignment_stats( self._lib_names, @@ -109,12 +112,12 @@ def align_reads(self): final_unaligned_reads_paths, self._paths.read_alignments_stats_path) self._write_alignment_stat_table() - + def _remove_crossaligned_reads(self): self._string_to_species_and_sequence_ids() jobs = [] with concurrent.futures.ProcessPoolExecutor( - max_workers=self._args.processes) as executor: + max_workers=self._args.processes) as executor: for (bam_path, bam_with_crossmappings_path, bam_cleaned_tmp_path, crossmapped_reads_path) in zip( self._paths.read_alignment_bam_paths, @@ -157,10 +160,11 @@ def _string_to_species_and_sequence_ids(self): for org_and_seq_ids_str in orgs_and_seq_ids_strs: org, seq_ids_str = org_and_seq_ids_str.strip().split(":") seq_ids = [seq_id.strip() for seq_id in seq_ids_str.split(",")] - if "" in seq_ids: seq_ids.remove("") + if "" in seq_ids: + seq_ids.remove("") if len(seq_ids) < 1: self._write_err_msg_and_quit( - "Error! No sequence ID was given for the species '%s'. " + "Error! No sequence ID was given for the species '%s'. " "This does not make sense.\nYou gave the " "following input:\n%s\n" % ( org, self._args.crossalign_cleaning_str)) @@ -185,12 +189,13 @@ def _merge_bam_files(self): self._paths.primary_read_aligner_bam_paths, self._paths.read_realigner_bam_paths): bam_merger = BamMerger() - jobs.append(executor.submit(bam_merger.merge, merged_bam, - primary_aligner_bam, realigner_bam)) + jobs.append(executor.submit( + bam_merger.merge, merged_bam, + primary_aligner_bam, realigner_bam)) self._check_job_completeness(jobs) - if self._args.keep_original_alignments is False: + if not self._args.keep_original_alignments: for bam_file_list in [ - self._paths.primary_read_aligner_bam_paths, + self._paths.primary_read_aligner_bam_paths, self._paths.read_realigner_bam_paths]: for bam_file in bam_file_list: os.remove(bam_file) @@ -205,7 +210,7 @@ def _run_realigner_and_process_alignments(self): self._paths.read_realigner_bam_prefixes_paths, self._paths.read_realigner_sam_paths) self._generate_read_alignment_stats( - self._lib_names, + self._lib_names, self._paths.read_realigner_bam_paths, self._paths.realigned_unaligned_reads_paths, self._paths.read_realigner_stats_path) @@ -214,13 +219,14 @@ def _test_align_file_existance(self): """Test if the input file for the the align subcommand exist.""" if len(self._paths.get_read_files()) == 0: self._write_err_msg_and_quit("Error! No read libraries given!\n") - if len(self._ref_seq_files ) == 0: - self._write_err_msg_and_quit("Error! No reference sequence files given!\n") + if len(self._ref_seq_files) == 0: + self._write_err_msg_and_quit( + "Error! No reference sequence files given!\n") def _test_folder_existance(self, task_specific_folders): """Test the existance of required folders.""" for folder in ( - self._paths.required_base_folders() + task_specific_folders): + self._paths.required_base_folders() + task_specific_folders): if not os.path.exists(folder): self._write_err_msg_and_quit( "Error! Folder '%s' does not exist! Is the given project " @@ -228,10 +234,10 @@ def _test_folder_existance(self, task_specific_folders): def _file_needs_to_be_created(self, file_path, quiet=False): """Test if a file exists of need to be created.""" - if self._args.check_for_existing_files is False: + if not self._args.check_for_existing_files: return True if os.path.exists(file_path): - if quiet is False: + if not quiet: sys.stderr.write( "File %s exists. Skipping its generation.\n" % file_path) return False @@ -239,14 +245,14 @@ def _file_needs_to_be_created(self, file_path, quiet=False): def _prepare_reads_single_end(self): """Manage the prepartion of reads before the actual mappings.""" - raw_stat_data_writer = RawStatDataWriter(pretty=True) read_files_and_jobs = {} with concurrent.futures.ProcessPoolExecutor( - max_workers=self._args.processes) as executor: + max_workers=self._args.processes) as executor: for lib_name, read_path, processed_read_path in zip( - self._lib_names, self._paths.read_paths, + self._lib_names, self._paths.read_paths, self._paths.processed_read_paths): - if self._file_needs_to_be_created(processed_read_path) is False: + if not self._file_needs_to_be_created( + processed_read_path): continue read_processor = ReadProcessor( poly_a_clipping=self._args.poly_a_clipping, @@ -255,39 +261,39 @@ def _prepare_reads_single_end(self): min_phred_score=self._args.min_phred_score, adapter=self._args.adapter) read_files_and_jobs[lib_name] = executor.submit( - read_processor.process_single_end, read_path, + read_processor.process_single_end, read_path, processed_read_path) self._evaluet_job_and_generate_stat_file(lib_name, read_files_and_jobs) def _prepare_reads_paired_end(self): read_files_and_jobs = {} with concurrent.futures.ProcessPoolExecutor( - max_workers=self._args.processes) as executor: + max_workers=self._args.processes) as executor: for lib_name, read_path_pair, processed_read_path_pair in zip( - self._lib_names, self._paths.read_path_pairs, - self._paths.processed_read_path_pairs): + self._lib_names, self._paths.read_path_pairs, + self._paths.processed_read_path_pairs): for processed_read_path in processed_read_path_pair: - if self._file_needs_to_be_created( - processed_read_path) is False: + if not self._file_needs_to_be_created( + processed_read_path): continue read_processor = ReadProcessor( - poly_a_clipping=False, + poly_a_clipping=False, min_read_length=self._args.min_read_length, fastq=self._args.fastq, min_phred_score=self._args.min_phred_score, adapter=self._args.adapter) - read_files_and_jobs[lib_name] = executor.submit( - read_processor.process_paired_end, read_path_pair, + read_files_and_jobs[lib_name] = executor.submit( + read_processor.process_paired_end, read_path_pair, processed_read_path_pair) self._evaluet_job_and_generate_stat_file(lib_name, read_files_and_jobs) def _evaluet_job_and_generate_stat_file( - self, lib_name, read_files_and_jobs): + self, lib_name, read_files_and_jobs): raw_stat_data_writer = RawStatDataWriter(pretty=True) # Evaluate thread outcome self._check_job_completeness(read_files_and_jobs.values()) - if self._file_needs_to_be_created( - self._paths.read_processing_stats_path) is False: + if not self._file_needs_to_be_created( + self._paths.read_processing_stats_path): return # Create a dict of the read file names and the processing # counting results @@ -299,58 +305,61 @@ def _evaluet_job_and_generate_stat_file( def _align_single_end_reads(self): """Manage the actual alignment of single end reads.""" - read_aligner = ReadAligner(self._args.segemehl_bin, self._args.progress) - if self._file_needs_to_be_created(self._paths.index_path) is True: + read_aligner = ReadAligner( + self._args.segemehl_bin, self._args.progress) + if self._file_needs_to_be_created(self._paths.index_path): read_aligner.build_index( self._paths.ref_seq_paths, self._paths.index_path) for read_path, output_path, nomatch_path, bam_path in zip( - self._paths.processed_read_paths, - self._paths.primary_read_aligner_sam_paths, - self._paths.unaligned_reads_paths, - self._paths.read_alignment_bam_paths): - if self._file_needs_to_be_created(output_path) is False: + self._paths.processed_read_paths, + self._paths.primary_read_aligner_sam_paths, + self._paths.unaligned_reads_paths, + self._paths.read_alignment_bam_paths): + if not self._file_needs_to_be_created(output_path): continue - elif self._file_needs_to_be_created(bam_path) is False: + elif not self._file_needs_to_be_created(bam_path): continue read_aligner.run_alignment( - read_path, self._paths.index_path, self._paths.ref_seq_paths, + read_path, self._paths.index_path, self._paths.ref_seq_paths, output_path, nomatch_path, int(self._args.processes), - int(self._args.segemehl_accuracy), - float(self._args.segemehl_evalue), self._args.split, + int(self._args.segemehl_accuracy), + float(self._args.segemehl_evalue), self._args.split, paired_end=False) def _align_paired_end_reads(self): """Manage the actual alignemnt of paired end reads.""" - read_aligner = ReadAligner(self._args.segemehl_bin, self._args.progress) - if self._file_needs_to_be_created(self._paths.index_path) is True: + read_aligner = ReadAligner( + self._args.segemehl_bin, self._args.progress) + if self._file_needs_to_be_created(self._paths.index_path): read_aligner.build_index( self._paths.ref_seq_paths, self._paths.index_path) for read_path_pair, output_path, nomatch_path, bam_path in zip( - self._paths.processed_read_path_pairs, - self._paths.primary_read_aligner_sam_paths, - self._paths.unaligned_reads_paths, - self._paths.primary_read_aligner_bam_paths): - if self._file_needs_to_be_created(output_path) is False: + self._paths.processed_read_path_pairs, + self._paths.primary_read_aligner_sam_paths, + self._paths.unaligned_reads_paths, + self._paths.primary_read_aligner_bam_paths): + if not self._file_needs_to_be_created(output_path): continue - elif self._file_needs_to_be_created(bam_path) is False: + elif not self._file_needs_to_be_created(bam_path): continue read_aligner.run_alignment( - read_path_pair, self._paths.index_path, - self._paths.ref_seq_paths, output_path, nomatch_path, + read_path_pair, self._paths.index_path, + self._paths.ref_seq_paths, output_path, nomatch_path, int(self._args.processes), int(self._args.segemehl_accuracy), - float(self._args.segemehl_evalue), self._args.split, + float(self._args.segemehl_evalue), self._args.split, paired_end=True) def _realign_unmapped_reads(self): - read_realigner = ReadRealigner(self._args.lack_bin, self._args.progress) - for (query_fasta_path, query_sam_path, realignment_sam_path, + read_realigner = ReadRealigner( + self._args.lack_bin, self._args.progress) + for (query_fasta_path, query_sam_path, realignment_sam_path, unaligned_reads_path) in zip( self._paths.unaligned_reads_paths, self._paths.read_realigner_tmp_sam_paths, self._paths.read_realigner_sam_paths, self._paths.realigned_unaligned_reads_paths): read_realigner.run_alignment( - query_fasta_path, query_sam_path, self._paths.ref_seq_paths, + query_fasta_path, query_sam_path, self._paths.ref_seq_paths, realignment_sam_path, unaligned_reads_path, int(self._args.processes), int(self._args.segemehl_accuracy)) os.remove(query_sam_path) @@ -360,13 +369,14 @@ def _sam_to_bam(self, sam_paths, bam_prefixes_paths, bam_paths): sam_to_bam_converter = SamToBamConverter() jobs = [] with concurrent.futures.ProcessPoolExecutor( - max_workers=self._args.processes) as executor: + max_workers=self._args.processes) as executor: for sam_path, bam_prefix_path, bam_path in zip( sam_paths, bam_prefixes_paths, bam_paths): - if self._file_needs_to_be_created(bam_path) is False: + if not self._file_needs_to_be_created(bam_path): continue jobs.append(executor.submit( - sam_to_bam_converter.sam_to_bam, sam_path, bam_prefix_path)) + sam_to_bam_converter.sam_to_bam, + sam_path, bam_prefix_path)) # Evaluate thread outcome self._check_job_completeness(jobs) @@ -374,9 +384,9 @@ def _generate_sorted_tmp_sam_file(self): sam_to_bam_converter = SamToBamConverter() jobs = [] with concurrent.futures.ProcessPoolExecutor( - max_workers=self._args.processes) as executor: + max_workers=self._args.processes) as executor: for bam_path, sam_path in zip( - self._paths.primary_read_aligner_bam_paths, + self._paths.primary_read_aligner_bam_paths, self._paths.read_realigner_tmp_sam_paths): jobs.append(executor.submit( sam_to_bam_converter.bam_to_sam, bam_path, sam_path)) @@ -389,21 +399,21 @@ def _generate_read_alignment_stats( """Manage the generation of alingment statistics.""" raw_stat_data_writer = RawStatDataWriter(pretty=True) read_files_and_jobs = {} - if self._file_needs_to_be_created(output_stats_path) is False: + if not self._file_needs_to_be_created(output_stats_path): return with concurrent.futures.ProcessPoolExecutor( - max_workers=self._args.processes) as executor: + max_workers=self._args.processes) as executor: for (lib_name, read_alignment_bam_path, unaligned_reads_path) in zip( lib_names, result_bam_paths, unaligned_reads_paths): read_aligner_stats = ReadAlignerStats() - read_files_and_jobs[lib_name] = executor.submit( + read_files_and_jobs[lib_name] = executor.submit( read_aligner_stats.count, read_alignment_bam_path, unaligned_reads_path) # Evaluate thread outcome self._check_job_completeness(read_files_and_jobs.values()) read_files_and_stats = dict( - [(lib_name, job.result()) + [(lib_name, job.result()) for lib_name, job in read_files_and_jobs.items()]) raw_stat_data_writer.write(read_files_and_stats, output_stats_path) @@ -416,14 +426,15 @@ def _write_alignment_stat_table(self): self._paths.read_alignments_stats_path) realignment_stats = None primary_aligner_stats = None - if self._args.realign is True: + if self._args.realign: primary_aligner_stats = raw_stat_data_reader.read( self._paths.primary_read_aligner_stats_path) realignment_stats = raw_stat_data_reader.read( self._paths.read_realigner_stats_path) read_aligner_stats_table = ReadAlignerStatsTable( - read_processing_stats, final_alignment_stats, primary_aligner_stats, - realignment_stats, self._lib_names, + read_processing_stats, final_alignment_stats, + primary_aligner_stats, + realignment_stats, self._lib_names, self._paths.read_alignment_stats_table_path, self._args.paired_end) read_aligner_stats_table.write() @@ -451,73 +462,76 @@ def create_coverage_files(self): self._test_folder_existance(self._paths.required_coverage_folders()) raw_stat_data_reader = RawStatDataReader() alignment_stats = [raw_stat_data_reader.read( - self._paths.read_alignments_stats_path)] + self._paths.read_alignments_stats_path)] lib_names = list(alignment_stats[0].keys()) was_paired_end_alignment = self._was_paired_end_alignment(lib_names) - if was_paired_end_alignment == False: - self._paths.set_read_files_dep_file_lists_single_end ( + if not was_paired_end_alignment: + self._paths.set_read_files_dep_file_lists_single_end( self._paths.get_read_files(), lib_names) - else: - self._paths.set_read_files_dep_file_lists_paired_end ( + else: + self._paths.set_read_files_dep_file_lists_paired_end( self._paths.get_read_files(), lib_names) # Get number of aligned or number of uniquely aligned reads - if self._args.normalize_by_uniquely is False: + if not self._args.normalize_by_uniquely: aligned_counting = "no_of_aligned_reads" else: aligned_counting = "no_of_uniquely_aligned_reads" read_files_aligned_read_freq = dict([ (read_file, round(attributes["stats_total"][aligned_counting])) - for read_file, attributes in alignment_stats[0].items()]) + for read_file, attributes in alignment_stats[0].items()]) min_no_of_aligned_reads = float(min( read_files_aligned_read_freq.values())) # Run the generation of coverage in parallel jobs = [] with concurrent.futures.ProcessPoolExecutor( - max_workers=self._args.processes) as executor: + max_workers=self._args.processes) as executor: for lib_name, bam_path in zip( - lib_names, self._paths.read_alignment_bam_paths): + lib_names, self._paths.read_alignment_bam_paths): no_of_aligned_reads = float( read_files_aligned_read_freq[lib_name]) jobs.append(executor.submit( - self._create_coverage_files_for_lib, - lib_name, bam_path, no_of_aligned_reads, - min_no_of_aligned_reads)) + self._create_coverage_files_for_lib, + lib_name, bam_path, no_of_aligned_reads, + min_no_of_aligned_reads)) # Evaluate thread outcome self._check_job_completeness(jobs) def _all_coverage_file_exist( - self, lib_name, strands, no_of_aligned_reads, min_no_of_aligned_reads): + self, lib_name, strands, no_of_aligned_reads, + min_no_of_aligned_reads): """Test the existance of all coverage file of a library""" files = [] for strand in strands: files.append(self._paths.wiggle_file_raw_path(lib_name, strand)) files.append(self._paths.wiggle_file_tnoar_norm_min_path( - lib_name, strand, multi=min_no_of_aligned_reads, - div=no_of_aligned_reads)) + lib_name, strand, multi=min_no_of_aligned_reads, + div=no_of_aligned_reads)) files.append(self._paths.wiggle_file_tnoar_norm_mil_path( - lib_name, strand, multi=1000000, - div=no_of_aligned_reads)) - if any([self._file_needs_to_be_created(file, quiet=True) - for file in files]) is False: + lib_name, strand, multi=1000000, + div=no_of_aligned_reads)) + if not any([self._file_needs_to_be_created(file, quiet=True) + for file in files]): sys.stderr.write( - "The files %s exists. Skipping their generation.\n" % + "The files %s exists. Skipping their generation.\n" % ", " .join(files)) return True return False def _create_coverage_files_for_lib( - self, lib_name, bam_path, no_of_aligned_reads, min_no_of_aligned_reads): + self, lib_name, bam_path, no_of_aligned_reads, + min_no_of_aligned_reads): """Perform the coverage calculation for a given library.""" if not self._args.non_strand_specific: strands = ["forward", "reverse"] else: strands = ["forward_and_reverse"] if self._all_coverage_file_exist( - lib_name, strands, no_of_aligned_reads, min_no_of_aligned_reads): + lib_name, strands, no_of_aligned_reads, + min_no_of_aligned_reads): return read_count_splitting = True - if self._args.skip_read_count_splitting is True: + if self._args.skip_read_count_splitting: read_count_splitting = False coverage_calculator = CoverageCalculator( read_count_splitting=read_count_splitting, @@ -531,7 +545,7 @@ def _create_coverage_files_for_lib( bam_path): for strand in strands: coverage_writers_raw[strand].write_replicons_coverages( - ref_seq, coverages[strand]) + ref_seq, coverages[strand]) coverage_writers_tnoar_min_norm[ strand].write_replicons_coverages( ref_seq, coverages[strand], @@ -547,25 +561,23 @@ def _wiggle_writers(self, lib_name, strands, no_of_aligned_reads, min_no_of_aligned_reads): """Write the calculated coverages to wiggle files.""" coverage_writers_raw = dict([( - strand, WiggleWriter( - "%s_%s" % (lib_name, strand), - open(self._paths.wiggle_file_raw_path(lib_name, strand), - "w"))) for strand in strands]) + strand, WiggleWriter( + "%s_%s" % (lib_name, strand), + open(self._paths.wiggle_file_raw_path(lib_name, strand), + "w"))) for strand in strands]) coverage_writers_tnoar_min_norm = dict([( - strand, WiggleWriter( - "%s_%s" % (lib_name, strand), - open(self._paths.wiggle_file_tnoar_norm_min_path( - lib_name, strand, multi=min_no_of_aligned_reads, - div=no_of_aligned_reads), "w"))) - for strand in strands]) + strand, WiggleWriter( + "%s_%s" % (lib_name, strand), + open(self._paths.wiggle_file_tnoar_norm_min_path( + lib_name, strand, multi=min_no_of_aligned_reads, + div=no_of_aligned_reads), "w"))) for strand in strands]) coverage_writers_tnoar_mil_norm = dict([( - strand, WiggleWriter( - "%s_%s" % (lib_name, strand), - open(self._paths.wiggle_file_tnoar_norm_mil_path( - lib_name, strand, multi=1000000, - div=no_of_aligned_reads), "w"))) - for strand in strands]) - return (coverage_writers_raw, coverage_writers_tnoar_min_norm, + strand, WiggleWriter( + "%s_%s" % (lib_name, strand), + open(self._paths.wiggle_file_tnoar_norm_mil_path( + lib_name, strand, multi=1000000, + div=no_of_aligned_reads), "w"))) for strand in strands]) + return (coverage_writers_raw, coverage_writers_tnoar_min_norm, coverage_writers_tnoar_mil_norm) def _check_job_completeness(self, jobs): @@ -586,26 +598,26 @@ def quantify_gene_wise(self): norm_by_overlap_freq = False raw_stat_data_reader = RawStatDataReader() alignment_stats = [raw_stat_data_reader.read( - self._paths.read_alignments_stats_path)] + self._paths.read_alignments_stats_path)] lib_names = sorted(list(alignment_stats[0].keys())) annotation_files = self._paths.get_annotation_files() - self._paths.set_annotation_paths(annotation_files) + self._paths.set_annotation_paths(annotation_files) was_paired_end_alignment = self._was_paired_end_alignment(lib_names) - if was_paired_end_alignment == False: + if not was_paired_end_alignment: self._paths.set_read_files_dep_file_lists_single_end( self._paths.get_read_files(), lib_names) - else: + else: self._paths.set_read_files_dep_file_lists_paired_end( self._paths.get_read_files(), lib_names) jobs = [] with concurrent.futures.ProcessPoolExecutor( - max_workers=self._args.processes) as executor: + max_workers=self._args.processes) as executor: for lib_name, read_alignment_path in zip( - lib_names, self._paths.read_alignment_bam_paths): + lib_names, self._paths.read_alignment_bam_paths): jobs.append(executor.submit( - self._quantify_gene_wise, lib_name, - read_alignment_path, norm_by_alignment_freq, - norm_by_overlap_freq, annotation_files)) + self._quantify_gene_wise, lib_name, + read_alignment_path, norm_by_alignment_freq, + norm_by_overlap_freq, annotation_files)) # Evaluate thread outcome self._check_job_completeness(jobs) self._gene_quanti_create_overview( @@ -617,19 +629,21 @@ def _was_paired_end_alignment(self, lib_names): return True return False - def _quantify_gene_wise(self, lib_name, read_alignment_path, - norm_by_alignment_freq, norm_by_overlap_freq, - annotation_files): + def _quantify_gene_wise( + self, lib_name, read_alignment_path, + norm_by_alignment_freq, norm_by_overlap_freq, + annotation_files): """Perform the gene wise quantification for a given library.""" gene_quanti_paths = [ self._paths.gene_quanti_path(lib_name, annotation_file) for annotation_file in annotation_files] # Check if all output files for this library exist - if so # skip their creation - if any([self._file_needs_to_be_created(gene_quanti_path, quiet=True) - for gene_quanti_path in gene_quanti_paths]) is False: + if not any([self._file_needs_to_be_created( + gene_quanti_path, quiet=True) + for gene_quanti_path in gene_quanti_paths]): sys.stderr.write( - "The file(s) %s exist(s). Skipping their/its generation.\n" % + "The file(s) %s exist(s). Skipping their/its generation.\n" % ", " .join(gene_quanti_paths)) return gene_wise_quantification = GeneWiseQuantification( @@ -642,7 +656,7 @@ def _quantify_gene_wise(self, lib_name, read_alignment_path, gene_wise_quantification.calc_overlaps_per_alignment( read_alignment_path, self._paths.annotation_paths) for annotation_file, annotation_path in zip( - annotation_files, self._paths.annotation_paths): + annotation_files, self._paths.annotation_paths): gene_wise_quantification.quantify( read_alignment_path, annotation_path, self._paths.gene_quanti_path( @@ -663,18 +677,18 @@ def _gene_quanti_create_overview( [read_file, self._paths.gene_quanti_path( read_file, annotation_file)]) if self._file_needs_to_be_created( - self._paths.gene_wise_quanti_combined_path) is True: + self._paths.gene_wise_quanti_combined_path): gene_wise_overview.create_overview_raw_countings( path_and_name_combos, lib_names, self._paths.gene_wise_quanti_combined_path) if self._file_needs_to_be_created( - self._paths.gene_wise_quanti_combined_rpkm_path) is True: + self._paths.gene_wise_quanti_combined_rpkm_path): gene_wise_overview.create_overview_rpkm( path_and_name_combos, lib_names, self._paths.gene_wise_quanti_combined_rpkm_path, self._libs_and_total_num_of_aligned_reads()) if self._file_needs_to_be_created( - self._paths.gene_wise_quanti_combined_tnoar_path) is True: + self._paths.gene_wise_quanti_combined_tnoar_path): gene_wise_overview.create_overview_norm_by_tnoar( path_and_name_combos, lib_names, self._paths.gene_wise_quanti_combined_tnoar_path, @@ -691,21 +705,22 @@ def _libs_and_total_num_of_uniquely_aligned_reads(self): """Read the total number of reads per library.""" with open(self._paths.read_alignments_stats_path) as read_aligner_stats_fh: read_aligner_stats = json.loads(read_aligner_stats_fh.read()) - return dict([(lib, values["stats_total"]["no_of_uniquely_aligned_reads"]) + return dict([(lib, values[ + "stats_total"]["no_of_uniquely_aligned_reads"]) for lib, values in read_aligner_stats.items()]) def compare_with_deseq(self): """Manage the pairwise expression comparison with DESeq.""" self._test_folder_existance( self._paths.required_deseq_folders()) - arg_libs = [self._paths._clean_file_name(lib) for lib in + arg_libs = [self._paths._clean_file_name(lib) for lib in self._args.libs.split(",")] conditions = self._args.conditions.split(",") self._check_deseq_args(arg_libs, conditions) deseq_runner = DESeqRunner( arg_libs, conditions, self._paths.deseq_raw_folder, self._paths.deseq_extended_folder, self._paths.deseq_script_path, - self._paths.gene_wise_quanti_combined_path, + self._paths.gene_wise_quanti_combined_path, self._paths.deseq_tmp_session_info_script, self._paths.deseq_session_info, self._args.cooks_cutoff_off) @@ -725,14 +740,14 @@ def _check_deseq_args(self, arg_libs, conditions): len(conditions))) raw_stat_data_reader = RawStatDataReader() alignment_stats = [raw_stat_data_reader.read( - self._paths.read_alignments_stats_path)] + self._paths.read_alignments_stats_path)] lib_names = list(alignment_stats[0].keys()) if len(lib_names) != len(arg_libs): self._write_err_msg_and_quit( "The number of read libraries is lower or higher than " "expected. The following read libs are available: %s\nThe " "following read list string is suggested: \"%s\"\n" % ( - ", ".join(read_files), ",".join(lib_names))) + ", ".join(read_files), ",".join(lib_names))) for lib in lib_names: if lib not in arg_libs: self._write_err_msg_and_quit( @@ -761,7 +776,7 @@ def viz_gene_quanti(self): """Generate plots based on the gene-wise read countings""" from reademptionlib.vizgenequanti import GeneQuantiViz gene_quanti_viz = GeneQuantiViz( - self._paths.gene_wise_quanti_combined_path, + self._paths.gene_wise_quanti_combined_path, self._paths.get_lib_names_single_end()) gene_quanti_viz.parse_input_table() gene_quanti_viz.plot_correlations( @@ -774,7 +789,8 @@ def viz_deseq(self): from reademptionlib.vizdeseq import DESeqViz deseq_path_template = ( self._paths.deseq_raw_folder + "/deseq_comp_%s_vs_%s.csv") - deseq_viz = DESeqViz(self._paths.deseq_script_path, deseq_path_template) + deseq_viz = DESeqViz( + self._paths.deseq_script_path, deseq_path_template) deseq_viz.create_scatter_plots( self._paths.viz_deseq_scatter_plot_path) deseq_viz.create_volcano_plots( diff --git a/reademptionlib/coveragecalculator.py b/reademptionlib/coveragecalculator.py index d259780d..04eab2a6 100644 --- a/reademptionlib/coveragecalculator.py +++ b/reademptionlib/coveragecalculator.py @@ -1,5 +1,6 @@ import pysam + class CoverageCalculator(object): def __init__(self, read_count_splitting=True, uniqueley_aligned_only=False, From 57da36211fcf98674294ee22f75d7f2e354c90bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Tue, 13 Jan 2015 13:58:07 +0100 Subject: [PATCH 12/22] Cleaning --- reademptionlib/deseq.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/reademptionlib/deseq.py b/reademptionlib/deseq.py index 770cf1c3..51580a41 100644 --- a/reademptionlib/deseq.py +++ b/reademptionlib/deseq.py @@ -3,12 +3,13 @@ import os from subprocess import call + class DESeqRunner(object): def __init__( self, libs, conditions, deseq_raw_folder, deseq_extended_folder, - deseq_script_path, gene_wise_quanti_combined_path, - deseq_tmp_session_info_script, deseq_session_info, + deseq_script_path, gene_wise_quanti_combined_path, + deseq_tmp_session_info_script, deseq_session_info, cooks_cutoff_off=False): self._libs = libs self._conditions = conditions @@ -26,7 +27,7 @@ def write_session_info_file(self): tmp_r_script_fh.write("library('DESeq2')\nsessionInfo()\n") with open(self._deseq_session_info, "w") as session_info_fh: with open(os.devnull, "w") as devnull: - call(["Rscript", self._deseq_tmp_session_info_script,], + call(["Rscript", self._deseq_tmp_session_info_script], stdout=session_info_fh, stderr=devnull) os.remove(self._deseq_tmp_session_info_script) @@ -41,8 +42,8 @@ def create_deseq_script_file(self): conditions = [libs_to_conditions[lib] for lib in libs] condition_str = ", ".join(["'%s'" % cond for cond in conditions]) file_content = self._deseq_script_template() % ( - self._gene_wise_quanti_combined_path, self._first_data_column-1, - len(libs),self._first_data_column, libs_str, condition_str) + self._gene_wise_quanti_combined_path, self._first_data_column-1, + len(libs), self._first_data_column, libs_str, condition_str) file_content += self._comparison_call_strings(conditions) deseq_fh = open(self._deseq_script_path, "w") deseq_fh.write(file_content) @@ -56,25 +57,25 @@ def merge_counting_files_with_results(self): output_fh = open("%s/%s" % ( self._deseq_extended_folder, comparison_file.replace( - ".csv", "_with_annotation_and_countings.csv")), "w") + ".csv", "_with_annotation_and_countings.csv")), "w") try: deseq_result_fh = open("%s/%s" % ( self._deseq_raw_folder, comparison_file)) except: sys.stderr.write("Apparently DESeq did not generate the " - "file \"%s\". Extension stopped.\n" % + "file \"%s\". Extension stopped.\n" % comparison_file) continue for counting_file_row, comparison_file_row in zip( csv.reader(open( self._gene_wise_quanti_combined_path), delimiter="\t"), - csv.reader(deseq_result_fh, delimiter="\t")): + csv.reader(deseq_result_fh, delimiter="\t")): if comparison_file_row[0] == "baseMean": # Add another column to the header comparison_file_row = [""] + comparison_file_row # Extend column description counting_file_row[self._first_data_column:] = [ - "%s raw countings" % lib_name for lib_name in + "%s raw countings" % lib_name for lib_name in counting_file_row[self._first_data_column:]] output_fh.write("\t".join( counting_file_row + comparison_file_row[1:]) + "\n") @@ -96,7 +97,7 @@ def _comparison_call_strings(self, conditions): for index, condition_combo in enumerate(condition_combos): call_string += ("comp%s <- results(dds, contrast=" "c('condition','%s', '%s')%s)\n" % ( - index, condition_combo[0], condition_combo[1], + index, condition_combo[0], condition_combo[1], cooks_cutoff_str)) comparison_file = "deseq_comp_%s_vs_%s.csv" % ( condition_combo[0], condition_combo[1]) From 60bfd240fa7734c611fda26ceb12f837bbda93f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Tue, 13 Jan 2015 14:16:48 +0100 Subject: [PATCH 13/22] Add PCA and Heatmap generation in DESeq --- reademptionlib/controller.py | 4 +++- reademptionlib/deseq.py | 26 ++++++++++++++++++++++---- reademptionlib/paths.py | 2 ++ 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/reademptionlib/controller.py b/reademptionlib/controller.py index aa7e70ec..6d1e61f7 100644 --- a/reademptionlib/controller.py +++ b/reademptionlib/controller.py @@ -19,6 +19,7 @@ from reademptionlib.sambamconverter import SamToBamConverter from reademptionlib.wiggle import WiggleWriter + class Controller(object): """Manage the actions of the subcommands. @@ -705,7 +706,8 @@ def compare_with_deseq(self): deseq_runner = DESeqRunner( arg_libs, conditions, self._paths.deseq_raw_folder, self._paths.deseq_extended_folder, self._paths.deseq_script_path, - self._paths.gene_wise_quanti_combined_path, + self._paths.deseq_pca_heatmap_path, + self._paths.gene_wise_quanti_combined_path, self._paths.deseq_tmp_session_info_script, self._paths.deseq_session_info, self._args.cooks_cutoff_off) diff --git a/reademptionlib/deseq.py b/reademptionlib/deseq.py index 51580a41..94e33344 100644 --- a/reademptionlib/deseq.py +++ b/reademptionlib/deseq.py @@ -8,7 +8,8 @@ class DESeqRunner(object): def __init__( self, libs, conditions, deseq_raw_folder, deseq_extended_folder, - deseq_script_path, gene_wise_quanti_combined_path, + deseq_script_path, deseq_pca_heatmap_path, + gene_wise_quanti_combined_path, deseq_tmp_session_info_script, deseq_session_info, cooks_cutoff_off=False): self._libs = libs @@ -16,6 +17,7 @@ def __init__( self._deseq_raw_folder = deseq_raw_folder self._deseq_extended_folder = deseq_extended_folder self._deseq_script_path = deseq_script_path + self._deseq_pca_heatmap_path = deseq_pca_heatmap_path self._gene_wise_quanti_combined_path = gene_wise_quanti_combined_path self._deseq_tmp_session_info_script = deseq_tmp_session_info_script self._deseq_session_info = deseq_session_info @@ -43,7 +45,8 @@ def create_deseq_script_file(self): condition_str = ", ".join(["'%s'" % cond for cond in conditions]) file_content = self._deseq_script_template() % ( self._gene_wise_quanti_combined_path, self._first_data_column-1, - len(libs), self._first_data_column, libs_str, condition_str) + len(libs), self._first_data_column, libs_str, condition_str, + self._deseq_pca_heatmap_path) file_content += self._comparison_call_strings(conditions) deseq_fh = open(self._deseq_script_path, "w") deseq_fh.write(file_content) @@ -113,7 +116,8 @@ def _deseq_script_template(self): return ( "library('DESeq2')\n" "rawCountTable <- read.table('%s', skip=1, sep='\\t', " - "quote='', comment.char='', colClasses=c(rep('character',%s), rep('numeric',%s)))\n" + "quote='', comment.char='', " + "colClasses=c(rep('character',%s), rep('numeric',%s)))\n" "countTable <- round(rawCountTable[,%s:length(names(" "rawCountTable))])\n" "libs <- c(%s)\n" @@ -121,4 +125,18 @@ def _deseq_script_template(self): "samples <- data.frame(row.names=libs, condition=conds)\n" "dds <- DESeqDataSetFromMatrix(countData=countTable, " "colData=samples, design=~condition)\n" - "dds <- DESeq(dds)\n") + "dds <- DESeq(dds)\n\n" + "# PCA plot\n" + "pdf('%s')\n" + "rld <- rlog(dds)\n" + "print(plotPCA(rld, intgroup=c('condition')))\n\n" + "# Heatmap\n" + "library('RColorBrewer')\n" + "library('gplots')\n" + "distsRL <- dist(t(assay(rld)))\n" + "mat <- as.matrix(distsRL)\n" + "rownames(mat) <- with(colData(dds), " + "paste(condition, sep=' : '))\n" + "hmcol <- colorRampPalette(brewer.pal(9, 'GnBu'))(100)\n" + "heatmap.2(mat, trace='none', col = rev(hmcol), " + "margin=c(13, 13))\n") diff --git a/reademptionlib/paths.py b/reademptionlib/paths.py index 9e34e76b..8315cde1 100644 --- a/reademptionlib/paths.py +++ b/reademptionlib/paths.py @@ -124,6 +124,8 @@ def _set_static_files(self): self.align_report_folder) self.index_path = "%s/index.idx" % self.read_alignment_index_folder self.deseq_script_path = "%s/deseq.R" % self.deseq_raw_folder + self.deseq_pca_heatmap_path = "%s/sample_comparison_pca_heatmap.pdf" % ( + self.deseq_raw_folder) self.deseq_tmp_session_info_script = "%s/tmp.R" % self.deseq_raw_folder self.deseq_session_info = "%s/R_session_info.txt" % ( self.deseq_raw_folder) From 320afd78ffd68501fe84a6150deb40dd620bcace Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Tue, 13 Jan 2015 14:48:45 +0100 Subject: [PATCH 14/22] Add header lib description to DESeq output --- reademptionlib/deseq.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/reademptionlib/deseq.py b/reademptionlib/deseq.py index 94e33344..8720248d 100644 --- a/reademptionlib/deseq.py +++ b/reademptionlib/deseq.py @@ -61,6 +61,10 @@ def merge_counting_files_with_results(self): self._deseq_extended_folder, comparison_file.replace( ".csv", "_with_annotation_and_countings.csv")), "w") + output_fh.write("# Reference library (divisor): {}\n".format( + combo[1])) + output_fh.write("# Comparison library (numerator): {}\n".format( + combo[0])) try: deseq_result_fh = open("%s/%s" % ( self._deseq_raw_folder, comparison_file)) From 4a078ffbd239393732f8023da0f06188752daeab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Tue, 13 Jan 2015 14:49:12 +0100 Subject: [PATCH 15/22] Git remove comparison redundancy --- reademptionlib/deseq.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/reademptionlib/deseq.py b/reademptionlib/deseq.py index 8720248d..438ab0fa 100644 --- a/reademptionlib/deseq.py +++ b/reademptionlib/deseq.py @@ -89,8 +89,9 @@ def merge_counting_files_with_results(self): output_fh.close() def _condition_combos(self, conditions): - for cond1 in conditions[:]: - for cond2 in conditions[:]: + non_redundant_conditions = set(conditions) + for cond1 in non_redundant_conditions: + for cond2 in non_redundant_conditions: if not cond1 == cond2: yield((cond1, cond2)) From 4940c353a1859d7a64f28acd5cf8208abbe3170d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Tue, 13 Jan 2015 15:17:30 +0100 Subject: [PATCH 16/22] Reimplement MA plot with pandas --- reademptionlib/controller.py | 3 ++- reademptionlib/vizdeseq.py | 45 +++++++++++++++--------------------- 2 files changed, 21 insertions(+), 27 deletions(-) diff --git a/reademptionlib/controller.py b/reademptionlib/controller.py index 6d1e61f7..473752b7 100644 --- a/reademptionlib/controller.py +++ b/reademptionlib/controller.py @@ -776,7 +776,8 @@ def viz_deseq(self): from reademptionlib.vizdeseq import DESeqViz deseq_path_template = ( self._paths.deseq_raw_folder + "/deseq_comp_%s_vs_%s.csv") - deseq_viz = DESeqViz(self._paths.deseq_script_path, deseq_path_template) + deseq_viz = DESeqViz( + self._paths.deseq_script_path, deseq_path_template) deseq_viz.create_scatter_plots( self._paths.viz_deseq_scatter_plot_path) deseq_viz.create_volcano_plots( diff --git a/reademptionlib/vizdeseq.py b/reademptionlib/vizdeseq.py index 22ab6525..fc5533bb 100644 --- a/reademptionlib/vizdeseq.py +++ b/reademptionlib/vizdeseq.py @@ -4,6 +4,7 @@ matplotlib.use('Agg') import matplotlib.pyplot as plt from matplotlib.backends.backend_pdf import PdfPages +import pandas as pd class DESeqViz(object): @@ -30,33 +31,25 @@ def create_scatter_plots(self, plot_path): self._pp_scatterplots.close() def _create_scatter_plots(self, condition_1, condition_2): - deseq_path = self._deseq_path_template % (condition_1, condition_2) - (basemeans, log2_fold_changes, p_values, - p_adj_values) = self._parse_deseq_file(deseq_path) + pd.options.display.mpl_style = 'default' + font = {'family': 'sans-serif', 'size': 7} + matplotlib.rc('font', **font) + deseq_result = pd.read_table( + self._deseq_path_template % (condition_1, condition_2)) fig = plt.figure() - ax = fig.add_subplot(111) - ax.set_xscale('log') - cleaned_basemeans = [] - cleaned_log2_fold_changes = [] - for basemean, log2_fc in zip(basemeans, log2_fold_changes): - if log2_fc != "NA" and basemean != "NA": - cleaned_basemeans.append(basemean) - cleaned_log2_fold_changes.append(log2_fc) - y_max = max([abs(log2_fc) for log2_fc in cleaned_log2_fold_changes]) - y_min = -1*y_max - x_min = min(cleaned_basemeans) - x_max = max(cleaned_basemeans) - # # Draw lines - plt.plot([x_min, x_max], [0, 0], color="k", alpha=0.2) - # Set axis ranges - plt.axis([x_min, x_max, y_max, y_min]) - plt.title("%s vs. %s" % (condition_1, condition_2)) - sig_basemeans = [] - sig_log2_fold_changes = [] - plt.plot(sig_basemeans, sig_log2_fold_changes, "r.", alpha=0.2) - plt.plot(cleaned_basemeans, cleaned_log2_fold_changes, "k.", alpha=0.2) - plt.xlabel("Mean of normalized counts ") - plt.ylabel("Log2 fold change") + plt.plot(np.log10(deseq_result.baseMean), + deseq_result.log2FoldChange, ".", alpha=0.3) + significant_deseq_result = deseq_result[ + deseq_result.padj < self._p_value_significance_limit] + plt.plot( + np.log10(significant_deseq_result.baseMean), + significant_deseq_result.log2FoldChange, ".r", alpha=0.5) + plt.title("{} vs. {} - MA plot".format(condition_1, condition_2)) + y_max = max([abs(log2_fc) + for log2_fc in deseq_result.log2FoldChange]) + plt.ylim(-1.1 * y_max, 1.1 * y_max) + plt.xlabel("log10 base mean") + plt.ylabel("log2 fold-change") self._pp_scatterplots.savefig() def create_volcano_plots(self, volcano_plot_path, volcano_plot_adj_path): From 004a1e805ae6793827af7568afada2438d74ded3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Tue, 13 Jan 2015 15:22:56 +0100 Subject: [PATCH 17/22] Add pandas as requirement --- reademptionlib/projectcreator.py | 4 +++- setup.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/reademptionlib/projectcreator.py b/reademptionlib/projectcreator.py index 8fd68af3..56062cab 100644 --- a/reademptionlib/projectcreator.py +++ b/reademptionlib/projectcreator.py @@ -1,8 +1,9 @@ import os import sys import Bio -import pysam import matplotlib +import pandas as pd +import pysam class ProjectCreator(object): @@ -38,3 +39,4 @@ def create_version_file(self, version_file_path, version): fh.write("Biopython version: %s\n" % Bio.__version__) fh.write("pysam version: %s\n" % pysam.__version__) fh.write("matplotlib version: %s\n" % matplotlib.__version__) + fh.write("pandas version: %s\n" % pd.__version__) diff --git a/setup.py b/setup.py index 0cb116c4..488e80a4 100644 --- a/setup.py +++ b/setup.py @@ -12,9 +12,10 @@ description='A RNA-Seq Analysis Pipeline', url='', install_requires=[ - "pysam >= 0.8.1", "biopython >= 1.65", "matplotlib >= 1.4.2", + "pandas >= 0.15.2", + "pysam >= 0.8.1" ], scripts=['bin/reademption'], license='ISC License (ISCL)', From d71e41fe2f9346ff8af0c177bd23d41a01dea5f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Tue, 13 Jan 2015 15:31:18 +0100 Subject: [PATCH 18/22] Remove obsolete argument --- reademptionlib/vizdeseq.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/reademptionlib/vizdeseq.py b/reademptionlib/vizdeseq.py index fc5533bb..2d0a787e 100644 --- a/reademptionlib/vizdeseq.py +++ b/reademptionlib/vizdeseq.py @@ -8,10 +8,9 @@ class DESeqViz(object): - def __init__(self, deseq_script_path, deseq_path_template, use_antisene=True): + def __init__(self, deseq_script_path, deseq_path_template): self._deseq_script_path = deseq_script_path self._deseq_path_template = deseq_path_template - self._use_antisene = use_antisene self._basemean_column = 1 self._log_2_fold_chance_column = 2 self._p_value_column = 5 From ca4b991db01dc1b53fef7c2fa23fea24a16de3b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Tue, 13 Jan 2015 15:48:30 +0100 Subject: [PATCH 19/22] Add max p-value parameter --- bin/reademption | 6 ++++++ reademptionlib/controller.py | 3 ++- reademptionlib/vizdeseq.py | 5 +++-- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/bin/reademption b/bin/reademption index 8e2ae4d2..314ce32b 100755 --- a/bin/reademption +++ b/bin/reademption @@ -235,6 +235,12 @@ def main(): "project_path", default=".", nargs="?", help="Path of the project folder. If none is given the current " "directory is used.") + + viz_deseq_parser.add_argument( + "--max_pvalue", type=float, default="0.05", + help="Maximum adjusted p-value for genes considered to be regulated. " + "Genes with adjusted p-values below will be marked red. " + "(default 0.50)") viz_deseq_parser.set_defaults(func=viz_deseq) args = parser.parse_args() diff --git a/reademptionlib/controller.py b/reademptionlib/controller.py index 473752b7..bd749863 100644 --- a/reademptionlib/controller.py +++ b/reademptionlib/controller.py @@ -777,7 +777,8 @@ def viz_deseq(self): deseq_path_template = ( self._paths.deseq_raw_folder + "/deseq_comp_%s_vs_%s.csv") deseq_viz = DESeqViz( - self._paths.deseq_script_path, deseq_path_template) + self._paths.deseq_script_path, deseq_path_template, + max_pvalue=self._args.max_pvalue) deseq_viz.create_scatter_plots( self._paths.viz_deseq_scatter_plot_path) deseq_viz.create_volcano_plots( diff --git a/reademptionlib/vizdeseq.py b/reademptionlib/vizdeseq.py index 2d0a787e..18c1fb72 100644 --- a/reademptionlib/vizdeseq.py +++ b/reademptionlib/vizdeseq.py @@ -6,16 +6,17 @@ from matplotlib.backends.backend_pdf import PdfPages import pandas as pd + class DESeqViz(object): - def __init__(self, deseq_script_path, deseq_path_template): + def __init__(self, deseq_script_path, deseq_path_template, max_pvalue): self._deseq_script_path = deseq_script_path self._deseq_path_template = deseq_path_template self._basemean_column = 1 self._log_2_fold_chance_column = 2 self._p_value_column = 5 self._adj_p_value_column = 6 - self._p_value_significance_limit = 0.05 + self._p_value_significance_limit = max_pvalue self._log_fold_change_limit = 2 self._log_2_fold_chance_limit = 1 From 0b1cbc700769ca816a9758b0e53e5d85f62fe2a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Tue, 13 Jan 2015 15:55:16 +0100 Subject: [PATCH 20/22] Fix paramter help --- bin/reademption | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/reademption b/bin/reademption index 314ce32b..833d1828 100755 --- a/bin/reademption +++ b/bin/reademption @@ -240,7 +240,7 @@ def main(): "--max_pvalue", type=float, default="0.05", help="Maximum adjusted p-value for genes considered to be regulated. " "Genes with adjusted p-values below will be marked red. " - "(default 0.50)") + "(default 0.05)") viz_deseq_parser.set_defaults(func=viz_deseq) args = parser.parse_args() From 893372ea02b2f70365dbc81676fd74864db61960 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Tue, 13 Jan 2015 15:56:31 +0100 Subject: [PATCH 21/22] Update documentation --- docs/source/subcommands.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/subcommands.rst b/docs/source/subcommands.rst index b2093f11..daa4e42b 100644 --- a/docs/source/subcommands.rst +++ b/docs/source/subcommands.rst @@ -336,3 +336,7 @@ vs. p-values / adjusted p-values). optional arguments: -h, --help show this help message and exit + --max_pvalue MAX_PVALUE + Maximum adjusted p-value for genes considered to be + regulated. Genes with adjusted p-values below will be + marked red. (default 0.05) From cd783fcb454cf048f3cfe6ddd91ae14cd25d4a99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Konrad=20F=C3=B6rstner?= Date: Tue, 13 Jan 2015 16:39:20 +0100 Subject: [PATCH 22/22] Set version to 0.3.5 --- CHANGELOG.txt | 6 ++++++ bin/reademption | 2 +- docs/source/conf.py | 2 +- setup.py | 2 +- 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 75ce1c3e..7ccfcc00 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -1,3 +1,9 @@ +v0.3.5 (2015-01-13) +- Update some pysam related function due to changes in API changes of pysam +- Add further requirements (matplotlib, Biopython, pandas) +- Reimplement MA plot generation +- Add strand-unspecific coverage calculation +- Several small code improvements v0.3.4 (2014-08-23) - Fix bug with paired-end suffix - Fix bug with paired-end mapping stats table diff --git a/bin/reademption b/bin/reademption index 833d1828..d8aadb1c 100755 --- a/bin/reademption +++ b/bin/reademption @@ -9,7 +9,7 @@ __author__ = "Konrad Foerstner " __copyright__ = "2011-2014 by Konrad Foerstner " __license__ = "ISC license" __email__ = "konrad@foerstner.org" -__version__ = "0.3.4" +__version__ = "0.3.5" def main(): parser = argparse.ArgumentParser() diff --git a/docs/source/conf.py b/docs/source/conf.py index e73188d6..fdc28710 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -50,7 +50,7 @@ # The short X.Y version. version = '0.3' # The full version, including alpha/beta/rc tags. -release = '0.3.4' +release = '0.3.5' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/setup.py b/setup.py index 488e80a4..eeb4a2a4 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='READemption', - version='0.3.4', + version='0.3.5', packages=['reademptionlib', 'tests'], author='Konrad U. Förstner', author_email='konrad@foerstner.org',