From 9a2cbada0ddc096575e585bb2f2ee362606ebdf0 Mon Sep 17 00:00:00 2001 From: Marcus Stoiber Date: Tue, 20 Mar 2018 12:44:04 -0700 Subject: [PATCH] Minor fixes for some python2 bugs. Addressed #42. Fixes #41. --- tombo/_version.py | 2 +- tombo/plot_commands.py | 50 ++++++++++++++---------------------------- tombo/resquiggle.py | 3 +-- tombo/tombo_helper.py | 21 ++++++++++++++++++ tombo/tombo_stats.py | 6 +++-- 5 files changed, 43 insertions(+), 39 deletions(-) diff --git a/tombo/_version.py b/tombo/_version.py index fcc1204..9885504 100644 --- a/tombo/_version.py +++ b/tombo/_version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -TOMBO_VERSION = '1.2.1' +TOMBO_VERSION = '1.2.1b' diff --git a/tombo/plot_commands.py b/tombo/plot_commands.py index 5ca94e9..4160f46 100644 --- a/tombo/plot_commands.py +++ b/tombo/plot_commands.py @@ -203,6 +203,12 @@ def plot_per_read_roc( 'stat':r.FloatVector(unzip_stats[0]), 'motif_match':r.BoolVector(unzip_stats[1])}) + # python2 rpy2 ListVector can't take unicode keys + if sys.version_info[0] < 3: + conv_all_motif_stats_for_r = {} + for k, v in all_motif_stats_for_r.items(): + conv_all_motif_stats_for_r[k.encode()] = v + all_motif_stats_for_r = conv_all_motif_stats_for_r all_motif_stats_for_r = r.ListVector(all_motif_stats_for_r) if VERBOSE: sys.stderr.write('Computing accuracy statistics.\n') @@ -889,7 +895,7 @@ def plot_corrections( def plot_multi_corrections( f5_dirs1, corrected_group, basecall_subgroups, pdf_fn, num_reads_per_plot, num_regions, num_obs, include_orig_bcs, - genome_locations): + genome_locs): th._warning_message('The plot_multi_correction command may be deprecated ' + 'in future versions of Tombo.') num_regions = num_regions if num_regions % 2 == 0 else \ @@ -898,7 +904,7 @@ def plot_multi_corrections( f5_dirs1, corrected_group, basecall_subgroups) read_coverage = th.get_coverage(raw_read_coverage) - if genome_locations is None: + if genome_locs is None: coverage_regions = [] for (chrm, strand), cs_coverage in read_coverage.items(): reg_covs, reg_lens = zip(*[ @@ -923,16 +929,7 @@ def plot_multi_corrections( 'number of reads than requested.') else: if VERBOSE: sys.stderr.write('Parsing genome locations.\n') - parsed_locations = [] - for chrm_pos_strand in genome_locations: - split_vals = chrm_pos_strand.replace('"', '').replace( - "'", "").split(':')[:3] - # default to plus strand if not specified - if len(split_vals) == 2: - parsed_locations.append(( - split_vals[0], split_vals[1], '+')) - else: - parsed_locations.append(split_vals) + parsed_locs = th.parse_genome_locations(genome_locs, default_strand='+') plot_locs = [ ('{:03d}'.format(i), (chrm, int(pos) - 1, strand)) for i, (chrm, pos, strand) in enumerate(parsed_locations)] @@ -1480,22 +1477,14 @@ def plot_max_coverage( def plot_genome_locations( f5_dirs1, corrected_group, basecall_subgroups, pdf_fn, f5_dirs2, num_bases, overplot_thresh, overplot_type, - genome_locations, tb_model_fn, alt_model_fn, plot_default_stnd, + genome_locs, tb_model_fn, alt_model_fn, plot_default_stnd, plot_default_alt): if VERBOSE: sys.stderr.write('Parsing genome locations.\n') - # ignore strand for genome location plotting - genome_locations = [ - chrm_pos.replace('"', '').replace("'", "").split(':')[:3] - for chrm_pos in genome_locations] # minus one here as all python internal coords are 0-based, but # genome is generally 1-based plot_intervals = [] - for i, chrm_pos_strand in enumerate(genome_locations): - if len(chrm_pos_strand) == 2: - chrm, pos = chrm_pos_strand - strand = None - else: - chrm, pos, strand = chrm_pos_strand + for i, (chrm, pos, strand) in enumerate( + th.parse_genome_locations(genome_locs)): int_start = max( 0, int(int(pos) - np.floor(num_bases / 2.0) - 1)) plot_intervals.append(th.intervalData( @@ -1527,19 +1516,12 @@ def plot_genome_locations( def plot_per_read_mods_genome_location( f5_dirs, corrected_group, basecall_subgroups, pdf_fn, - per_read_stats_fn, genome_locations, num_bases, num_reads, box_center, + per_read_stats_fn, genome_locs, num_bases, num_reads, box_center, fasta_fn): if VERBOSE: sys.stderr.write('Parsing genome locations.\n') - genome_locations = [ - chrm_pos.replace('"', '').replace("'", "").split(':')[:3] - for chrm_pos in genome_locations] plot_intervals = [] - for i, chrm_pos_strand in enumerate(genome_locations): - if len(chrm_pos_strand) == 2: - chrm, pos = chrm_pos_strand - strand = '+' - else: - chrm, pos, strand = chrm_pos_strand + for i, (chrm, pos, strand) in enumerate(th.parse_genome_locations( + genome_locs, default_strand='+')): int_start = max( 0, int(int(pos) - np.floor(num_bases / 2.0) - 1) + 1) plot_intervals.append(th.intervalData( @@ -2146,7 +2128,7 @@ def plot_main(args): if 'num_obs' in args else None),] nread_opt = [('num_reads', args.num_reads if 'num_reads' in args else None),] - glocs_opt = [('genome_locations', args.genome_locations + glocs_opt = [('genome_locs', args.genome_locations if 'genome_locations' in args else None),] f5dirs2_opt = [('f5_dirs2', args.control_fast5_basedirs if 'control_fast5_basedirs' in args else None),] diff --git a/tombo/resquiggle.py b/tombo/resquiggle.py index 9e8f924..85b761b 100644 --- a/tombo/resquiggle.py +++ b/tombo/resquiggle.py @@ -1128,8 +1128,7 @@ def resquiggle_all_reads( rsqgl_args = ( proc_rsqgl_conns, std_ref, outlier_thresh, corr_grp, bio_samp_type, seg_params, sig_aln_params, obs_filter, index_q is None, const_scale) - rsqgl_process = Process(target=_resquiggle_worker, args=rsqgl_args, - daemon=True) + rsqgl_process = Process(target=_resquiggle_worker, args=rsqgl_args) rsqgl_process.start() # now open mapping thread for each map connection created above diff --git a/tombo/tombo_helper.py b/tombo/tombo_helper.py index f8e3f48..38bfb54 100644 --- a/tombo/tombo_helper.py +++ b/tombo/tombo_helper.py @@ -128,6 +128,27 @@ def get_chrm_sizes(raw_read_coverage, raw_read_coverage2=None): for chrm, strnd_sizes in strand_chrm_sizes.items()) +def parse_genome_locations(genome_locs, default_strand=None): + parsed_locs = [] + for chrm_pos_strand in genome_locs: + # strip off any quotes and return up to the first 3 values + split_vals = chrm_pos_strand.replace('"', '').replace( + "'", "").split(':')[:3] + # default to plus strand if not specified + if len(split_vals) == 1: + _error_message_and_exit( + 'Invalid genome location provided: ' + chrm_pos_strand + + '\n\t\tTry adding quotation marks around specified genome ' + + 'locations (especially for sequence identifiers with ' + + 'special characters).') + elif len(split_vals) == 2: + parsed_locs.append(( + split_vals[0], split_vals[1], default_strand)) + else: + parsed_locs.append(split_vals) + + return parsed_locs + class TomboMotif(object): def _parse_motif(self, rev_comp_motif=False): """ diff --git a/tombo/tombo_stats.py b/tombo/tombo_stats.py index ca1c71d..efba36b 100644 --- a/tombo/tombo_stats.py +++ b/tombo/tombo_stats.py @@ -1276,7 +1276,8 @@ def calc_damp_fraction(self, cov_damp_counts): # on the fraction estimation as a binomial variable) damp_frac = (non_mod_counts + cov_damp_counts[0]) / ( self.stats['valid_cov'] + sum(cov_damp_counts)) - self.stats = append_fields(self.stats, 'damp_frac', damp_frac) + damp_name = 'damp_frac' if sys.version_info[0] > 2 else b'damp_frac' + self.stats = append_fields(self.stats, damp_name, damp_frac) return @@ -2018,7 +2019,8 @@ def compute_read_stats( ctrl_cov = [ctrl_cov[pos] if pos in ctrl_cov else 0 for pos in reg_poss] else: - ctrl_cov = repeat(0, reg_poss.shape[0]) + # convert to list since python2 repeat objects can't be pickled + ctrl_cov = list(repeat(0, reg_poss.shape[0])) return reg_base_stats, us_reg_poss, reg_cov, ctrl_cov, valid_cov