From 9743b4fe18b079e4c7fe38353c6258410a2df1ee Mon Sep 17 00:00:00 2001
From: Sourcery AI <>
Date: Mon, 4 Dec 2023 23:01:53 +0000
Subject: [PATCH] 'Refactored by Sourcery'

---
 .../python/driver/analyze_gcode_parameters.py |  22 --
 code/python/driver/build_gcode_features.py    |  38 ++--
 code/python/driver/build_mgm_models.py        |   6 +-
 .../build_mgm_models_from_gms2_models.py      |  82 ++++----
 ...d_mgm_models_from_gms2_models_curr_best.py | 199 ++++++------------
 code/python/driver/collect_gms2_models.py     |   4 +-
 .../driver/compare_clustering_algorithms.py   |  20 +-
 .../compare_metagenome_and_genome_tools.py    |   9 +-
 code/python/driver/compare_msa_to_16s.py      |  12 +-
 .../driver/compare_multiple_mgm2_models.py    |  14 +-
 code/python/driver/compute_gcode_accuracy.py  |  38 ++--
 code/python/driver/independent_predictions.py |   4 +-
 code/python/driver/run-pbs-job.py             |   6 +-
 code/python/driver/run_mgm_on_genome_list.py  |  41 ++--
 code/python/driver/run_tool_on_chunks.py      | 128 ++++-------
 code/python/driver/run_tool_on_genome_list.py |  31 ++-
 code/python/driver/run_tools_on_chunks.py     |  10 +-
 code/python/driver/stats_per_gene.py          |  36 ++--
 code/python/driver/stats_per_gene_on_chunk.py | 133 ++++++------
 .../driver/stats_per_gene_on_chunk_revised.py | 189 ++++++++---------
 code/python/driver/stats_tools_on_chunks.py   |   2 +-
 .../driver/test_component_on_verified.py      |  13 +-
 .../test_gms2_components_on_verified_set.py   |  24 +--
 ...st_rbs_built_from_most_conserved_motifs.py |   4 +-
 .../driver/test_start_codon_perturbations.py  |  30 +--
 code/python/driver/update_model_from_model.py |   8 +-
 code/python/driver/viz_gms2_model.py          |   4 +-
 code/python/driver/viz_gms2_models_over_gc.py |  38 ++--
 .../viz_group_distribution_across_gc.py       |  23 +-
 code/python/driver/viz_mgm_model.py           |  66 +++---
 code/python/driver/viz_motifs_across_gc.py    |  14 +-
 code/python/driver/viz_stats_per_gene.py      |   4 +-
 .../driver/viz_stats_per_gene_on_chunks.py    |  34 +--
 .../viz_stats_per_gene_on_chunks_large.py     |  55 ++---
 .../viz_stats_per_gene_with_reference.py      |  28 ++-
 code/python/lib/mg_bio/general.py             |   5 +-
 code/python/lib/mg_container/genome_list.py   |   2 +-
 code/python/lib/mg_container/gms2_mod.py      |  14 +-
 code/python/lib/mg_container/mgm_model.py     |  18 +-
 code/python/lib/mg_container/msa.py           |  55 +++--
 code/python/lib/mg_container/shelf.py         |  23 +-
 code/python/lib/mg_container/taxonomy_tree.py |  44 ++--
 code/python/lib/mg_general/general.py         |  14 +-
 code/python/lib/mg_general/genome_splitter.py |   4 +-
 code/python/lib/mg_general/labels.py          | 102 ++++-----
 .../mg_general/labels_comparison_detailed.py  |  33 ++-
 code/python/lib/mg_general/shelf.py           |  49 +++--
 code/python/lib/mg_io/general.py              |   7 +-
 code/python/lib/mg_io/labels.py               |  16 +-
 code/python/lib/mg_io/shelf.py                |   7 +-
 code/python/lib/mg_models/building.py         |  36 ++--
 code/python/lib/mg_models/mgm_motif_model.py  |  21 +-
 .../lib/mg_models/mgm_motif_model_v2.py       |  23 +-
 code/python/lib/mg_models/motif_model.py      |  37 ++--
 code/python/lib/mg_models/shelf.py            | 101 ++++-----
 code/python/lib/mg_options/options.py         |  13 +-
 .../mg_parallelization/generic_threading.py   |  22 +-
 code/python/lib/mg_parallelization/pbs.py     | 140 +++++-------
 code/python/lib/mg_pbs_data/mergers.py        |  22 +-
 code/python/lib/mg_pbs_data/splitters.py      |  66 ++----
 code/python/lib/mg_stats/shelf.py             |  31 +--
 code/python/lib/mg_stats/small.py             |  10 +-
 code/python/lib/mg_viz/colormap.py            |  15 +-
 code/python/lib/mg_viz/general.py             |   8 +-
 code/python/lib/mg_viz/mgm_motif_model.py     |  15 +-
 code/python/lib/mg_viz/mgm_motif_model_v2.py  |   6 +-
 code/python/lib/mg_viz/shelf.py               |   2 +-
 code/python/lib/mg_viz/stats_large.py         |  32 +--
 code/python/lib/mg_viz/stats_small.py         |   2 +-
 69 files changed, 929 insertions(+), 1435 deletions(-)

diff --git a/code/python/driver/analyze_gcode_parameters.py b/code/python/driver/analyze_gcode_parameters.py
index 24b4036..8a558e0 100644
--- a/code/python/driver/analyze_gcode_parameters.py
+++ b/code/python/driver/analyze_gcode_parameters.py
@@ -89,28 +89,6 @@ def main(env, args):
     #     plt.show()
     return
 
-    fig = plt.figure()
-    ax = fig.gca(projection='3d')
-    ax.plot_trisurf(df['Chunk Size'], df['p4,p11'], df['Match Rate'], linewidth=0.2)
-    ax.set_xlabel("Chunk Size")
-    ax.set_ylabel("Parameters")
-    ax.set_zlabel("Match Rate")
-    plt.show()
-
-    df2 = df[df["Tool"] == "mgm2"].groupby(["p4", "p11"], as_index=False).mean()
-
-    idx = df2["Match Rate"].argmax()
-    p4 = df2.at[idx, "p4"]
-    p11 = df2.at[idx, "p11"]
-    df_best = df[(df["p4"] == p4) & (df["p11"] == p11)]
-    df_alex = df[(df["p4"] == 10) & (df["p11"] == 20)]
-    fig, ax = plt.subplots()
-    sns.lineplot("Chunk Size", "Match Rate", data=df_best, label="Optimized")
-    sns.lineplot("Chunk Size", "Match Rate", data=df[df["Tool"] == "mprodigal"], label="MProdigal")
-    sns.lineplot("Chunk Size", "Match Rate", data=df_alex, label="Original")
-    ax.set_ylim(0, 1)
-    plt.show()
-
diff --git a/code/python/driver/build_gcode_features.py b/code/python/driver/build_gcode_features.py
index 4503530..0d8a878 100644
--- a/code/python/driver/build_gcode_features.py
+++ b/code/python/driver/build_gcode_features.py
@@ -87,12 +87,11 @@ def get_features_from_prediction(tool, pf_prediction, gcode_true, tag):
     labels_per_seqname = dict()
     for lab in labels:
         if lab.seqname() not in labels_per_seqname:
-            labels_per_seqname[lab.seqname()] = list()
+            labels_per_seqname[lab.seqname()] = []
         labels_per_seqname[lab.seqname()].append(lab)
 
-    counter = 0
-    for seqname in labels_per_seqname:
+    for counter, (seqname, value) in enumerate(labels_per_seqname.items()):
         entries[seqname] = dict()
 
         total_score = 0
@@ -100,7 +99,7 @@
         avg_gc = 0
         num_genes = 0
 
-        for lab in labels_per_seqname[seqname]:
+        for lab in value:
             score = lab.get_attribute_value("score")
             try:
                 score = float(score)
@@ -117,17 +116,16 @@
 
             avg_gene_length += abs(lab.right() - lab.left() + 1)
 
-        avg_gene_length /= num_genes if num_genes > 0 else 0
-        avg_gc /= num_genes if num_genes > 0 else 0
+        avg_gene_length /= max(num_genes, 1)
+        avg_gc /= max(num_genes, 1)
         entries[seqname] = {
             f"{tag}: Total Score": total_score,
             f"{tag}: Average Gene Length": avg_gene_length,
             f"{tag}: Average Gene GC": avg_gc,
             f"{tag}: Number of Genes": num_genes
         }
-        counter += 1
-        # if counter > 5:
-        #     break
+        # if counter > 5:
+        #     break
 
     return entries
@@ -184,9 +182,6 @@ def build_gcode_features_for_gi_for_chunk(env, gi, tool, chunk, **kwargs):
     pf_chunks = mkstemp_closed(dir=env["pd-work"], suffix=".fasta")
     gs.write_to_file(pf_chunks)
 
-    list_entries = list()
-
-
     pd_run = os_join(env["pd-work"], gi.name, f"{dn_prefix}{dn}_{chunk}")
     mkdir_p(pd_run)
 
@@ -195,8 +190,7 @@
         gcode_true=gcode_true, **kwargs)
 
     results["Genome"] = gi.name
-    list_entries.append(results)
-
+    list_entries = [results]
     remove_p(pf_prediction)
     remove_p(pf_chunks)
@@ -205,7 +199,7 @@
 
 def build_gcode_features_for_gi(env, gi, tool, chunks, **kwargs):
     # type: (Environment, GenomeInfo, str, List[int], Dict[str, Any]) -> pd.DataFrame
-    list_df = list()
+    list_df = []
 
     num_processors = get_value(kwargs, "num_processors", 1, valid_type=int)
 
     if num_processors > 1:
@@ -217,7 +211,7 @@
         )
 
     else:
-        list_df = list()
+        list_df = []
         for chunk in chunks:
             logger.debug(f"{gi.name};{chunk}")
             curr = build_gcode_features_for_gi_for_chunk(env, gi, tool, chunk, **kwargs)
@@ -227,14 +221,10 @@
 
 def build_gcode_features(env, gil, tool, chunks, **kwargs):
-    # type: (Environment, GenomeInfoList, str, List[int], Dict[str, Any]) -> pd.DataFrame
-    list_df = list()
-
-    for gi in gil:
-        list_df.append(
-            build_gcode_features_for_gi(env, gi, tool, chunks, **kwargs)
-        )
-
+    list_df = [
+        build_gcode_features_for_gi(env, gi, tool, chunks, **kwargs)
+        for gi in gil
+    ]
     return pd.concat(list_df, ignore_index=True, sort=False)
diff --git a/code/python/driver/build_mgm_models.py b/code/python/driver/build_mgm_models.py
index db18cad..2635813 100644
--- a/code/python/driver/build_mgm_models.py
+++ b/code/python/driver/build_mgm_models.py
@@ -142,7 +142,7 @@ def build_mgm_motif_models_for_all_gc(env, df, name, **kwargs):
     binned_dfs = bin_by_gc(df, step=bin_size)
 
     # for each binned dataframe, build specific model
-    list_mgm_models = list()  # type: List[Tuple[float, float, MGMMotifModel]]
+    list_mgm_models = []
     for info in binned_dfs:
         lower, upper, df_gc = info
@@ -158,7 +158,7 @@
 
         if mgm_mm is None:
             # use previous model
-            if len(list_mgm_models) > 0:
+            if list_mgm_models:
                 prev = list_mgm_models[-1][2]
                 list_mgm_models.append([lower, upper, prev])
             else:
@@ -190,7 +190,7 @@ def build_mgm_models(env, df, pf_output):
     }
 
     name_to_models = dict()  # type: Dict[str, Dict[str, Dict[str, MGMMotifModelAllGC]]]
-    for species_type in type_model_group.keys():
+    for species_type in type_model_group:
         name_to_models[species_type] = dict()  # type: Dict[str, Dict[str, MGMMotifModelAllGC]]
         for name in type_model_group[species_type].keys():
             name_to_models[species_type][name] = dict()
diff --git a/code/python/driver/build_mgm_models_from_gms2_models.py b/code/python/driver/build_mgm_models_from_gms2_models.py
index 0364e11..aa52bc4 100644
--- a/code/python/driver/build_mgm_models_from_gms2_models.py
+++ b/code/python/driver/build_mgm_models_from_gms2_models.py
@@ -127,7 +127,7 @@ def get_loess(local_x, local_y):
 
 def visualize_start_codons(env, viz_collector):
     # type: (Environment, Dict[str, Dict[str, Dict[str, Any]]]) -> None
-    list_entries = list()
+    list_entries = []
 
     for genome_type in viz_collector:
         for group in viz_collector[genome_type]:
@@ -139,15 +139,19 @@
                 y = vals["y"]
                 y_fit = vals["y_fit"]
 
-                for i in range(len(x)):
-                    list_entries.append({
+                list_entries.extend(
+                    {
                         "Genome Type": genome_type,
-                        "Group": group if genome_type == "Bacteria" else f"A*,D*",
+                        "Group": group
+                        if genome_type == "Bacteria"
+                        else "A*,D*",
                         "Codon": codon,
                         "x": x[i],
                         "y": y[i],
-                        "y_fit": y_fit[i]
-                    })
+                        "y_fit": y_fit[i],
+                    }
+                    for i in range(len(x))
+                )
 
         if genome_type == "Archaea":
             break
@@ -198,7 +202,7 @@ def add_codon_probabilities(env, df, mgm, codons, gms2_group, **kwargs):
 
     df = df[df["Type"] == genome_type].copy()
 
-    list_entries = list()
+    list_entries = []
     fig, ax = plt.subplots()
 
     # values_per_codon = dict()
@@ -302,8 +306,8 @@ def add_stop_codon_probabilities(env, df, mgm, **kwargs):
 
 def compute_bin_averages(x, y, x_min, x_max, x_step):
     # type: (List[float], List[float], float, float, float) -> [List[float], List[float]]
-    x_out = list()
-    y_out = list()
+    x_out = []
+    y_out = []
 
     current = 0
     for x_tag in np.arange(x_min, x_max, x_step):
@@ -319,7 +323,7 @@
                 total += 1
             current += 1
 
-        if total == 0 and len(y_out) == 0:
+        if total == 0 and not y_out:
             continue
         avg = y_out[-1] if total == 0 else acc / float(total)
         x_out.append(x_tag)
@@ -422,7 +426,7 @@ def add_start_context_probabilities(env, df, mgm, input_tag, output_tag, **kwarg
     # add gc models to mgm
     # for genome_tag in ["A", "B"]:  # genome_type[0] FIXME
     genome_tag = genome_type[0]
-    for gc_tag in sc_gc.keys():
+    for gc_tag in sc_gc:
         mgm.items_by_species_and_gc[genome_tag][str(gc_tag)].items[output_tag + "_MAT"] = sc_gc[gc_tag]
         mgm.items_by_species_and_gc[genome_tag][str(gc_tag)].items[f"{output_tag}"] = 1
         mgm.items_by_species_and_gc[genome_tag][str(gc_tag)].items[f"{output_tag}_ORDER"] = 2
@@ -515,7 +519,7 @@ def build_mgm_motif_models_for_all_gc(env, df, name, **kwargs):
     binned_dfs = bin_by_gc(df, step=bin_size, gc_feature=gc_feature)
 
     # for each binned dataframe, build specific model
-    list_mgm_models = list()  # type: List[List[float, float, MGMMotifModelV2]]
+    list_mgm_models = []
     for info in binned_dfs:
         lower, upper, df_gc = info
         #
@@ -528,7 +532,7 @@
 
         if mgm_mm is None:
             # use previous model
-            if len(list_mgm_models) > 0:
+            if list_mgm_models:
                 prev = list_mgm_models[-1][2]
                 list_mgm_models.append([lower, upper, prev])
             else:
@@ -560,37 +564,24 @@ def add_motif_probabilities(env, df, mgm, input_tag, output_tag, genome_type, **
 
         motif = motif_by_gc.get_model_by_gc(gc)
 
-        if True or "RBS" in output_tag:
-            # create a label for each shift
-            for shift, prob in motif._shift_prior.items():
-                prob /= 100.0
-                output_tag_ws = f"{output_tag}_{int(shift)}"
-                try:
-                    mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_MAT"] = motif._motif[shift]
-                    mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_POS_DISTR"] = \
-                        motif._spacer[
-                            shift]
-                except KeyError:
-                    pass
-
-                mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}"] = 1
-                mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_ORDER"] = 0
-                mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_WIDTH"] = width
-                mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_MARGIN"] = 0
-                mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_MAX_DUR"] = dur
-                mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_SHIFT"] = prob
-        else:
-            # promoter aren't shifted (for now)
-            best_shift = max(motif._shift_prior.items(), key=operator.itemgetter(1))[0]
-            mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag}_MAT"] = motif._motif[best_shift]
-            mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag}_POS_DISTR"] = motif._spacer[
-                best_shift]
-
-            mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag}"] = 1
-            mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag}_ORDER"] = 0
-            mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag}_WIDTH"] = width
-            mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag}_MARGIN"] = 0
-            mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag}_MAX_DUR"] = dur
+        # create a label for each shift
+        for shift, prob in motif._shift_prior.items():
+            prob /= 100.0
+            output_tag_ws = f"{output_tag}_{int(shift)}"
+            try:
+                mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_MAT"] = motif._motif[shift]
+                mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_POS_DISTR"] = \
+                    motif._spacer[
+                        shift]
+            except KeyError:
+                pass
+
+            mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}"] = 1
+            mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_ORDER"] = 0
+            mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_WIDTH"] = width
+            mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_MARGIN"] = 0
+            mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_MAX_DUR"] = dur
+            mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_SHIFT"] = prob
 
 
 def _build_start_or_stop_codons(env, df, mgm, genome_type, codons, **kwargs):
@@ -666,9 +657,9 @@ def _build_motifs(env, df, mgm, genome_type, tag, **kwargs):
     learn_from_component = learn_from[tag]
 
     # get for component
+    df_type = df[df["Type"] == genome_type]
     if genome_type == "Archaea":
-        df_type = df[df["Type"] == genome_type]
         for o, l in learn_from_component[genome_type].items():
             if "PROMOTER" in tag and o != "D":
                 continue  # promoters are only in group D
@@ -681,7 +672,6 @@
             )
 
     else:
-        df_type = df[df["Type"] == genome_type]
         for o, l in learn_from_component[genome_type].items():
             if "PROMOTER" in tag and o != "C":
                 continue  # promoters are only in group C
diff --git a/code/python/driver/build_mgm_models_from_gms2_models_curr_best.py b/code/python/driver/build_mgm_models_from_gms2_models_curr_best.py
index f274fdb..c1d428c 100644
--- a/code/python/driver/build_mgm_models_from_gms2_models_curr_best.py
+++ b/code/python/driver/build_mgm_models_from_gms2_models_curr_best.py
@@ -71,7 +71,7 @@ def get_loess(local_x, local_y):
 
 def visualize_start_codons(env, viz_collector):
     # type: (Environment, Dict[str, Dict[str, Dict[str, Any]]]) -> None
-    list_entries = list()
+    list_entries = []
 
     for genome_type in viz_collector:
         for group in viz_collector[genome_type]:
@@ -83,17 +83,19 @@
                 y = vals["y"]
                 y_fit = vals["y_fit"]
 
-                for i in range(len(x)):
-
-                    list_entries.append({
+                list_entries.extend(
+                    {
                         "Genome Type": genome_type,
-                        "Group": group if genome_type == "Bacteria" else f"{group}*",
+                        "Group": group
+                        if genome_type == "Bacteria"
+                        else f"{group}*",
                         "Codon": codon,
                         "x": x[i],
                         "y": y[i],
-                        "y_fit": y_fit[i]
-                    })
-
+                        "y_fit": y_fit[i],
+                    }
+                    for i in range(len(x))
+                )
     df = pd.DataFrame(list_entries)
     g = seaborn.FacetGrid(df, col="Codon", hue="Group")
     g.map(plt.scatter, "x", "y", alpha=.3, s=2)
@@ -127,7 +129,7 @@ def add_codon_probabilities(env, df, mgm, codons, gms2_group, **kwargs):
 
     df = df[df["Type"] == genome_type].copy()
 
-    list_entries = list()
+    list_entries = []
     fig, ax = plt.subplots()
 
     # values_per_codon = dict()
@@ -232,8 +234,8 @@ def add_stop_codon_probabilities(env, df, mgm, **kwargs):
 
 def compute_bin_averages(x, y, x_min, x_max, x_step):
     # type: (List[float], List[float], float, float, float) -> [List[float], List[float]]
-    x_out = list()
-    y_out = list()
+    x_out = []
+    y_out = []
 
     current = 0
     for x_tag in np.arange(x_min, x_max, x_step):
@@ -250,7 +252,7 @@
                 total += 1
             current += 1
 
-        if total == 0 and len(y_out) == 0:
+        if total == 0 and not y_out:
             continue
         avg = y_out[-1] if total == 0 else acc / float(total)
         x_out.append(x_tag)
@@ -349,7 +351,7 @@ def add_start_context_probabilities(df, mgm, input_tag, output_tag, **kwargs):
     # add gc models to mgm
     # for genome_tag in ["A", "B"]:  # genome_type[0] FIXME
     genome_tag = genome_type[0]
-    for gc_tag in sc_gc.keys():
+    for gc_tag in sc_gc:
         mgm.items_by_species_and_gc[genome_tag][str(gc_tag)].items[output_tag + "_MAT"] = sc_gc[gc_tag]
         mgm.items_by_species_and_gc[genome_tag][str(gc_tag)].items[f"{output_tag}"] = 1
         mgm.items_by_species_and_gc[genome_tag][str(gc_tag)].items[f"{output_tag}_ORDER"] = 2
@@ -445,7 +447,7 @@ def build_mgm_motif_models_for_all_gc(env, df, name, **kwargs):
     binned_dfs = bin_by_gc(df, step=bin_size, gc_feature=gc_feature)
 
     # for each binned dataframe, build specific model
-    list_mgm_models = list()  # type: List[List[float, float, MGMMotifModelV2]]
+    list_mgm_models = []
     for info in binned_dfs:
         lower, upper, df_gc = info
         #
@@ -461,7 +463,7 @@
 
         if mgm_mm is None:
             # use previous model
-            if len(list_mgm_models) > 0:
+            if list_mgm_models:
                 prev = list_mgm_models[-1][2]
                 list_mgm_models.append([lower, upper, prev])
             else:
@@ -490,35 +492,23 @@ def add_motif_probabilities(env, df, mgm, input_tag, output_tag, genome_type, **
 
         motif = motif_by_gc.get_model_by_gc(gc)
 
-        if True or "RBS" in output_tag:
-            # create a label for each shift
-            for shift, prob in motif._shift_prior.items():
-                prob /= 100.0
-                output_tag_ws = f"{output_tag}_{int(shift)}"
-                try:
-                    mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_MAT"] = motif._motif[shift]
-                    mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_POS_DISTR"] = motif._spacer[
-                        shift]
-                except KeyError:
-                    pass
-
-                mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}"] = 1
-                mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_ORDER"] = 0
-                mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_WIDTH"] = width
-                mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_MARGIN"] = 0
-                mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_MAX_DUR"] = dur
-                mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_SHIFT"] = prob
-        else:
-            # promoter aren't shifted (for now)
-            best_shift = max(motif._shift_prior.items(), key=operator.itemgetter(1))[0]
-            mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag}_MAT"] = motif._motif[best_shift]
-            mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag}_POS_DISTR"] = motif._spacer[best_shift]
-
-            mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag}"] = 1
-            mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag}_ORDER"] = 0
-            mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag}_WIDTH"] = width
-            mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag}_MARGIN"] = 0
-            mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag}_MAX_DUR"] = dur
+        # create a label for each shift
+        for shift, prob in motif._shift_prior.items():
+            prob /= 100.0
+            output_tag_ws = f"{output_tag}_{int(shift)}"
+            try:
+                mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_MAT"] = motif._motif[shift]
+                mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_POS_DISTR"] = motif._spacer[
+                    shift]
+            except KeyError:
+                pass
+
+            mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}"] = 1
+            mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_ORDER"] = 0
+            mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_WIDTH"] = width
+            mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_MARGIN"] = 0
+            mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_MAX_DUR"] = dur
+            mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_SHIFT"] = prob
 
 
 def build_mgm_models_from_gms2_models(env, df, mgm, **kwargs):
@@ -560,137 +550,72 @@ def build_mgm_models_from_gms2_models(env, df, mgm, **kwargs):
 
         visualize_start_codons(env, viz_collector)
 
-    if "Stop Codons" in components and False:
-        viz_collector = dict()
-
-        if genome_type == "Archaea":
-            output_group = ["A", "D"]
-            learn_from = learn_from_arc
-            viz_collector[genome_type] = dict()
-
-            for o, l in zip(output_group, learn_from):
-                viz_collector[genome_type][o] = dict()
-
-                df_curr = df[(df["Type"] == genome_type) & (df["GENOME_TYPE"].isin(l))]
-                add_stop_codon_probabilities(env, df_curr, mgm, genome_type=genome_type, plot=plot, gms2_group=o,
-                                             viz_collector=viz_collector[genome_type][o])
-        if genome_type == "Bacteria":
-            output_group = ["A", "B", "C", "X"]
-            learn_from = [{"A"}, {"B"}, {"C"}, {"A"}]
-            viz_collector[genome_type] = dict()
-
-            for o, l in zip(output_group, learn_from):
-                viz_collector[genome_type][o] = dict()
-
-                df_curr = df[(df["Type"] == genome_type) & (df["GENOME_TYPE"].isin(l))]
-                add_stop_codon_probabilities(env, df_curr, mgm, genome_type=genome_type, plot=plot, gms2_group=o,
-                                             viz_collector=viz_collector[genome_type][o])
-
-        visualize_stop_codons(env, viz_collector)
-
-    # if "Stop Codons" in components:
-    #     add_stop_codon_probabilities(df, mgm, genome_type=genome_type, plot=plot)
-    #     add_stop_codon_probabilities(df, mgm, genome_type="Archaea", plot=plot)
-
-
-
-    # Motifs
     if "RBS" in components:
+        df_type = df[df["Type"] == genome_type]
         if genome_type == "Archaea":
             output_group = ["A", "D"]
             learn_from = [{"A", "D"}, {"A", "D"}]
-            df_type = df[df["Type"] == genome_type]
-            for o, l in zip(output_group, learn_from):
-                add_motif_probabilities(
-                    env,
-                    df_type[(df_type["GENOME_TYPE"].isin(l))],
-                    mgm,
-                    "RBS", f"RBS_{o}", genome_type, plot=plot
-                )
         else:
             output_group = ["A", "B", "C", "X"]
             learn_from = [{"A"}, {"B"}, {"C"}, {"A"}]
-            df_type = df[df["Type"] == genome_type]
-            for o, l in zip(output_group, learn_from):
-                add_motif_probabilities(
-                    env,
-                    df_type[(df_type["GENOME_TYPE"].isin(l))],
-                    mgm,
-                    "RBS", f"RBS_{o}", genome_type, plot=plot
-                )
-
+        for o, l in zip(output_group, learn_from):
+            add_motif_probabilities(
+                env,
+                df_type[(df_type["GENOME_TYPE"].isin(l))],
+                mgm,
+                "RBS", f"RBS_{o}", genome_type, plot=plot
+            )
     if "Promoter" in components:
+        df_type = df[df["Type"] == genome_type]
         if genome_type == "Archaea":
            output_group = ["D"]
            learn_from = [{"D"}]  # always learn Promoter form group D
-            df_type = df[df["Type"] == genome_type]
-            for o, l in zip(output_group, learn_from):
-                add_motif_probabilities(
-                    env,
-                    df_type[(df_type["GENOME_TYPE"].isin(l))],
-                    mgm,
-                    "PROMOTER", f"PROMOTER_{o}", genome_type, plot=plot
-                )
         else:
             output_group = ["C"]
             learn_from = [{"C"}]  # always learn Promoter form group C
-            df_type = df[df["Type"] == genome_type]
-            for o, l in zip(output_group, learn_from):
-                add_motif_probabilities(
-                    env,
-                    df_type[(df_type["GENOME_TYPE"].isin(l))],
-                    mgm,
-                    "PROMOTER", f"PROMOTER_{o}", genome_type, plot=plot
-                )
-
+        for o, l in zip(output_group, learn_from):
+            add_motif_probabilities(
+                env,
+                df_type[(df_type["GENOME_TYPE"].isin(l))],
+                mgm,
+                "PROMOTER", f"PROMOTER_{o}", genome_type, plot=plot
+            )
     # Start Context
     if "Start Context" in components:
         if genome_type == "Archaea":
             output_group = ["A", "D"]
             learn_from = learn_from_arc
-            for o, l in zip(output_group, learn_from):
-                df_curr = df[(df["Type"] == genome_type) & (df["GENOME_TYPE"].isin(l))]
-                add_start_context_probabilities(df_curr, mgm, "SC_RBS", f"SC_RBS_{o}", genome_type=genome_type,
-                                                plot=plot)
         else:
             output_group = ["A", "B", "C", "X"]
             learn_from = [{"A"}, {"B"}, {"C"}, {"A"}]
-            for o, l in zip(output_group, learn_from):
-                df_curr = df[(df["Type"] == genome_type) & (df["GENOME_TYPE"].isin(l))]
-                add_start_context_probabilities(df_curr, mgm, "SC_RBS", f"SC_RBS_{o}", genome_type=genome_type,
-                                                plot=plot)
-
+        for o, l in zip(output_group, learn_from):
+            df_curr = df[(df["Type"] == genome_type) & (df["GENOME_TYPE"].isin(l))]
+            add_start_context_probabilities(df_curr, mgm, "SC_RBS", f"SC_RBS_{o}", genome_type=genome_type,
+                                            plot=plot)
         # promoter
         if genome_type == "Archaea":
             output_group = ["D"]
             learn_from = [{"A", "D"}]  # always learn RBS form group A
-            for o, l in zip(output_group, learn_from):
-                df_curr = df[(df["Type"] == genome_type) & (df["GENOME_TYPE"].isin(l))]
-
-                # NOTE: SC_PROMOTER is intentionally learned from SC_RBS. This is not a bug
-                # GMS2 has equal values for SC_RBS and SC_PROMOTER. Training from SC_RBS therefore allows us
-                # to learn from group A genomes as well.
-                add_start_context_probabilities(df_curr, mgm, "SC_RBS", f"SC_PROMOTER_{o}", genome_type=genome_type,
-                                                plot=plot)
         else:
             output_group = ["C"]
             learn_from = [{"C"}]
-            for o, l in zip(output_group, learn_from):
-                df_curr = df[(df["Type"] == genome_type) & (df["GENOME_TYPE"].isin(l))]
-                # NOTE: SC_PROMOTER is intentionally learned from SC_RBS. This is not a bug
-                # GMS2 has equal values for SC_RBS and SC_PROMOTER. Training from SC_RBS therefore allows us
-                # to learn from group A genomes as well.
-                add_start_context_probabilities(df_curr, mgm, "SC_RBS", f"SC_PROMOTER_{o}", genome_type=genome_type,
-                                                plot=plot)
+        for o, l in zip(output_group, learn_from):
+            df_curr = df[(df["Type"] == genome_type) & (df["GENOME_TYPE"].isin(l))]
+
+            # NOTE: SC_PROMOTER is intentionally learned from SC_RBS. This is not a bug
+            # GMS2 has equal values for SC_RBS and SC_PROMOTER. Training from SC_RBS therefore allows us
+            # to learn from group A genomes as well.
+            add_start_context_probabilities(df_curr, mgm, "SC_RBS", f"SC_PROMOTER_{o}", genome_type=genome_type,
+                                            plot=plot)
 
 
 def main(env, args):
diff --git a/code/python/driver/collect_gms2_models.py b/code/python/driver/collect_gms2_models.py
index 9c34334..807fe0f 100644
--- a/code/python/driver/collect_gms2_models.py
+++ b/code/python/driver/collect_gms2_models.py
@@ -104,7 +104,7 @@ def collect_start_info_from_gil(env, gil, **kwargs):
     pf_output = get_value(kwargs, "pf_output", None, valid_type=str)
     dn_gms2 = get_value(kwargs, "dn_gms2", "gms2", valid_type=str)
 
-    list_entries = list()
+    list_entries = []
     for gi in tqdm(gil, total=len(gil)):
         entry = collect_start_info_from_gi(env, gi, dn_gms2=dn_gms2)
         list_entries.append(entry)
@@ -118,7 +118,7 @@ def collect_start_info_from_gil_and_print_to_file(env, gil, pf_output):
     # type: (Environment, GenomeInfoList, str) -> str
 
-    list_entries = list()
+    list_entries = []
     for gi in tqdm(gil, total=len(gil)):
         entry = collect_start_info_from_gi(env, gi)
         list_entries.append(entry)
diff --git a/code/python/driver/compare_clustering_algorithms.py b/code/python/driver/compare_clustering_algorithms.py
index 15de061..432aaf6 100644
--- a/code/python/driver/compare_clustering_algorithms.py
+++ b/code/python/driver/compare_clustering_algorithms.py
@@ -120,7 +120,9 @@ def load_gms2_models_from_pickle(pf_mods):
     df["Type"] = "Bacteria"
     df.reset_index(inplace=True)
 
-    df[f"CONSENSUS_RBS_MAT"] = df.apply(lambda r: get_consensus_sequence(r["Mod"].items["RBS_MAT"]), axis=1)
+    df["CONSENSUS_RBS_MAT"] = df.apply(
+        lambda r: get_consensus_sequence(r["Mod"].items["RBS_MAT"]), axis=1
+    )
     df["GENOME_TYPE"] = df.apply(lambda r: r["Mod"].items["GENOME_TYPE"], axis=1)
     # df["GC"] = df.apply(lambda r: r["Mod"].items["GC"], axis=1)
@@ -150,13 +152,13 @@ def merge_spacers_by_peak(df):
         for l in list_pos_dist:
             peak = max(l, key=lambda key: l[key])  # get position of peak
             if peak not in peak_to_list_pos_dist:
-                peak_to_list_pos_dist[peak] = list()
+                peak_to_list_pos_dist[peak] = []
             peak_to_list_pos_dist[peak].append(l)
 
         # average positions (per peak)
         values = dict()
         peak_counter = 0
-        for peak in peak_to_list_pos_dist.keys():
+        for peak in peak_to_list_pos_dist:
             values[peak] = dict()
             peak_counter = peak
             peak_prior_per_shift[s][peak_counter] = len(peak_to_list_pos_dist[peak])
@@ -165,11 +167,11 @@
                 try:
                     for i in l.keys():
                         if i not in values[peak].keys():
-                            values[peak][i] = list()
+                            values[peak][i] = []
                         values[peak][i].append(l[i])
 
                         if i not in total_merged:
-                            total_merged[i] = list()
+                            total_merged[i] = []
                         total_merged[i].append(l[i])
@@ -186,9 +188,7 @@
             x = sorted(values[peak].keys())
             y = [values[peak][a] for a in x]
 
-            position_distributions_by_shift_by_peak[s][peak_counter] = {
-                a: b for a, b in zip(x, y)
-            }
+            position_distributions_by_shift_by_peak[s][peak_counter] = dict(zip(x, y))
 
             peak_counter += 1
@@ -198,11 +198,11 @@
         for pc in peak_prior_per_shift[s].keys():
             peak_prior_per_shift[s][pc] /= float(total)
 
-    for i in total_merged.keys():
+    for i in total_merged:
         total_merged[i] = np.mean(total_merged[i])
 
     total = sum(total_merged.values())
-    for i in total_merged.keys():
+    for i in total_merged:
         total_merged[i] /= total
 
     return position_distributions_by_shift_by_peak[0], peak_prior_per_shift[0], total_merged
diff --git a/code/python/driver/compare_metagenome_and_genome_tools.py b/code/python/driver/compare_metagenome_and_genome_tools.py
index 6b176fe..5c980df 100644
--- a/code/python/driver/compare_metagenome_and_genome_tools.py
+++ b/code/python/driver/compare_metagenome_and_genome_tools.py
@@ -101,10 +101,7 @@ def compare_for_gi(env, gi, **kwargs):
 
 def compare_for_gil(env, gil, **kwargs):
-    list_df = list()
-    for gi in gil:
-        list_df.append(compare_for_gi(env, gi, **kwargs))
-
+    list_df = [compare_for_gi(env, gi, **kwargs) for gi in gil]
     return pd.concat(list_df, ignore_index=True, sort=False)
@@ -154,10 +151,10 @@ def main(env, args):
 
     if not prl_options["use-pbs"]:
         df_list = run_n_per_thread(
-            [g for g in gil],
+            list(gil),
             compare_for_gi,
             data_arg_name="gi",
-            func_kwargs={"env": env}
+            func_kwargs={"env": env},
         )
     else:
         pbs = PBS(env, prl_options,
diff --git a/code/python/driver/compare_msa_to_16s.py b/code/python/driver/compare_msa_to_16s.py
index b671f7a..6871658 100644
--- a/code/python/driver/compare_msa_to_16s.py
+++ b/code/python/driver/compare_msa_to_16s.py
@@ -72,7 +72,11 @@ def best_match_to_16s(seq_16s, consensus):
     bm_score = 0
 
     for curr_pos in range(len(seq_16s)-len(consensus)):
-        curr_score = sum([1 for i in range(len(consensus)) if consensus[i] == seq_16s[curr_pos+i]])
+        curr_score = sum(
+            1
+            for i in range(len(consensus))
+            if consensus[i] == seq_16s[curr_pos + i]
+        )
 
         if curr_score > bm_score:
             bm_score = curr_score
@@ -83,13 +87,13 @@ def best_shifts_to_16s(seq_16s, df, col):
     # type: (str, pd.DataFrame, str) -> None
 
-    list_pos = list()
+    list_pos = []
     for idx in df.index:
         consensus = df.at[idx, "CONSENSUS_RBS_MAT"]
         list_pos.append(best_match_to_16s(seq_16s, consensus))
 
     df[col] = list_pos
-    df[col] = df[col] - min(df[col])
+    df[col] -= min(df[col])
 
 
 def compare_msa_to_16s(env, df, seq_16s, **kwargs):
@@ -103,7 +107,7 @@
         df[f"CONSENSUS_{name}"] = df.apply(lambda r: get_consensus_sequence(r["Mod"].items[name]), axis=1)
 
     # df = df[(df["GC"] > gc_range[0]) & (df["GC"] < gc_range[1])]
-    print([x for x in df["CONSENSUS_RBS_MAT"].value_counts().items()])
+    print(list(df["CONSENSUS_RBS_MAT"].value_counts().items()))
     df = df[df["CONSENSUS_RBS_MAT"] == "AAAAAA"]
     # if group:
     #     df = df[df["GENOME_TYPE"] == group]
diff --git a/code/python/driver/compare_multiple_mgm2_models.py b/code/python/driver/compare_multiple_mgm2_models.py
index 0d4ae40..e5d06ff 100644
--- a/code/python/driver/compare_multiple_mgm2_models.py
+++ b/code/python/driver/compare_multiple_mgm2_models.py
@@ -55,7 +55,7 @@ def test_component_for_gi(env, gi, list_pf_mgm):
     # type: (Environment, GenomeInfo, List[str]) -> pd.DataFrame
 
-    list_entries = list()
+    list_entries = []
     pd_gi = os_join(env["pd-work"], gi.name)
     mkdir_p(pd_gi)
@@ -74,7 +74,7 @@ def test_mgm2_models_on_verified(env, gil, list_pf_mgm2):
     # type: (Environment, GenomeInfoList, List[str]) -> None
 
-    list_df = list()
+    list_df = []
 
     for gi in gil:
         logger.info(f"Genome: {gi.name}")
@@ -111,11 +111,11 @@ def main(env, args):
         pf_mgm2 = args.pf_mgm2
     else:
         pd_mgm2 = os.path.abspath(args.pd_mgm2)
-        pf_mgm2 = list()
-        for file in os.listdir(pd_mgm2):
-            if file.endswith(".mod"):
-                pf_mgm2.append(os.path.join(pd_mgm2, file))
-
+        pf_mgm2 = [
+            os.path.join(pd_mgm2, file)
+            for file in os.listdir(pd_mgm2)
+            if file.endswith(".mod")
+        ]
     test_mgm2_models_on_verified(env, gil, pf_mgm2)
diff --git a/code/python/driver/compute_gcode_accuracy.py b/code/python/driver/compute_gcode_accuracy.py
index 8759c70..9bba244 100644
--- a/code/python/driver/compute_gcode_accuracy.py
+++ b/code/python/driver/compute_gcode_accuracy.py
@@ -81,9 +81,7 @@ def get_gcode_per_contig_for_mgm2(pf_prediction):
         for line in f:
             line = line.strip()
             if "seqid" in line and "genetic code" in line:
-                m = pattern.match(line)
-
-                if m:
+                if m := pattern.match(line):
                     gcode = str(m.group(2))
                     gcode_per_contig[str(contig)] = gcode
                     contig += 1
@@ -101,9 +99,7 @@ def get_gcode_per_contig_for_mprodigal(pf_prediction):
         for line in f:
             line = line.strip()
             if "Model Data" in line:
-                m = pattern.match(line)
-
-                if m:
+                if m := pattern.match(line):
                     gcode = str(m.group(1))
                     gcode_per_contig[str(contig)] = gcode
                     contig += 1
@@ -116,13 +112,15 @@ def get_accuracy_gcode_predicted(tool, pf_prediction, gcode_true):
 
     if tool == "mgm2":
         gcode_per_contig = get_gcode_per_contig_for_mgm2(pf_prediction)
-    elif tool == "mprodigal" or tool == "prodigal":
+    elif tool in ["mprodigal", "prodigal"]:
         gcode_per_contig = get_gcode_per_contig_for_mprodigal(pf_prediction)
     else:
         raise ValueError("Unknown tool")
 
-    num_matches = sum([1 for v in gcode_per_contig.values() if str(v) == gcode_true])
-    num_mismatches = sum([1 for v in gcode_per_contig.values() if str(v) != gcode_true])
+    num_matches = sum(1 for v in gcode_per_contig.values() if str(v) == gcode_true)
+    num_mismatches = sum(
+        1 for v in gcode_per_contig.values() if str(v) != gcode_true
+    )
 
     total = len(gcode_per_contig)
@@ -146,7 +144,7 @@ def compute_gcode_accuracy_for_tool_on_sequence(env, tool, pf_sequences, pf_pred
 
     run_tool(env, pf_sequences, pf_prediction, tool + "_autogcode", **kwargs)
 
-    list_entries = list()
+    list_entries = []
 
     if tool == "mgm2" and pf_summary:
         data = pd.read_csv(pf_summary)
@@ -195,7 +193,7 @@ def compute_gcode_accuracy_for_tools_on_chunk_deprecated(env, gi, tools, chunk,
     pf_chunks = mkstemp_closed(dir=env["pd-work"], suffix=".fasta")
     gs.write_to_file(pf_chunks)
 
-    list_entries = list()
+    list_entries = []
 
     ran_prod = False
@@ -249,7 +247,7 @@ def compute_gcode_accuracy_for_tools_on_chunk(env, gi, tools, chunk, **kwargs):
     pf_chunks = mkstemp_closed(dir=env["pd-work"], suffix=".fasta")
     gs.write_to_file(pf_chunks)
 
-    list_df = list()
+    list_df = []
 
     for t, dn in zip(tools, dn_tools):
         pd_run = os_join(env["pd-work"], gi.name, f"{dn_prefix}{dn}_{chunk}")
@@ -307,7 +305,7 @@ def compute_gcode_accuracy_for_gi(env, gi, tools, chunks, **kwargs):
     # type: (Environment, GenomeInfo, List[str], List[int], Dict[str, Any]) -> pd.DataFrame
-    list_df = list()
+    list_df = []
 
     num_processors = get_value(kwargs, "num_processors", 1, valid_type=int)
 
     if num_processors > 1:
@@ -320,7 +318,7 @@
 
     else:
-        list_df = list()
+        list_df = []
         for chunk in chunks:
             logger.debug(f"{gi.name};{chunk}")
             curr = compute_gcode_accuracy_for_tools_on_chunk(env, gi, tools, chunk, **kwargs)
@@ -330,14 +328,10 @@
 
 def compute_gcode_accuracy(env, gil, tools, chunks, **kwargs):
-    # type: (Environment, GenomeInfoList, List[str], List[int], Dict[str, Any]) -> pd.DataFrame
-    list_df = list()
-
-    for gi in gil:
-        list_df.append(
-            compute_gcode_accuracy_for_gi(env, gi, tools, chunks, **kwargs)
-        )
-
+    list_df = [
+        compute_gcode_accuracy_for_gi(env, gi, tools, chunks, **kwargs)
+        for gi in gil
+    ]
     return pd.concat(list_df, ignore_index=True, sort=False)
diff --git a/code/python/driver/independent_predictions.py b/code/python/driver/independent_predictions.py
index 150b5c3..4941a65 100644
--- a/code/python/driver/independent_predictions.py
+++ b/code/python/driver/independent_predictions.py
@@ -40,9 +40,7 @@ def ratio_false_true(p_true):
     # type: (float) -> float
-    if p_true == 0:
-        return float('inf')
-    return 1.0 / p_true - 1
+    return float('inf') if p_true == 0 else 1.0 / p_true - 1
 
 
 def main(env, args):
diff --git a/code/python/driver/run-pbs-job.py b/code/python/driver/run-pbs-job.py
index 3586302..a390d59 100644
--- a/code/python/driver/run-pbs-job.py
+++ b/code/python/driver/run-pbs-job.py
@@ -65,7 +65,7 @@ def main(env, args):
             random.seed(100)
         else:
             random.seed(int(rs))
-            logger.critical("Random-seed: {}".format(rs))
+            logger.critical(f"Random-seed: {rs}")
     else:
         random.seed(100)
@@ -73,12 +73,12 @@
     if "env" in func_args:
         if args.pd_work is not None:
             func_args["env"] = func_args["env"].duplicate({"pd-work": args.pd_work})
-            logger.critical("{}".format(func_args["env"]["pd-work"]))
+            logger.critical(f'{func_args["env"]["pd-work"]}')
 
         # Update pd-work to create a tmp directory
         mkdir_p(func_args["env"]["pd-work"])
         func_args["env"]["pd-work"] = run_shell_cmd(
-            "mktemp --tmpdir={} -d".format(func_args["env"]["pd-work"])
+            f'mktemp --tmpdir={func_args["env"]["pd-work"]} -d'
         ).strip()
 
     # logger.critical("{}\n{}".format(func, func_args))
diff --git a/code/python/driver/run_mgm_on_genome_list.py b/code/python/driver/run_mgm_on_genome_list.py
index d23c799..c0e3d11 100644
--- a/code/python/driver/run_mgm_on_genome_list.py
+++ b/code/python/driver/run_mgm_on_genome_list.py
@@ -56,15 +56,15 @@ def parse_tags_from_list(list_tag_value_pairs):
     # type: (List[str]) -> List[Tuple]
 
     if list_tag_value_pairs is None:
-        return list()
+        return []
 
     if len(list_tag_value_pairs) % 2 != 0:
         raise ValueError("Tag/value pairs list must have a length multiple of 2.")
 
-    list_parsed = list()
-    for i in range(0, len(list_tag_value_pairs), 2):
-        list_parsed.append((list_tag_value_pairs[i], list_tag_value_pairs[i + 1]))
-    return list_parsed
+    return [
+        (list_tag_value_pairs[i], list_tag_value_pairs[i + 1])
+        for i in range(0, len(list_tag_value_pairs), 2)
+    ]
 
 
 def parse_and_set_tags(list_tag_value_pairs, mgm):
@@ -113,24 +113,21 @@ def run_mgm_on_genome_list(env, gil, pf_mgm_mod, **kwargs):
     # No parallelization
     if prl_options is None:
         helper_run_mgm_on_genome_list(env, gil, pf_mgm_mod, **kwargs)
+    elif prl_options["use-pbs"]:
+        # setup PBS jobs
+        pbs = PBS(env, prl_options, splitter=split_gil, merger=merge_identity)
+        pbs.run(
+            data={"gil": gil},
+            func=helper_run_mgm_on_genome_list,
+            func_kwargs={"env": env, "pf_mgm_mod": pf_mgm_mod, **kwargs}
+        )
     else:
-        # PBS Parallelization
-        if prl_options["use-pbs"]:
-            # setup PBS jobs
-            pbs = PBS(env, prl_options, splitter=split_gil, merger=merge_identity)
-            pbs.run(
-                data={"gil": gil},
-                func=helper_run_mgm_on_genome_list,
-                func_kwargs={"env": env, "pf_mgm_mod": pf_mgm_mod, **kwargs}
-            )
-        # Multithreading parallelization
-        else:
-            # parallel using threads
-            run_n_per_thread(
-                list(gil), run_mgm_on_gi, "gi",
-                {"env": env, "pf_mgm_mod": pf_mgm_mod, **kwargs},
-                simultaneous_runs=prl_options.safe_get("num-processors")
-            )
+        # parallel using threads
+        run_n_per_thread(
+            list(gil), run_mgm_on_gi, "gi",
+            {"env": env, "pf_mgm_mod": pf_mgm_mod, **kwargs},
+            simultaneous_runs=prl_options.safe_get("num-processors")
+        )
 
 
 def main(env, args):
diff --git a/code/python/driver/run_tool_on_chunks.py b/code/python/driver/run_tool_on_chunks.py
index 6fb0053..7960617 100644
--- a/code/python/driver/run_tool_on_chunks.py
+++ b/code/python/driver/run_tool_on_chunks.py
@@ -66,7 +66,7 @@ def split_fasta_into_chunks(env, pf_sequence, chunk_size_nt):
     # type: (Environment, str, int) -> List[[str, str, int]]
 
-    list_chunk_info = list()
+    list_chunk_info = []
     sequences = SeqIO.to_dict(SeqIO.parse(pf_sequence, "fasta"))
 
     counter = 0
@@ -99,7 +99,7 @@ def merge_predictions_with_offsets(list_prediction_info, pf_output):
     # type: (List[str, str, int], str) -> None
 
-    list_label = list()  # type: List[Label]
+    list_label = []
     for pi in list_prediction_info:
         pf_pred, seqname, offset = pi
@@ -119,19 +119,14 @@ def helper_run_chunks(env, t, list_chunk_info, gi, pd_work_tool, **kwargs):
     # type: (Environment, str, List[[str, str, int]], GenomeInfo, str, Dict[str, Any]) -> List[[str, str, int]]
 
-    list_prediction_info = list()  # type: List[[str, str, int]]
+    list_prediction_info = []
     pf_mgm2_mod = get_value(kwargs, "pf_mgm2_mod", type=str, required=t == "MGM2")
     pf_mgm_mod = get_value(kwargs, "pf_mgm_mod", type=str, required=t == "MGM")
     dn_prefix = get_value(kwargs, "dn_prefix", None, type=str)
-    if dn_prefix is not None:
-        dn_prefix = dn_prefix + "_"
-    else:
-        dn_prefix = ""
-
-
-    list_pd_chunks = list()
+    dn_prefix = dn_prefix + "_" if dn_prefix is not None else ""
+    list_pd_chunks = []
 
     # run tool on separate chunks
@@ -168,11 +163,7 @@ def run_tools_on_chunk_size_for_gi(env, gi, tools, chunk_size_nt, **kwargs):
     prl_options = get_value(kwargs, "prl_options", None)  # type: ParallelizationOptions
     clean = get_value(kwargs, "clean", default=False)
 
-    if dn_suffix is not None:
-        dn_suffix = "_" + str(dn_suffix)
-    else:
-        dn_suffix = ""
-
+    dn_suffix = f"_{str(dn_suffix)}" if dn_suffix is not None else ""
     if prl_options is not None:
         prl_options = copy.deepcopy(prl_options)
         prl_options.update_env(env)
@@ -183,7 +174,7 @@
 
     list_chunk_info = split_fasta_into_chunks(env.duplicate({"pd-work": pd_chunk_seqs}), pf_sequence, chunk_size_nt)
 
-    list_summary = list()
+    list_summary = []
 
     for t in tools:
         pd_work_tool = os_join(env["pd-runs"], gi.name, f"chunking{dn_suffix}", f"{t}_{chunk_size_nt}")
@@ -193,41 +184,39 @@
             list_prediction_info, list_pd_chunks = helper_run_chunks(
                 env, t, list_chunk_info, gi, pd_work_tool, **kwargs
             )
+        elif prl_options["use-pbs"]:
+            pbs = PBS(env, prl_options,
+                      splitter=split_list,
+                      merger=merge_identity
+                      )
+
+            output = pbs.run(
+                data=list_chunk_info,
+                func=helper_run_chunks,
+                func_kwargs={
+                    "env": env, "t": t, "gi": gi, "pd_work_tool": pd_work_tool, **kwargs
+                },
+                split_kwargs={
+                    "arg_name_data": "list_chunk_info",
+                    "arg_name_jobid": "dn_prefix"
+                }
+            )
+            list_prediction_info = []
+            list_pd_chunks = []
+            for o in output:
+                list_prediction_info += o[0]
+                list_pd_chunks += o[1]
+
+            print(gi.name, list_prediction_info)
         else:
-            # def helper_run_chunks(env, t, list_chunk_info, gi, pd_work_tool, pf_mgm2_mod, pf_mgm_mod, clean):
-            if prl_options["use-pbs"]:
-                pbs = PBS(env, prl_options,
-                          splitter=split_list,
-                          merger=merge_identity
-                          )
-
-                output = pbs.run(
-                    data=list_chunk_info,
-                    func=helper_run_chunks,
-                    func_kwargs={
-                        "env": env, "t": t, "gi": gi, "pd_work_tool": pd_work_tool, **kwargs
-                    },
-                    split_kwargs={
-                        "arg_name_data": "list_chunk_info",
-                        "arg_name_jobid": "dn_prefix"
-                    }
-                )
-                list_prediction_info = list()
-                list_pd_chunks = list()
-                for o in output:
-                    list_prediction_info += o[0]
-                    list_pd_chunks += o[1]
-
-                print(gi.name, list_prediction_info)
-            else:
-                # threading
-                list_prediction_info, list_pd_chunks = run_slice_per_thread(
-                    list_chunk_info, helper_run_chunks, "list_chunk_info",
-                    {
-                        "env": env, "t": t, "gi": gi, "pd_work_tool": pd_work_tool, **kwargs
-                    },
-                    arg_name_threadid="dn_prefix"
-                )
+            # threading
+            list_prediction_info, list_pd_chunks = run_slice_per_thread(
+                list_chunk_info, helper_run_chunks, "list_chunk_info",
+                {
+                    "env": env, "t": t, "gi": gi, "pd_work_tool": pd_work_tool, **kwargs
+                },
+                arg_name_threadid="dn_prefix"
+            )
 
         # list_prediction_info = list()  # type: List[[str, str, int]]
         #
@@ -276,7 +265,7 @@ def run_tools_on_chunk_size_for_gi(env, gi, tools, chunk_size_nt, **kwargs):
 
 def helper_run_tools_on_chunks_for_gi(env, gi, tools, chunk_sizes_nt, **kwargs):
     # type: (Environment, GenomeInfo, List[str], List[int], Dict[str, Any]) -> pd.DataFrame
-    list_df = list()
+    list_df = []
     for cst in chunk_sizes_nt:
         df = run_tools_on_chunk_size_for_gi(env, gi, tools, cst, **kwargs)
         list_df.append(df)
@@ -294,38 +283,9 @@ def run_tools_on_chunks_for_gi(env, gi, tools, chunk_sizes_nt, **kwargs):
         env_curr = env.duplicate({"pd-work": pd_work})
     else:
         env_curr = env
-    # prl_options = get_value(kwargs, "prl_options", None)
-
-    # if prl_options is None:
-    df = helper_run_tools_on_chunks_for_gi(env_curr, gi, tools, chunk_sizes_nt, **kwargs)
-    # else:
-    #     if prl_options["use-pbs"]:
-    #         pbs = PBS(env, prl_options,
-    #                   splitter=split_list,
-    #                   merger=merge_dataframes
-    #                   )
-    #
-    #         df = pbs.run(
-    #             data=chunk_sizes_nt,
-    #             func=helper_run_tools_on_chunks_for_gi,
-    #             func_kwargs={
-    #                 "env": env, "gi": gi, "tools": tools, **kwargs
-    #             },
-    #             split_kwargs={
-    #                 "arg_name_data": "chunk_sizes_nt"
-    #             }
-    #         )
-    #     else:
-    #         list_df = run_n_per_thread(
-    #             chunk_sizes_nt, run_tools_on_chunk_size_for_gi, "chunk_size_nt",
-    #             {
-    #                 "env": env, "gi": gi, "tools": tools, **kwargs
-    #             }
-    #         )
-    #
-    #         df = pd.concat(list_df, ignore_index=True, sort=False)
-
-    return df
+    return helper_run_tools_on_chunks_for_gi(
+        env_curr, gi, tools, chunk_sizes_nt, **kwargs
+    )
 
 
 def helper_run_tools_on_chunks(env, gil, tools, chunk_sizes_nt, **kwargs):
@@ -335,7 +295,7 @@
 
     simultaneous_genomes = get_value(kwargs, "simultaneous_genomes", 8)
 
     if prl_options is None or not prl_options["use-pbs"]:
-        list_df = list()
+        list_df = []
         for gi in gil:
            df = run_tools_on_chunks_for_gi(env, gi, tools, chunk_sizes_nt, **kwargs)
            list_df.append(df)
diff --git a/code/python/driver/run_tool_on_genome_list.py b/code/python/driver/run_tool_on_genome_list.py
index 6b597fa..985deba 100644
--- a/code/python/driver/run_tool_on_genome_list.py
+++ b/code/python/driver/run_tool_on_genome_list.py
@@ -138,24 +138,21 @@ def run_tool_on_genome_list(env, gil, tool, **kwargs):
     # No parallelization
     if prl_options is None:
         helper_run_tool_on_genome_list(env, gil, tool, **kwargs)
+    elif prl_options["use-pbs"]:
+        # setup PBS jobs
+        pbs = PBS(env, prl_options, splitter=split_gil, merger=merge_identity)
+        pbs.run(
+            data=gil,
+            func=helper_run_tool_on_genome_list,
+            func_kwargs={"env": env, "tool": tool, **kwargs}
+        )
     else:
-        # PBS Parallelization
-        if prl_options["use-pbs"]:
-            # setup PBS jobs
-            pbs = PBS(env, prl_options, splitter=split_gil, merger=merge_identity)
-            pbs.run(
-                data=gil,
-                func=helper_run_tool_on_genome_list,
-                func_kwargs={"env": env, "tool": tool, **kwargs}
-            )
-        # Multithreading parallelization
-        else:
-            # parallel using threads
-            run_n_per_thread(
-                list(gil), run_tool_on_gi, "gi",
-                {"env": env, "tool": tool, **kwargs},
-                simultaneous_runs=prl_options.safe_get("num-processors")
-            )
+        # parallel using threads
+        run_n_per_thread(
+            list(gil), run_tool_on_gi, "gi",
+            {"env": env, "tool": tool, **kwargs},
+            simultaneous_runs=prl_options.safe_get("num-processors")
+        )
 
 
 def main(env, args):
diff --git a/code/python/driver/run_tools_on_chunks.py b/code/python/driver/run_tools_on_chunks.py
index 40dd6da..8bf1e02 100644
--- a/code/python/driver/run_tools_on_chunks.py
+++ b/code/python/driver/run_tools_on_chunks.py
@@ -128,7 +128,7 @@ def run_tools_on_chunk(env, gi, tools, chunk, **kwargs):
     pf_chunks = mkstemp_closed(dir=env["pd-work"], suffix=".fasta")
     gs.write_to_file(pf_chunks)
 
-    list_entries = list()
+    list_entries = []
 
     for t, dn in zip(tools, dn_tools):
         logger.debug(f"{gi.name};{chunk};{t}")
@@ -214,7 +214,7 @@ def run_tools_on_gi(env, gi, tools, chunks, **kwargs):
 
     else:
-        list_df = list()
+        list_df = []
         for chunk in chunks:
             logger.debug(f"{gi.name};{chunk}")
             curr = run_tools_on_chunk(env, gi, tools, chunk, **kwargs)
@@ -224,11 +224,7 @@
 
 def run_tools_on_gil(env, gil, tools, chunks, **kwargs):
-    # type: (Environment, GenomeInfoList, List[str], List[int], Dict[str, Any]) -> None
-    list_df = list()
-    for gi in gil:
-        list_df.append(run_tools_on_gi(env, gi, tools, chunks, **kwargs))
-
+    list_df = [run_tools_on_gi(env, gi, tools, chunks, **kwargs) for gi in gil]
     return pd.concat(list_df, sort=False, ignore_index=True)
 
 
 def main(env, args):
diff --git a/code/python/driver/stats_per_gene.py b/code/python/driver/stats_per_gene.py
index d577424..6b80bf7 100644
--- a/code/python/driver/stats_per_gene.py
+++ b/code/python/driver/stats_per_gene.py
@@ -67,12 +67,13 @@ def stats_per_gene_for_gi(env, gi, tools, **kwargs):
         if os.path.isfile(os_join(env["pd-runs"], gi.name, tools[t], "prediction.gff"))
     }
 
-    if len(pf_predictions) == 0:
+    if not pf_predictions:
         return pd.DataFrame()
 
     name_to_labels = {
-        t: read_labels_from_file(pf_predictions[t], shift=-1, name=t) for t in pf_predictions.keys()
-    }  # type: Dict[str, Labels]
+        t: read_labels_from_file(pf_predictions[t], shift=-1, name=t)
+        for t in pf_predictions
+    }
 
     keys_3prime = get_unique_gene_keys(*name_to_labels.values())
@@ -86,13 +87,13 @@
 
     # Each gene key will have a row in the dataframe
     # Columns will indicate whether it was 3p and 5p were predicted by each tool
-    list_entries = list()
+    list_entries = []
     for key in keys_3prime:
         entry = dict()
 
         shortest_label = None
         tool_of = None
-        for t in pf_predictions.keys():
+        for t in pf_predictions:
             label = name_to_labels[t].get_by_3prime_key(key)
 
             if label is None:
@@ -124,11 +125,7 @@
             **entry
         })
 
-    df = pd.DataFrame(list_entries)
-    # for t in tools.keys():
-    #     df[f"5p-{t}"] = df[f"5p-{t}"].astype(int)
-
-    return df
+    return pd.DataFrame(list_entries)
 
 
 def helper_stats_per_gene(env, gil, tools, **kwargs):
@@ -143,15 +140,13 @@
     :return: Either nothing or the dataframe, depending on the value of suppress_return.
""" - list_df = list() - for gi in gil: - list_df.append(stats_per_gene_for_gi(env, gi, tools, **kwargs)) - - if len(list_df) == 0: + if list_df := [ + stats_per_gene_for_gi(env, gi, tools, **kwargs) for gi in gil + ]: + return pd.concat(list_df, ignore_index=True, sort=False) + else: return pd.DataFrame() - return pd.concat(list_df, ignore_index=True, sort=False) - def stats_per_gene(env, gil, tools, pf_output, **kwargs): # type: (Environment, GenomeInfoList, Dict[str, str], str, Dict[str, Any]) -> None @@ -171,9 +166,6 @@ def stats_per_gene(env, gil, tools, pf_output, **kwargs): "env": env, "tools": tools, **kwargs } ) - df = pd.concat(list_df, ignore_index=True, sort=False) - - # threading else: list_df = run_n_per_thread( list(gil), stats_per_gene_for_gi, @@ -184,7 +176,7 @@ def stats_per_gene(env, gil, tools, pf_output, **kwargs): simultaneous_runs=1 #prl_options.safe_get("num-processors") ) - df = pd.concat(list_df, ignore_index=True, sort=False) + df = pd.concat(list_df, ignore_index=True, sort=False) df.to_csv(pf_output, index=False) @@ -204,7 +196,7 @@ def main(env, args): prl_options = ParallelizationOptions(env, args.pf_parallelization_options, **vars(args)) # collect tool name and directory together - tool_to_dir = {a: b for a, b in zip(tools, dn_tools)} + tool_to_dir = dict(zip(tools, dn_tools)) stats_per_gene(env, gil, tool_to_dir, args.pf_output, prl_options=prl_options) diff --git a/code/python/driver/stats_per_gene_on_chunk.py b/code/python/driver/stats_per_gene_on_chunk.py index dac8946..def2bae 100644 --- a/code/python/driver/stats_per_gene_on_chunk.py +++ b/code/python/driver/stats_per_gene_on_chunk.py @@ -102,7 +102,7 @@ def apply_genome_splitter_to_labels(seqname_to_info, labels): for s, list_info in seqname_to_info.items() } - list_labels = list() + list_labels = [] for lab in labels: overlapping = seqname_to_interval_tree[lab.seqname()].overlap(lab.left(), lab.right() + 1) @@ -122,63 +122,57 @@ def apply_genome_splitter_to_labels(seqname_to_info, labels): - if True or len(overlapping) > 2: - for chunk in overlapping: + for chunk in overlapping: # partial both - if lab.left() < chunk[0] and lab.right() > chunk[1]: - lab_partial = copy.deepcopy(lab) - lab_partial.coordinates().left = chunk[0] - lab_partial.coordinates().right = chunk[1] - - if lab.strand() == "+": - r = (lab.right() - lab_partial.left() + 1) % 3 - lab_partial.coordinates().left += r - r = (lab_partial.right() - lab_partial.left() + 1) % 3 - lab_partial.coordinates().right -= r - else: - r = (lab_partial.right() - lab.left() + 1) % 3 - lab_partial.coordinates().right -= r - r = (lab_partial.right() - lab_partial.left() + 1) % 3 - lab_partial.coordinates().left += r - + if lab.left() < chunk[0] and lab.right() > chunk[1]: + lab_partial = copy.deepcopy(lab) + lab_partial.coordinates().left = chunk[0] + lab_partial.coordinates().right = chunk[1] + + if lab.strand() == "+": + r = (lab.right() - lab_partial.left() + 1) % 3 + lab_partial.coordinates().left += r + r = (lab_partial.right() - lab_partial.left() + 1) % 3 + lab_partial.coordinates().right -= r + else: + r = (lab_partial.right() - lab.left() + 1) % 3 + lab_partial.coordinates().right -= r + r = (lab_partial.right() - lab_partial.left() + 1) % 3 + lab_partial.coordinates().left += r + + lab_partial.set_attribute_value("partial", "11") + list_labels.append(lab_partial) + + elif lab.left() < chunk[0]: + lab_partial = copy.deepcopy(lab) + lab_partial.coordinates().left = chunk[0] + if lab.strand() == "+": + r = (lab.right() - lab_partial.left() + 1) % 
3 + else: + r = (lab_partial.right() - lab_partial.left() + 1) % 3 + lab_partial.coordinates().left += r + if lab_partial.get_attribute_value("partial") in {"01", "11"}: lab_partial.set_attribute_value("partial", "11") - list_labels.append(lab_partial) - - # partial left - elif lab.left() < chunk[0]: - lab_partial = copy.deepcopy(lab) - lab_partial.coordinates().left = chunk[0] - if lab.strand() == "+": - r = (lab.right() - lab_partial.left() + 1) % 3 - lab_partial.coordinates().left += r - else: - r = (lab_partial.right() - lab_partial.left() + 1) % 3 - lab_partial.coordinates().left += r - if lab_partial.get_attribute_value("partial") in {"01", "11"}: - lab_partial.set_attribute_value("partial", "11") - else: - lab_partial.set_attribute_value("partial", "10") - - list_labels.append(lab_partial) - # partial right - elif lab.right() > chunk[1]: - lab_partial = copy.deepcopy(lab) - lab_partial.coordinates().right = chunk[1] - - if lab.strand() == "+": - r = (lab_partial.right() - lab_partial.left() + 1) % 3 - lab_partial.coordinates().right -= r - else: - r = (lab_partial.right() - lab.left() + 1) % 3 - lab_partial.coordinates().right -= r - - if lab_partial.get_attribute_value("partial") in {"10", "11"}: - lab_partial.set_attribute_value("partial", "11") - else: - lab_partial.set_attribute_value("partial", "01") + else: + lab_partial.set_attribute_value("partial", "10") + + list_labels.append(lab_partial) + elif lab.right() > chunk[1]: + lab_partial = copy.deepcopy(lab) + lab_partial.coordinates().right = chunk[1] + + if lab.strand() == "+": + r = (lab_partial.right() - lab_partial.left() + 1) % 3 + else: + r = (lab_partial.right() - lab.left() + 1) % 3 + lab_partial.coordinates().right -= r + if lab_partial.get_attribute_value("partial") in {"10", "11"}: + lab_partial.set_attribute_value("partial", "11") + else: + lab_partial.set_attribute_value("partial", "01") - list_labels.append(lab_partial) + list_labels.append(lab_partial) @@ -187,17 +181,8 @@ def apply_genome_splitter_to_labels(seqname_to_info, labels): left_chunk = seqname_to_interval_tree[lab.seqname()][lab.left()] right_chunk = seqname_to_interval_tree[lab.seqname()][lab.right()] - if len(left_chunk) == 0: - left_chunk = None - else: - left_chunk = left_chunk.pop().data - - if len(right_chunk) == 0: - right_chunk = None - else: - right_chunk = right_chunk.pop().data - - + left_chunk = None if len(left_chunk) == 0 else left_chunk.pop().data + right_chunk = None if len(right_chunk) == 0 else right_chunk.pop().data if left_chunk == right_chunk: # no splitting list_labels.append(lab) @@ -247,7 +232,10 @@ def stats_per_gene_on_chunks_for_genome(env, df_summary_genome, **kwargs): """ - list_entries = list() + list_entries = [] + # compute GC of label + gene_gc = 0 # compute_gc(sequences, shortest_label) + for genome, df_genome in df_summary_genome.groupby("Genome", as_index=False): pf_sequence = os_join(env["pd-data"], genome, "sequence.fasta") @@ -264,7 +252,7 @@ def stats_per_gene_on_chunks_for_genome(env, df_summary_genome, **kwargs): reference_tools = sorted(reference_tools) # if multiple reference, merge them first by 5 - list_reference_labels = list() + list_reference_labels = [] if reference_tools is not None: list_reference_labels = [read_labels_from_file(os_join(env["pd-runs"], genome, rt, "prediction.gff"), ignore_partial=False, shift=-1) @@ -282,7 +270,7 @@ def stats_per_gene_on_chunks_for_genome(env, df_summary_genome, **kwargs): merged_reference_labels, list_reference_labels[i] ).match_3p_5p("a") - for chunk_size, 
df_chunk in df_genome.groupby("Chunk Size", as_index=False): # type: int, pd.DataFrame + for chunk_size, df_chunk in df_genome.groupby("Chunk Size", as_index=False):# type: int, pd.DataFrame # read all label files for chunk tool_to_labels = { @@ -308,7 +296,7 @@ def stats_per_gene_on_chunks_for_genome(env, df_summary_genome, **kwargs): # get all possible chunks, and map reference labels to it chunked_reference_labels = dict() - if len(list_reference_labels) > 0: + if list_reference_labels: chunked_reference_labels = { ref: apply_genome_splitter_to_labels(seqname_to_chunks, labels) for ref, labels in zip( @@ -339,7 +327,7 @@ def stats_per_gene_on_chunks_for_genome(env, df_summary_genome, **kwargs): shortest_label = None - for t in name_to_labels.keys(): + for t in name_to_labels: label = name_to_labels[t].get_by_3prime_key(key) if label is None: @@ -358,9 +346,6 @@ def stats_per_gene_on_chunks_for_genome(env, df_summary_genome, **kwargs): - # compute GC of label - gene_gc = 0 # compute_gc(sequences, shortest_label) - list_entries.append({ "3p-key": key, "Genome": genome, @@ -441,7 +426,7 @@ def main(env, args): df.reset_index(inplace=True) bs = args.batch_size - list_df = list([x[1] for x in df.groupby("Genome", as_index=False)]) + list_df = [x[1] for x in df.groupby("Genome", as_index=False)] start = 0 end = min(bs, len(list_df)) diff --git a/code/python/driver/stats_per_gene_on_chunk_revised.py b/code/python/driver/stats_per_gene_on_chunk_revised.py index 0a42307..57faa26 100644 --- a/code/python/driver/stats_per_gene_on_chunk_revised.py +++ b/code/python/driver/stats_per_gene_on_chunk_revised.py @@ -102,7 +102,7 @@ def apply_genome_splitter_to_labels(seqname_to_info, labels): for s, list_info in seqname_to_info.items() } - list_labels = list() + list_labels = [] for lab in labels: overlapping = seqname_to_interval_tree[lab.seqname()].overlap(lab.left(), lab.right() + 1) @@ -164,17 +164,8 @@ def apply_genome_splitter_to_labels(seqname_to_info, labels): left_chunk = seqname_to_interval_tree[lab.seqname()][lab.left()] right_chunk = seqname_to_interval_tree[lab.seqname()][lab.right()] - if len(left_chunk) == 0: - left_chunk = None - else: - left_chunk = left_chunk.pop().data - - if len(right_chunk) == 0: - right_chunk = None - else: - right_chunk = right_chunk.pop().data - - + left_chunk = None if len(left_chunk) == 0 else left_chunk.pop().data + right_chunk = None if len(right_chunk) == 0 else right_chunk.pop().data if left_chunk == right_chunk: # no splitting list_labels.append(lab) @@ -247,25 +238,18 @@ def has_same_3prime_end(partial, full): if strand == "+": # partial at 3prime end if partial.incomplete_at_3prime(): - if full.right() >= partial.right()-buffer: - return True - else: - return False + return full.right() >= partial.right()-buffer elif full.incomplete_at_3prime(): - if partial.right() >= full.right()-buffer: - return True - else: - return False + return partial.right() >= full.right()-buffer else: # both complete return partial.right() == full.right() + elif partial.incomplete_at_3prime(): + return full.left() <= partial.left()+buffer + elif full.incomplete_at_3prime(): + return partial.left() <= full.left()+buffer else: - if partial.incomplete_at_3prime(): - return full.left() <= partial.left()+buffer - elif full.incomplete_at_3prime(): - return partial.left() <= full.left()+buffer - else: - return partial.left() == full.left() + return partial.left() == full.left() # def has_same_5prime_and_3prime_end(partial, full): # # type: (Label, Label) -> bool @@ -300,10 +284,7 @@ 
def compare_chunked_prediction_to_annotation(env, labels_pred, labels_ref_fp, la # first, get interval where this label exists if lab.seqname() in ref_intervals and lab.strand() in ref_intervals[lab.seqname()]: overlaps = ref_intervals[lab.seqname()][lab.strand()].overlap(lab.left(), lab.right()) - if len(overlaps) == 0: - # result["FP"] += 1 - pass - else: + if len(overlaps) != 0: largest_overlap, score = get_interval_with_largest_overlap(lab, overlaps) if has_same_3prime_end(lab, largest_overlap.data): @@ -362,14 +343,13 @@ def filter_labels_shorter_than(labels, threshold_nt, threshold_nt_partial=None): if threshold_nt_partial is None: threshold_nt_partial = threshold_nt - list_labels = list() + list_labels = [] for lab in labels: if lab.is_partial(): if lab.length() >= threshold_nt_partial: list_labels.append(lab) - else: - if lab.length() >= threshold_nt: - list_labels.append(lab) + elif lab.length() >= threshold_nt: + list_labels.append(lab) return Labels(list_labels, name=labels.name) @@ -393,7 +373,7 @@ def stats_per_gene_on_chunks_for_genome(env, df_summary_genome, reference_tools_ reference_tools_fn = sorted(reference_tools_fn) if isinstance(reference_tools_fn, list) else reference_tools_fn min_gene_length_nt = get_value(kwargs, "min_gene_length_nt", 90) - list_entries = list() + list_entries = [] for genome, df_genome in df_summary_genome.groupby("Genome", as_index=False): pf_sequence = os_join(env["pd-data"], genome, "sequence.fasta") @@ -423,7 +403,7 @@ def stats_per_gene_on_chunks_for_genome(env, df_summary_genome, reference_tools_ ref_labels_fp = filter_labels_shorter_than(ref_labels_fp, min_gene_length_nt) - for chunk_size, df_chunk in df_genome.groupby("Chunk Size", as_index=False): # type: int, pd.DataFrame + for chunk_size, df_chunk in df_genome.groupby("Chunk Size", as_index=False):# type: int, pd.DataFrame # read all label files for chunk tool_to_labels = { @@ -464,73 +444,74 @@ def stats_per_gene_on_chunks_for_genome(env, df_summary_genome, reference_tools_ chunked_ref_labels_fn = apply_genome_splitter_to_labels(seqname_to_chunks, ref_labels_fn) chunked_ref_labels_fp = apply_genome_splitter_to_labels(seqname_to_chunks, ref_labels_fp) - for t in tool_to_labels: - # if t != "mgm2": - # continue - list_entries.append( - { - "Genome": genome, - "Tool": t, - "Genome GC": genome_gc, - "Chunk Size": chunk_size, - **compare_chunked_prediction_to_annotation(env, tool_to_labels[t], chunked_ref_labels_fp, - chunked_ref_labels_fn) - } - ) - - # - # # Add prediction info to dataframe - # - # name_to_labels = copy.copy(tool_to_labels) - # name_to_labels.update(chunked_reference_labels) - # - # all_labels = list(chunked_reference_labels.values()) + list(tool_to_labels.values()) - # keys_3prime = get_unique_gene_keys(*all_labels) # all 3prime keys - # - - - - - # # Each gene key will have a row in the dataframe - # # Columns will indicate whether it was 3p and 5p were predicted by each tool - # for key in keys_3prime: - # entry = dict() - # - # - # - # shortest_label = None - # for t in name_to_labels.keys(): - # - # label = name_to_labels[t].get_by_3prime_key(key) - # if label is None: - # entry[f"5p-{t}"] = None # 5prime end - # entry[f"3p-{t}"] = None - # else: - # entry[f"5p-{t}"] = label.get_5prime() - # entry[f"3p-{t}"] = label.get_3prime() - # if shortest_label is None: - # shortest_label = label - # elif shortest_label.length() < label.length(): - # shortest_label = label - # - # entry[f"Partial3p-{t}"] = label.incomplete_at_3prime() - # entry[f"Partial5p-{t}"] = 
label.incomplete_at_5prime() - # - # - # - # # compute GC of label - # gene_gc = 0 # compute_gc(sequences, shortest_label) - # - # list_entries.append({ - # "3p-key": key, - # "Genome": genome, - # "Genome GC": genome_gc, - # "Gene GC": gene_gc, - # "Chunk Size": chunk_size, - # "Runtime": df_chunk["Runtime"].mean(), - # "Clade": clade, - # **entry - # }) + list_entries.extend( + { + "Genome": genome, + "Tool": t, + "Genome GC": genome_gc, + "Chunk Size": chunk_size, + **compare_chunked_prediction_to_annotation( + env, + value, + chunked_ref_labels_fp, + chunked_ref_labels_fn, + ), + } + for t, value in tool_to_labels.items() + ) + # + # # Add prediction info to dataframe + # + # name_to_labels = copy.copy(tool_to_labels) + # name_to_labels.update(chunked_reference_labels) + # + # all_labels = list(chunked_reference_labels.values()) + list(tool_to_labels.values()) + # keys_3prime = get_unique_gene_keys(*all_labels) # all 3prime keys + # + + + + + # # Each gene key will have a row in the dataframe + # # Columns will indicate whether it was 3p and 5p were predicted by each tool + # for key in keys_3prime: + # entry = dict() + # + # + # + # shortest_label = None + # for t in name_to_labels.keys(): + # + # label = name_to_labels[t].get_by_3prime_key(key) + # if label is None: + # entry[f"5p-{t}"] = None # 5prime end + # entry[f"3p-{t}"] = None + # else: + # entry[f"5p-{t}"] = label.get_5prime() + # entry[f"3p-{t}"] = label.get_3prime() + # if shortest_label is None: + # shortest_label = label + # elif shortest_label.length() < label.length(): + # shortest_label = label + # + # entry[f"Partial3p-{t}"] = label.incomplete_at_3prime() + # entry[f"Partial5p-{t}"] = label.incomplete_at_5prime() + # + # + # + # # compute GC of label + # gene_gc = 0 # compute_gc(sequences, shortest_label) + # + # list_entries.append({ + # "3p-key": key, + # "Genome": genome, + # "Genome GC": genome_gc, + # "Gene GC": gene_gc, + # "Chunk Size": chunk_size, + # "Runtime": df_chunk["Runtime"].mean(), + # "Clade": clade, + # **entry + # }) print(pd.DataFrame(list_entries).to_csv()) @@ -605,7 +586,7 @@ def main(env, args): df.reset_index(inplace=True) bs = args.batch_size - list_df = list([x[1] for x in df.groupby("Genome", as_index=False)]) + list_df = [x[1] for x in df.groupby("Genome", as_index=False)] start = 0 end = min(bs, len(list_df)) diff --git a/code/python/driver/stats_tools_on_chunks.py b/code/python/driver/stats_tools_on_chunks.py index 76a77e5..8438d88 100644 --- a/code/python/driver/stats_tools_on_chunks.py +++ b/code/python/driver/stats_tools_on_chunks.py @@ -51,7 +51,7 @@ def stats_tools_on_chunks(env, df): # type: (Environment, pd.DataFrame) -> pd.DataFrame - list_entries = list() + list_entries = [] for idx in tqdm(df.index, total=len(df)): pf_prediction = df.at[idx, "Predictions"] diff --git a/code/python/driver/test_component_on_verified.py b/code/python/driver/test_component_on_verified.py index 4ff8291..6916c64 100644 --- a/code/python/driver/test_component_on_verified.py +++ b/code/python/driver/test_component_on_verified.py @@ -52,10 +52,6 @@ def test_component_for_gi(env, gi, list_pf_mgm, list_component): - # type: (Environment, GenomeInfo, List[str], List[str]) -> pd.DataFrame - - list_entries = list() - pd_gi = os_join(env["pd-work"], gi.name) mkdir_p(pd_gi) @@ -68,8 +64,7 @@ def test_component_for_gi(env, gi, list_pf_mgm, list_component): ##### GMS2 results = run_gms2_with_component_toggles_and_get_accuracy(env_dup, gi, set(), native_coding_off=False) - list_entries.append({"Tool": "GMS2", 
**results}) - + list_entries = [{"Tool": "GMS2", **results}] # ##### MGM + native component: MGM with native trained component # results = run_gms2_with_component_toggles_and_get_accuracy(env_dup, gi, components_off, native_coding_off=True) # list_entries.append({"Tool": f"MGM: Native {component}", **results}) @@ -77,11 +72,11 @@ def test_component_for_gi(env, gi, list_pf_mgm, list_component): ##### MGM + GC component: MGM from new model for pf_mgm, component in zip(list_pf_mgm, list_component): results = run_mgm2_and_get_accuracy(env_dup, gi, pf_mgm) - list_entries.append({"Tool": f"MGM2", **results}) + list_entries.append({"Tool": "MGM2", **results}) ##### MGM results = run_mgm_and_get_accuracy(env_dup, gi, os_join(env["pd-bin-external"], "gms2", "mgm_11.mod")) - list_entries.append({"Tool": f"MGM", **results}) + list_entries.append({"Tool": "MGM", **results}) return pd.DataFrame(list_entries) @@ -89,7 +84,7 @@ def test_component_for_gi(env, gi, list_pf_mgm, list_component): def test_component_on_verified(env, gil, list_pf_mgm_bac, list_pf_mgm_arc, list_component): # type: (Environment, GenomeInfoList, List[str], List[str], List[str]) -> None - list_df = list() + list_df = [] for gi in gil: if "Halobacterium" in gi.name or "pharaonis" in gi.name or "pernix" in gi.name: diff --git a/code/python/driver/test_gms2_components_on_verified_set.py b/code/python/driver/test_gms2_components_on_verified_set.py index e739a81..be1bad8 100644 --- a/code/python/driver/test_gms2_components_on_verified_set.py +++ b/code/python/driver/test_gms2_components_on_verified_set.py @@ -53,7 +53,7 @@ def analyze_gms2_components_on_verified_set_for_gi(env, gi): # type: (Environment, GenomeInfo) -> pd.DataFrame - list_entries = list() + list_entries = [] start_components = { "Start Codons", "Start Context", "RBS", "Promoter", @@ -66,14 +66,14 @@ def analyze_gms2_components_on_verified_set_for_gi(env, gi): for component_on in sorted(start_components) + ["MGM2*", "MGM", "GMS2"]: components_off = start_components.difference({component_on}) - if component_on == "MGM2*" or component_on == "GMS2": + if component_on in ["MGM2*", "GMS2"]: components_off = set() elif component_on == "MGM": pass elif not component_in_model_file(env, gi, component_on) and component_on not in {"MGM2*", "MGM", "GMS2"}: continue - native_coding_off = False if component_on == "GMS2" else True + native_coding_off = component_on != "GMS2" pd_gi_component = os_join(pd_gi, component_on).replace(" ", "") mkdir_p(pd_gi_component) @@ -81,13 +81,9 @@ def analyze_gms2_components_on_verified_set_for_gi(env, gi): env_dup = env.duplicate({"pd-work": pd_gi_component}) if component_on == "Start Context": - component_on = {component_on} # "rbs", "promoter"} components_off.remove("RBS") components_off.remove("Promoter") - else: - component_on = {component_on} - - + component_on = {component_on} # "rbs", "promoter"} results = run_gms2_with_component_toggles_and_get_accuracy(env_dup, gi, components_off, native_coding_off=native_coding_off) @@ -105,15 +101,9 @@ def analyze_gms2_components_on_verified_set_for_gi(env, gi): def analyze_gms2_components_on_verified_set(env, gil): - # type: (Environment, GenomeInfoList) -> None - - # run different components - list_df = list() - for gi in gil: - list_df.append( - analyze_gms2_components_on_verified_set_for_gi(env, gi) - ) - + list_df = [ + analyze_gms2_components_on_verified_set_for_gi(env, gi) for gi in gil + ] df = pd.concat(list_df, ignore_index=True, sort=False) df["Genome"] = df.apply(fix_names, axis=1) 
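# --- Editor's illustration (a hedged sketch, not part of the patch; Python >= 3.8 assumed) ---
# Several hunks in this refactor collapse "append inside a loop, concat at the end"
# into a single comprehension, as in the hunk just above and in stats_per_gene.py.
# A minimal runnable sketch of that pattern; `collect_frames` and `make_df` are
# hypothetical stand-ins for per-genome workers like
# analyze_gms2_components_on_verified_set_for_gi:
import pandas as pd

def collect_frames(items, make_df):
    # Build one DataFrame per item and concatenate once at the end; the
    # walrus assignment guards the empty-input case.
    if list_df := [make_df(x) for x in items]:
        return pd.concat(list_df, ignore_index=True, sort=False)
    return pd.DataFrame()
# --- end illustration ---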
print(df.to_csv()) diff --git a/code/python/driver/test_rbs_built_from_most_conserved_motifs.py b/code/python/driver/test_rbs_built_from_most_conserved_motifs.py index d9a2068..2fbca6e 100644 --- a/code/python/driver/test_rbs_built_from_most_conserved_motifs.py +++ b/code/python/driver/test_rbs_built_from_most_conserved_motifs.py @@ -141,7 +141,7 @@ def test_rbs_built_from_most_conserved_motifs_for_gi(env, gi, **kwargs): gms2_mod = GMS2Mod.init_from_file(pf_mod) - list_entries = list() + list_entries = [] for thresh in tqdm(np.arange(-2.0, 6, 0.2)): while current_index < len(labels) and float( @@ -190,7 +190,7 @@ def test_rbs_built_from_most_conserved_motifs(env, gil, **kwargs): if pf_trained is None: - list_df = list() + list_df = [] for gi in gil: # if "denitrificans" not in gi.name: # continue diff --git a/code/python/driver/test_start_codon_perturbations.py b/code/python/driver/test_start_codon_perturbations.py index 744f184..26979de 100644 --- a/code/python/driver/test_start_codon_perturbations.py +++ b/code/python/driver/test_start_codon_perturbations.py @@ -106,7 +106,7 @@ def test_start_codon_perturbation_for_gi(env, gi, p, gms2_mod): } def run_perturbations(env, gi, perturbations, gms2_mod): - list_entries = list() + list_entries = [] for p in tqdm(perturbations, f"{gi.name}", leave=False, total=len(perturbations)): entry = test_start_codon_perturbation_for_gi(env, gi, p, gms2_mod) @@ -130,43 +130,35 @@ def test_start_codon_perturbations_for_gi(env, gi, **kwargs): # sample perturbations perturbations = np.random.normal(alpha_mean, alpha_std, num_perturbations) - list_entries = list() + list_entries = [] # for p in tqdm(perturbations, f"{gi.name}", leave=False, total=len(perturbations)): # entry = test_start_codon_perturbation_for_gi(env, gi, p, gms2_mod) # list_entries.append(entry) list_entries = run_n_per_thread( - [x for x in perturbations], + list(perturbations), test_start_codon_perturbation_for_gi, "p", - {"gi": gi, "env": env, "gms2_mod": gms2_mod} + {"gi": gi, "env": env, "gms2_mod": gms2_mod}, ) return pd.DataFrame(list_entries) def test_start_codon_perturbations(env, gil, **kwargs): - # type: (Environment, GenomeInfoList, Dict[str, Any]) -> None - - from_existing = get_value(kwargs, "from_existing", False) - - if not from_existing: - list_df = list() - counter = 0 - for gi in tqdm(gil, "Genomes", total=len(gil)): - list_df.append(test_start_codon_perturbations_for_gi(env, gi, **kwargs)) - counter += 1 - # if counter == 2: - # break + if from_existing := get_value(kwargs, "from_existing", False): + df = pd.read_csv(os_join(env["pd-work"], "summary.csv")) + else: + list_df = [ + test_start_codon_perturbations_for_gi(env, gi, **kwargs) + for gi in tqdm(gil, "Genomes", total=len(gil)) + ] df = pd.concat(list_df, ignore_index=True, sort=False) df["Genome"] = df.apply(fix_names, axis=1) df.to_csv(os_join(env["pd-work"], "summary.csv"), index=False) - else: - df = pd.read_csv(os_join(env["pd-work"], "summary.csv")) - sns.catplot(df, "Genome", "Error") sns.lmplot(df, "Perturbation", "Error", hue="Genome", sns_kwargs={"lowess": True}) diff --git a/code/python/driver/update_model_from_model.py b/code/python/driver/update_model_from_model.py index 01ca011..53fe078 100644 --- a/code/python/driver/update_model_from_model.py +++ b/code/python/driver/update_model_from_model.py @@ -51,7 +51,7 @@ def get_tags_for_motif(motif, group): group = group.upper() prefix = f"{motif}_{group}" - entries = list() + entries = [] for index in [0,1,2]: p_index = f"{prefix}_{index}" @@ -83,19 +83,19 @@ 
def get_tags_for_start_context(sc_tag, group): def component_to_tags(component, group): # type: (str, str) -> List[str] - if component.upper() == "RBS" or component.upper() == "PROMOTER": + if component.upper() in ["RBS", "PROMOTER"]: return get_tags_for_motif(component, group) if component.upper() == "STARTS": return [f"{s}_{group}" for s in ["ATG", "GTG", "TTG"]] if component.upper() in {"SC_RBS", "SC_PROMOTER"}: return get_tags_for_start_context(component.upper(), group) - return list() + return [] def get_tags_from_components(list_components, list_groups): # type: (List[str], List[str]) -> List[str] - list_tags = list() + list_tags = [] for c in list_components: for g in list_groups: list_tags += component_to_tags(c, g) diff --git a/code/python/driver/viz_gms2_model.py b/code/python/driver/viz_gms2_model.py index fa06a5f..73155d7 100644 --- a/code/python/driver/viz_gms2_model.py +++ b/code/python/driver/viz_gms2_model.py @@ -48,7 +48,7 @@ def viz_motif_and_spacer(env, mod, motif, **kwargs): # type: (Environment, GMS2Mod, str, Dict[str, Any]) -> None motif_mat = MotifModel(mod.items[f"{motif}_MAT"], mod.items[f"{motif}_POS_DISTR"]) - nonc_mat = GMS2Noncoding(mod.items[f"NON_MAT"]) + nonc_mat = GMS2Noncoding(mod.items["NON_MAT"]) df_motif_mat = motif_mat.pwm_to_df() np_nonc_mat = nonc_mat.pwm_to_array(0) @@ -68,7 +68,7 @@ def viz_motif_and_spacer(env, mod, motif, **kwargs): # Plot spacer ax = ax_spacer - x = [a for a in range(len(motif_mat._spacer))] + x = list(range(len(motif_mat._spacer))) y = motif_mat._spacer seaborn.lineplot(x, y, ax=ax) diff --git a/code/python/driver/viz_gms2_models_over_gc.py b/code/python/driver/viz_gms2_models_over_gc.py index 7a04715..5c696f1 100644 --- a/code/python/driver/viz_gms2_models_over_gc.py +++ b/code/python/driver/viz_gms2_models_over_gc.py @@ -75,8 +75,8 @@ def viz_rbs(env, df): mod = df.at[idx, "Mod"] # get models - rbs_model = MotifModel(mod.items[f"RBS_MAT"], mod.items[f"RBS_POS_DISTR"]) - nonc_mat = GMS2Noncoding(mod.items[f"NON_MAT"]) + rbs_model = MotifModel(mod.items["RBS_MAT"], mod.items["RBS_POS_DISTR"]) + nonc_mat = GMS2Noncoding(mod.items["NON_MAT"]) np_nonc_mat = nonc_mat.pwm_to_array(0) # rbs @@ -175,7 +175,7 @@ def viz_genome_type_per_gc(env, list_gi, list_mod, list_gc): fig, axes = plt.subplots(num_rows, num_columns, sharey="all") fig_rbs, axes_rbs = plt.subplots(num_rows, num_columns, sharey="all") - list_df = list() + list_df = [] for i in range(num_rows): for j in range(num_columns): @@ -192,8 +192,10 @@ def viz_genome_type_per_gc(env, list_gi, list_mod, list_gc): break mod = df_c.at[df_c.index[i*num_columns + j], "Mod"] - motif_mat = MotifModel(mod.items[f"PROMOTER_MAT"], mod.items[f"PROMOTER_POS_DISTR"]) - nonc_mat = GMS2Noncoding(mod.items[f"NON_MAT"]) + motif_mat = MotifModel( + mod.items["PROMOTER_MAT"], mod.items["PROMOTER_POS_DISTR"] + ) + nonc_mat = GMS2Noncoding(mod.items["NON_MAT"]) list_df.append(pd.DataFrame({ "Distance": range(len(motif_mat._spacer)), @@ -212,8 +214,8 @@ def viz_genome_type_per_gc(env, list_gi, list_mod, list_gc): ax.set_ylim(0, 2.5) ax = axes_rbs[i][j] - motif_mat = MotifModel(mod.items[f"RBS_MAT"], mod.items[f"RBS_POS_DISTR"]) - nonc_mat = GMS2Noncoding(mod.items[f"NON_MAT"]) + motif_mat = MotifModel(mod.items["RBS_MAT"], mod.items["RBS_POS_DISTR"]) + nonc_mat = GMS2Noncoding(mod.items["NON_MAT"]) df_motif_mat = motif_mat.pwm_to_df() np_nonc_mat = nonc_mat.pwm_to_array(0) @@ -240,9 +242,11 @@ def viz_genome_type_per_gc(env, list_gi, list_mod, list_gc): mod = df_c.at[df_c.index[i], "Mod"] # get 
models - rbs_model = MotifModel(mod.items[f"RBS_MAT"], mod.items[f"RBS_POS_DISTR"]) - prom_model = MotifModel(mod.items[f"PROMOTER_MAT"], mod.items[f"PROMOTER_POS_DISTR"]) - nonc_mat = GMS2Noncoding(mod.items[f"NON_MAT"]) + rbs_model = MotifModel(mod.items["RBS_MAT"], mod.items["RBS_POS_DISTR"]) + prom_model = MotifModel( + mod.items["PROMOTER_MAT"], mod.items["PROMOTER_POS_DISTR"] + ) + nonc_mat = GMS2Noncoding(mod.items["NON_MAT"]) np_nonc_mat = nonc_mat.pwm_to_array(0) # rbs @@ -299,8 +303,6 @@ def viz_genome_type_per_gc(env, list_gi, list_mod, list_gc): leg = fig.legend(handles, labels, bbox_to_anchor=(0.5, 0.1), loc='upper center', ncol=2, bbox_transform=fig.transFigure, frameon=False) - # fontsize=fontsize) - for lh in leg.legendHandles: lh.set_alpha(1) # lh.set_sizes([18] * 2) @@ -327,9 +329,9 @@ def helper_read_genome_data(env, gil, **kwargs): dn_gms2 = get_value(kwargs, "dn_gms2", "gms2") prl_options = get_value(kwargs, "prl_options", None) - list_gi = list() - list_mod = list() - list_gc = list() + list_gi = [] + list_mod = [] + list_gc = [] for gi in tqdm(gil, total=len(gil)): try: @@ -352,9 +354,9 @@ def read_genome_data(env, gil, **kwargs): dn_gms2 = get_value(kwargs, "dn_gms2", "gms2") prl_options = get_value(kwargs, "prl_options", None) - list_gi = list() - list_mod = list() - list_gc = list() + list_gi = [] + list_mod = [] + list_gc = [] if not prl_options or not prl_options["use-pbs"]: list_gi, list_mod, list_gc = helper_read_genome_data(env, gil, **kwargs) diff --git a/code/python/driver/viz_group_distribution_across_gc.py b/code/python/driver/viz_group_distribution_across_gc.py index 60c08da..50e1931 100644 --- a/code/python/driver/viz_group_distribution_across_gc.py +++ b/code/python/driver/viz_group_distribution_across_gc.py @@ -68,7 +68,7 @@ def helper_read_group_data(env, gil, **kwargs): # type: (Environment, GenomeInfoList, Dict[str, Any]) -> pd.DataFrame dn_gms2 = get_value(kwargs, "dn_gms2", "gms2") - list_entries = list() + list_entries = [] for gi in gil: try: @@ -102,18 +102,15 @@ def read_group_data(env, gil, **kwargs): prl_options = get_value(kwargs, "prl_options", None) if not prl_options or not prl_options["use-pbs"]: - df = helper_read_group_data(env, gil, **kwargs) - else: - pbs = PBS(env, prl_options, splitter=split_gil, merger=merge_identity) - list_results = pbs.run( - gil, - helper_read_group_data, - {"env": env, **kwargs} - ) - - df = pd.concat(list_results, ignore_index=True, sort=False) - - return df + return helper_read_group_data(env, gil, **kwargs) + pbs = PBS(env, prl_options, splitter=split_gil, merger=merge_identity) + list_results = pbs.run( + gil, + helper_read_group_data, + {"env": env, **kwargs} + ) + + return pd.concat(list_results, ignore_index=True, sort=False) def viz_gms2_groups_over_gc(env, df): diff --git a/code/python/driver/viz_mgm_model.py b/code/python/driver/viz_mgm_model.py index c468001..25e2a32 100644 --- a/code/python/driver/viz_mgm_model.py +++ b/code/python/driver/viz_mgm_model.py @@ -70,15 +70,13 @@ def visualize_motif(env, mgm, genome_type, tag, **kwargs): gc_mod_pair = dict() # type: Dict[float, MGMModelGC] - for i, gc_tag in enumerate(all_gc_tags): + for gc_tag in all_gc_tags: mgm_mod_gc = mgm.items_by_species_and_gc[genome_type[0]][gc_tag] - if f"{tag}_MAT" not in mgm_mod_gc.items.keys(): - continue - - gc_mod_pair[gc_tag] = mgm_mod_gc + if f"{tag}_MAT" in mgm_mod_gc.items.keys(): + gc_mod_pair[gc_tag] = mgm_mod_gc - if len(gc_mod_pair) == 0: + if not gc_mod_pair: return fig, axes = plt.subplots(num_rows, num_cols, 
sharex="all", sharey="all") @@ -90,7 +88,7 @@ def visualize_motif(env, mgm, genome_type, tag, **kwargs): mgm_mod_gc = gc_mod_pair[gc_tag] motif_mat = MotifModel(mgm_mod_gc.items[f"{tag}_MAT"], mgm_mod_gc.items[f"{tag}_POS_DISTR"]) - nonc_mat = GMS2Noncoding(mgm_mod_gc.items[f"NON_MAT"]) + nonc_mat = GMS2Noncoding(mgm_mod_gc.items["NON_MAT"]) df_motif_mat = motif_mat.pwm_to_df() np_nonc_mat = nonc_mat.pwm_to_array(0) @@ -107,7 +105,7 @@ def visualize_motif(env, mgm, genome_type, tag, **kwargs): # spacer ax = axes_sp.ravel()[i] - x = [a for a in range(len(motif_mat._spacer))] + x = list(range(len(motif_mat._spacer))) y = motif_mat._spacer seaborn.lineplot(x, y, ax=ax) @@ -127,20 +125,36 @@ def visualize_start_context(env, mgm, genome_type, tag, **kwargs): all_gc_tags = sorted(mgm.items_by_species_and_gc[genome_type[0]]) # get all possible words in start contexts - all_words = sorted(set( - w for gc_tag in all_gc_tags - for w in mgm.items_by_species_and_gc[genome_type[0]][gc_tag].items[f"{tag}_MAT"] - if f"{tag}_MAT" in mgm.items_by_species_and_gc[genome_type[0]][gc_tag].items - - )) - - all_positions = sorted(set( - p for gc_tag in all_gc_tags - for w in mgm.items_by_species_and_gc[genome_type[0]][gc_tag].items[f"{tag}_MAT"] - if f"{tag}_MAT" in mgm.items_by_species_and_gc[genome_type[0]][gc_tag].items - for p in range(len(mgm.items_by_species_and_gc[genome_type[0]][gc_tag].items[f"{tag}_MAT"][w])) - - )) + all_words = sorted( + { + w + for gc_tag in all_gc_tags + for w in mgm.items_by_species_and_gc[genome_type[0]][gc_tag].items[ + f"{tag}_MAT" + ] + if f"{tag}_MAT" + in mgm.items_by_species_and_gc[genome_type[0]][gc_tag].items + } + ) + + all_positions = sorted( + { + p + for gc_tag in all_gc_tags + for w in mgm.items_by_species_and_gc[genome_type[0]][gc_tag].items[ + f"{tag}_MAT" + ] + if f"{tag}_MAT" + in mgm.items_by_species_and_gc[genome_type[0]][gc_tag].items + for p in range( + len( + mgm.items_by_species_and_gc[genome_type[0]][gc_tag].items[ + f"{tag}_MAT" + ][w] + ) + ) + } + ) if len(all_words) == 0: return @@ -153,8 +167,8 @@ def visualize_start_context(env, mgm, genome_type, tag, **kwargs): for i, w in tqdm(enumerate(all_words), f"Words in position {p}", total=len(all_words), leave=True, position=2): ax = axes.ravel()[i] - x = list() - y = list() + x = [] + y = [] for gc_tag in all_gc_tags: mod = mgm.items_by_species_and_gc[genome_type[0]][gc_tag] @@ -181,8 +195,8 @@ def visualize_start_codons(env, mgm, genome_type, gms2_group, **kwargs): fig, axes = plt.subplots() for s in starts: - x = list() - y = list() + x = [] + y = [] for gc_tag in all_gc_tags: mod = mgm.items_by_species_and_gc[genome_type[0]][gc_tag] diff --git a/code/python/driver/viz_motifs_across_gc.py b/code/python/driver/viz_motifs_across_gc.py index 987433a..b4ed366 100644 --- a/code/python/driver/viz_motifs_across_gc.py +++ b/code/python/driver/viz_motifs_across_gc.py @@ -90,6 +90,10 @@ def visualize_matrix_column(env, df, col): gc = df["GC"] group = df["GENOME_TYPE"] + # with cbar + legend_size = 20 + fontsize="x-large" + for r in range(1): reducer = umap.UMAP(random_state=r) @@ -174,10 +178,8 @@ def visualize_matrix_column(env, df, col): fig.savefig(next_name(env["pd-work"]), box_inches='tight') plt.show() - - # with cbar - legend_size = 20 + # figsize = set_size("thesis", subplots=(2,3)) fig = plt.figure(figsize=(17, 6)) #subplots(1, 2, figsize=(12, 6)) # fig = plt.figure(figsize=figsize) # subplots(1, 2, figsize=(12, 6)) @@ -190,9 +192,7 @@ def visualize_matrix_column(env, df, col): # cbar_pad=0.1 ) - axes = 
[ax for ax in grid] - fontsize="x-large" - + axes = list(grid) df_tmp = pd.DataFrame({ "x1": embedding[:, 0], "x2": embedding[:, 1], "Type": df["Type"].values }) @@ -216,7 +216,7 @@ def visualize_matrix_column(env, df, col): leg = axes[1].legend(loc="upper center", ncol=3, fontsize=fontsize, bbox_to_anchor=(0.5, 0), frameon=False) leg2 = leg - for lh in leg.legendHandles: + for lh in leg2.legendHandles: lh.set_sizes([legend_size] * len(df_tmp["Group"].unique())) mappable = create_mappable_for_colorbar(gc, "viridis") diff --git a/code/python/driver/viz_stats_per_gene.py b/code/python/driver/viz_stats_per_gene.py index fa779b8..213a2a6 100644 --- a/code/python/driver/viz_stats_per_gene.py +++ b/code/python/driver/viz_stats_per_gene.py @@ -48,7 +48,7 @@ def get_stats_at_gcfid_level(df, tools): # type: (pd.DataFrame) -> pd.DataFrame - list_entries = list() + list_entries = [] ps = powerset(tools, min_len=2) @@ -163,7 +163,7 @@ def tools_match_for_dataframe_row(r, tools): # type: (pd.Series, Iterable[str]) -> bool # check all tools make a prediction for current gene - list_5ps = list() + list_5ps = [] for t in tools: if r[f"5p-{t}"] is None: diff --git a/code/python/driver/viz_stats_per_gene_on_chunks.py b/code/python/driver/viz_stats_per_gene_on_chunks.py index 5d4be80..6ef5f7a 100644 --- a/code/python/driver/viz_stats_per_gene_on_chunks.py +++ b/code/python/driver/viz_stats_per_gene_on_chunks.py @@ -67,7 +67,7 @@ def get_stats_at_gcfid_level_with_reference(df, tools, reference): # type: (pd.DataFrame, List[str], str) -> pd.DataFrame - list_entries = list() + list_entries = [] for gcfid, df_group in df.groupby("Genome", as_index=False): @@ -122,7 +122,7 @@ def get_stats_at_gcfid_level_with_reference(df, tools, reference): for t in tools + [reference]: result[f"Number of Predictions({t},{t})"] = df_group[f"5p-{t}"].count() - result[f"Runtime({t},{t})"] = df_group[f"Runtime"].mean() + result[f"Runtime({t},{t})"] = df_group["Runtime"].mean() if t != reference: result[f"Precision({t},{reference})"] = result[f"Number of Found({t},{reference})"] / result[ f"Number of Predictions({t},{t})"] @@ -136,7 +136,7 @@ def get_stats_at_gcfid_level_with_reference(df, tools, reference): result[f"Specificity({t},{reference})"] = result[f"Number of Found({t},{reference})"] / result[ f"Number of Predictions({t},{t})"] - # result[f"Runtime({t, t})"] = df_group[f"Runtime"].mean() + # result[f"Runtime({t, t})"] = df_group[f"Runtime"].mean() result["Genome"] = gcfid result["Genome GC"] = df_group.at[df_group.index[0], "Genome GC"] @@ -366,7 +366,7 @@ def viz_plot_per_genome_y_error_x_chunk(env, df): "Number of IC5p Match", "Number of IC5p Found", "Number of IC3p Match", "Number of IC3p Found", "Number of Comp Match", "Number of Comp Found", "Precision", "Recall", "WR", "Number of Missed", "IC3p Match", "IC5p Match", "Comp Match"] - df_total = list() + df_total = [] for v in values_to_melt: if v == "Precision": print('hi') @@ -476,32 +476,6 @@ def viz_plot_per_genome_y_error_x_chunk(env, df): return - fig, axes = plt.subplots(2, 4, sharey="all", sharex="all") - axes = axes.ravel() - for i, g in enumerate(genomes): - ax = axes[i] # type: plt.Axes - - df_curr = df[df["Genome"] == g] - df_curr = pd.melt(df_curr, id_vars=["Genome", "Chunk Size"], - value_vars=[x for x in df_curr.columns if "Number of Error(" in x], - var_name="Combination", value_name="Number of Error") - - seaborn.lineplot("Chunk Size", "Number of Error", data=df_curr, hue="Combination", ax=ax, legend=False) - - plt.show() - fig, axes = plt.subplots(2, 4, 
sharey="all", sharex="all") - axes = axes.ravel() - - for i, g in enumerate(genomes): - ax = axes[i] # type: plt.Axes - df_curr = df[df["Genome"] == g] - df_curr = pd.melt(df_curr, id_vars=["Genome", "Chunk Size"], - value_vars=[x for x in df_curr.columns if "Number of Found(" in x], - var_name="Combination", value_name="Number of Found") - seaborn.lineplot("Chunk Size", "Number of Found", data=df_curr, hue="Combination", ax=ax, legend=False) - - plt.show() - def viz_plot_per_genome_5p(env, df_gcfid): # type: (Environment, pd.DataFrame) -> None diff --git a/code/python/driver/viz_stats_per_gene_on_chunks_large.py b/code/python/driver/viz_stats_per_gene_on_chunks_large.py index 07494c5..c7ccb6f 100644 --- a/code/python/driver/viz_stats_per_gene_on_chunks_large.py +++ b/code/python/driver/viz_stats_per_gene_on_chunks_large.py @@ -74,7 +74,7 @@ def get_stats_at_gcfid_level_with_reference(df, tools, reference): # type: (pd.DataFrame, List[str], str) -> pd.DataFrame - list_entries = list() + list_entries = [] for t in tools + [reference]: df = df[~((df[f"Partial5p-{t}"] == True) & (df[f"Partial3p-{t}"] == True))] @@ -132,7 +132,7 @@ def get_stats_at_gcfid_level_with_reference(df, tools, reference): for t in tools + [reference]: result[f"Number of Predictions({t},{t})"] = df_group[f"5p-{t}"].count() - result[f"Runtime({t},{t})"] = df_group[f"Runtime"].mean() + result[f"Runtime({t},{t})"] = df_group["Runtime"].mean() if t != reference: result[f"Precision({t},{reference})"] = result[f"Number of Found({t},{reference})"] / result[ f"Number of Predictions({t},{t})"] @@ -146,7 +146,7 @@ def get_stats_at_gcfid_level_with_reference(df, tools, reference): result[f"Specificity({t},{reference})"] = result[f"Number of Found({t},{reference})"] / result[ f"Number of Predictions({t},{t})"] - # result[f"Runtime({t, t})"] = df_group[f"Runtime"].mean() + # result[f"Runtime({t, t})"] = df_group[f"Runtime"].mean() result["Genome"] = gcfid result["Genome GC"] = df_group.at[df_group.index[0], "Genome GC"] @@ -376,7 +376,7 @@ def viz_plot_per_genome_y_error_x_chunk(env, df): "Number of IC5p Match", "Number of IC5p Found", "Number of IC3p Match", "Number of IC3p Found", "Number of Comp Match", "Number of Comp Found", "Precision", "Recall", "WR", "Number of Missed", "IC3p Match", "IC5p Match", "Comp Match"] - df_total = list() + df_total = [] for v in values_to_melt: if v == "Precision": print('hi') @@ -486,32 +486,6 @@ def viz_plot_per_genome_y_error_x_chunk(env, df): return - fig, axes = plt.subplots(2, 4, sharey="all", sharex="all") - axes = axes.ravel() - for i, g in enumerate(genomes): - ax = axes[i] # type: plt.Axes - - df_curr = df[df["Genome"] == g] - df_curr = pd.melt(df_curr, id_vars=["Genome", "Chunk Size"], - value_vars=[x for x in df_curr.columns if "Number of Error(" in x], - var_name="Combination", value_name="Number of Error") - - seaborn.lineplot("Chunk Size", "Number of Error", data=df_curr, hue="Combination", ax=ax, legend=False) - - plt.show() - fig, axes = plt.subplots(2, 4, sharey="all", sharex="all") - axes = axes.ravel() - - for i, g in enumerate(genomes): - ax = axes[i] # type: plt.Axes - df_curr = df[df["Genome"] == g] - df_curr = pd.melt(df_curr, id_vars=["Genome", "Chunk Size"], - value_vars=[x for x in df_curr.columns if "Number of Found(" in x], - var_name="Combination", value_name="Number of Found") - seaborn.lineplot("Chunk Size", "Number of Found", data=df_curr, hue="Combination", ax=ax, legend=False) - - plt.show() - def viz_plot_per_genome_5p(env, df_gcfid): # type: (Environment, 
pd.DataFrame) -> None @@ -753,7 +727,9 @@ def viz_stats_5p_error_rate_partial(env, df_tidy, reference): df2_tidy = reduce(lambda df1, df2: pd.merge(df1, df2, on=["Chunk Size", "Condition", "Tool"], how="outer"), [df2_tidy, df_tmp]) - df2_tidy[f"Error Rate"] = (df2_tidy[f"Found"] - df2_tidy[f"Match"]) / df2_tidy[f"Found"] + df2_tidy["Error Rate"] = ( + df2_tidy["Found"] - df2_tidy["Match"] + ) / df2_tidy["Found"] df2_tidy["Condition"].replace({ "IC5p": "Incomplete at Gene Start", @@ -765,7 +741,7 @@ def viz_stats_5p_error_rate_partial(env, df_tidy, reference): g = seaborn.FacetGrid(df2_tidy, col="Condition", hue="Tool", sharey=True, palette=CM.get_map("tools"), hue_order=hue_order) - g.map(plt.plot, "Chunk Size", f"Error Rate") + g.map(plt.plot, "Chunk Size", "Error Rate") g.set_titles("{col_name}", style="italic") # g.set(ylim=(0, 1)) g.set(xlim=(0, 5100)) @@ -809,7 +785,7 @@ def viz_stats_5p_error_rate_partial(env, df_tidy, reference): df_tmp = reduce(lambda df1, df2: pd.merge(df1, df2, on=["Chunk Size", "Condition", "Tool"], how="outer"), [df2_tidy, df_tmp]) - df_tmp[f"Score"] = (df_tmp[f"Score"] - df_tmp[f"Match"]) / df_tmp[f"Score"] + df_tmp["Score"] = (df_tmp["Score"] - df_tmp["Match"]) / df_tmp["Score"] df_tmp["Metric"] = "Error Rate" df2_tidy = pd.concat([df2_tidy, df_tmp]) @@ -827,7 +803,7 @@ def viz_stats_5p_error_rate_partial(env, df_tidy, reference): row="Metric", hue_order=hue_order ) - g.map(plt.plot, "Chunk Size", f"Score") + g.map(plt.plot, "Chunk Size", "Score") g.set_titles("{col_name}", style="italic") # g.set(ylim=(0, 1)) # g.set(xlim=(0, 5100)) @@ -854,11 +830,10 @@ def _helper_join_reference_and_tidy_data(env, df_per_gene, tools, list_ref): reference = _helper_df_joint_reference(df_per_gene, list_ref) df_per_gene = update_dataframe_with_stats(df_per_gene, tools, reference).copy() - #### Genome Level - # compute stats per genome - df_stats_gcfid = list() - for _, df_group in df_per_gene.groupby("Chunk Size", as_index=False): - df_stats_gcfid.append(get_stats_at_gcfid_level_with_reference(df_group, tools, reference)) + df_stats_gcfid = [ + get_stats_at_gcfid_level_with_reference(df_group, tools, reference) + for _, df_group in df_per_gene.groupby("Chunk Size", as_index=False) + ] df_per_genome = pd.concat(df_stats_gcfid, ignore_index=True, sort=False) df_tidy = tidy_genome_level(env, df_per_genome) @@ -876,7 +851,7 @@ def yeild_from_file_per_genome_per_chunk(pf_data): gaat = 8 genome_to_df = dict() # type: Dict[str, Dict[str, pd.DataFrame]] - list_df_genome = list() + list_df_genome = [] prev_genome = None prev_chunk = None for df_chunk in gen_df_chunk: diff --git a/code/python/driver/viz_stats_per_gene_with_reference.py b/code/python/driver/viz_stats_per_gene_with_reference.py index 3e0f906..f70a1ba 100644 --- a/code/python/driver/viz_stats_per_gene_with_reference.py +++ b/code/python/driver/viz_stats_per_gene_with_reference.py @@ -64,7 +64,7 @@ def get_stats_at_gcfid_level_with_reference(df, tools, reference): # type: (pd.DataFrame, List[str], str) -> pd.DataFrame - list_entries = list() + list_entries = [] for gcfid, df_group in df.groupby("Genome", as_index=False): @@ -106,7 +106,7 @@ def get_stats_at_gcfid_level_with_reference(df, tools, reference): def viz_stats_as_function_of_reference_length(env, df_per_gene, tools, reference): # type: (Environment, pd.DataFrame, List[str], str) -> None - list_entries = list() + list_entries = [] max_length = np.nanmax(df_per_gene[f"Length({reference})"]) @@ -293,7 +293,7 @@ def viz_stats_3p_missed_vs_length(env, df_per_gene, 
tools, reference): min_length = df_with_reference.iloc[0][f"Length({reference})"] - list_entries = list() + list_entries = [] for t in tools: curr_length = min_length position = 0 @@ -331,7 +331,7 @@ def viz_stats_3p_missed_vs_length(env, df_per_gene, tools, reference): # collect in bins bins = [[0, 150], [150, 300], [300, 600], [600, 900], [900, float('inf')]] - list_entries = list() + list_entries = [] for t in tools + [reference]: df_tool = df_per_gene[~df_per_gene[f"5p-{t}"].isna()] for b in bins: @@ -392,11 +392,10 @@ def viz_stats_3p(env, df_per_gene, tools, list_ref): reference = _helper_df_joint_reference(df_per_gene, list_ref) df_per_gene = update_dataframe_with_stats(df_per_gene, tools, reference).copy() - #### Genome Level - # compute stats per genome - df_stats_gcfid = list() - for _, df_group in df_per_gene.groupby("Genome", as_index=False): - df_stats_gcfid.append(get_stats_at_gcfid_level_with_reference(df_group, tools, reference)) + df_stats_gcfid = [ + get_stats_at_gcfid_level_with_reference(df_group, tools, reference) + for _, df_group in df_per_gene.groupby("Genome", as_index=False) + ] df_per_genome = pd.concat(df_stats_gcfid, ignore_index=True, sort=False) df_tidy = tidy_genome_level(env, df_per_genome) @@ -433,11 +432,10 @@ def viz_stats_5p(env, df_per_gene, tools, list_ref): reference = _helper_df_joint_reference(df_per_gene, list_ref) df_per_gene = update_dataframe_with_stats(df_per_gene, tools, reference).copy() - #### Genome Level - # compute stats per genome - df_stats_gcfid = list() - for _, df_group in df_per_gene.groupby("Genome", as_index=False): - df_stats_gcfid.append(get_stats_at_gcfid_level_with_reference(df_group, tools, reference)) + df_stats_gcfid = [ + get_stats_at_gcfid_level_with_reference(df_group, tools, reference) + for _, df_group in df_per_gene.groupby("Genome", as_index=False) + ] df_per_genome = pd.concat(df_stats_gcfid, ignore_index=True, sort=False) df_tidy = tidy_genome_level(env, df_per_genome) @@ -468,7 +466,7 @@ def main(env, args): # get tools list # If not provided, extract from df # Make sure it doesn't contain any references - all_tools = sorted(set([x.split("-")[1] for x in df.columns if "5p-" in x])) + all_tools = sorted({x.split("-")[1] for x in df.columns if "5p-" in x}) # check that references exist for list_ref in [args.ref_5p, args.ref_3p]: diff --git a/code/python/lib/mg_bio/general.py b/code/python/lib/mg_bio/general.py index 1de4452..1e5ea7a 100644 --- a/code/python/lib/mg_bio/general.py +++ b/code/python/lib/mg_bio/general.py @@ -20,10 +20,7 @@ def compute_gc_from_sequences(sequences): total = sum(counts.values()) count_gc = counts["G"] + counts["C"] - if total == 0: - return 0.0 - - return 100 * count_gc / float(total) + return 0.0 if total == 0 else 100 * count_gc / float(total) def compute_single_gc_from_file(pf_sequences): diff --git a/code/python/lib/mg_container/genome_list.py b/code/python/lib/mg_container/genome_list.py index 0e67fab..b85651e 100644 --- a/code/python/lib/mg_container/genome_list.py +++ b/code/python/lib/mg_container/genome_list.py @@ -81,7 +81,7 @@ def init_from_file(cls, pf_table): df = pd.read_csv(pf_table, header=0) - list_genome_info = list() + list_genome_info = [] def parse_attributes(my_str): # type: (str) -> Dict[str, Any] diff --git a/code/python/lib/mg_container/gms2_mod.py b/code/python/lib/mg_container/gms2_mod.py index 16d62d5..c922049 100644 --- a/code/python/lib/mg_container/gms2_mod.py +++ b/code/python/lib/mg_container/gms2_mod.py @@ -33,17 +33,15 @@ def init_from_file(cls, 
pf_mod): if len(value) == 1: result[tag] = value[0] + elif tag.endswith("_MAT"): + result[tag] = convert_to_matrix(value) + elif tag.endswith("_POS_DISTR"): + result[tag] = convert_to_position_distribution(value) else: - if tag.endswith("_MAT"): - result[tag] = convert_to_matrix(value) - elif tag.endswith("_POS_DISTR"): - result[tag] = convert_to_position_distribution(value) - else: - log.warning(f"Unknown format for tag: {tag}") + log.warning(f"Unknown format for tag: {tag}") else: - pass position += 1 - # raise ValueError("Error in reading file") + # raise ValueError("Error in reading file") return cls(result) diff --git a/code/python/lib/mg_container/mgm_model.py b/code/python/lib/mg_container/mgm_model.py index 666aa6b..7e31134 100644 --- a/code/python/lib/mg_container/mgm_model.py +++ b/code/python/lib/mg_container/mgm_model.py @@ -91,18 +91,15 @@ def init_from_file(cls, pf_mod): curr_word = words[position] + position += 1 if curr_word.startswith("__"): species, _, gc = curr_word[2:].split("_") - position += 1 mgm_model_gc, position = MGMModel._read_mgm_model_gc(words, position, gc) if species not in result.keys(): result[species] = dict() result[species][gc] = mgm_model_gc - else: - position += 1 - return cls(result) @staticmethod @@ -116,7 +113,7 @@ def _read_value(words, position): """ num_words = len(words) - result = list() + result = [] while position < num_words: curr_word = words[position] @@ -143,13 +140,12 @@ def _read_mgm_model_gc(words, position, gc): if len(value) == 1: result[tag] = value[0] + elif tag.endswith("_MAT"): + result[tag] = convert_to_matrix(value) + elif tag.endswith("_POS_DISTR"): + result[tag] = convert_to_position_distribution(value) else: - if tag.endswith("_MAT"): - result[tag] = convert_to_matrix(value) - elif tag.endswith("_POS_DISTR"): - result[tag] = convert_to_position_distribution(value) - else: - log.warning(f"Unknown format for tag: {tag}") + log.warning(f"Unknown format for tag: {tag}") elif curr_word.startswith("__"): break else: diff --git a/code/python/lib/mg_container/msa.py b/code/python/lib/mg_container/msa.py index 28d16e7..d3851b3 100644 --- a/code/python/lib/mg_container/msa.py +++ b/code/python/lib/mg_container/msa.py @@ -35,10 +35,7 @@ def to_string(self, begin=None, end=None): # my_str = self.gap * self.mark_position + self.mark + self.gap * (self.msa_length - self.mark_position - 1) my_str = MSASinglePointMarker.create_mark_line(self.mark_position, self.msa_length, mark_tag=self.mark) - if begin != 0 or end != self.msa_length: - return my_str[begin:end] - else: - return my_str + return my_str[begin:end] if begin != 0 or end != self.msa_length else my_str def change_symbol(self, new_symbol, old_symbol=None): # type: (str, str) -> None @@ -53,14 +50,15 @@ def create_mark_line(mark_position, length, **kwargs): mark_tag = get_value(kwargs, "mark_tag", "M", invalid={None}) if len(mark_tag) != 1: - raise ValueError("Mark tag ({}) should have length of 1".format(mark_tag)) + raise ValueError(f"Mark tag ({mark_tag}) should have length of 1") - if mark_position is None: - mark_sequence = "-" * length - else: - mark_sequence = "-" * mark_position + mark_tag + "-" * (length - mark_position - 1) - - return mark_sequence + return ( + "-" * length + if mark_position is None + else "-" * mark_position + + mark_tag + + "-" * (length - mark_position - 1) + ) class MSAType: @@ -68,9 +66,9 @@ class MSAType: def __init__(self, alignments, **kwargs): # type: (MultipleSeqAlignment, Dict[str, Any]) -> None - self.list_msa_markers = get_value(kwargs, 
"list_msa_markers", list()) # type: List[MSASinglePointMarker] + self.list_msa_markers = get_value(kwargs, "list_msa_markers", []) - self.list_alignment_sequences = [s for s in alignments] # type: List[SeqRecord] + self.list_alignment_sequences = list(alignments) def get_mark_position(self, name): # type: (str) -> Union[int, None] @@ -78,7 +76,7 @@ def get_mark_position(self, name): mark = list_find_first(self.list_msa_markers, lambda x: x.name == name) # type: MSASinglePointMarker if mark is None: - raise ValueError("Unknown mark name ({})".format(name)) + raise ValueError(f"Unknown mark name ({name})") return mark.mark_position @@ -136,8 +134,8 @@ def find_first_non_gap(my_str): return None - list_alignments_without_marks = list() - marks = list() + list_alignments_without_marks = [] + marks = [] msa_length = alignment.get_alignment_length() for a in alignment: @@ -179,8 +177,8 @@ def read_as_standard_clustal(pf_msa): alignments_from_file = read_as_standard_clustal(pf_msa) - alignments_processed_list = list() - list_markers_info = list() + alignments_processed_list = [] + list_markers_info = [] for a in alignments_from_file: if a.id[0] != "#": @@ -190,7 +188,7 @@ def read_as_standard_clustal(pf_msa): non_gap_positions = [x for x in range(len(a.seq._data)) if a.seq._data[x] != "-"] position = None mark_tag = "M" - if len(non_gap_positions) > 0: + if non_gap_positions: position = non_gap_positions[0] mark_tag = a.seq._data[position] @@ -225,11 +223,7 @@ def pad_by(curr_string, pad_len, padding=" "): for line in headers_split: val = int(float(line[1])) - if val == -1: - line[1] = "-" - else: - line[1] = "{}".format(val) - + line[1] = "-" if val == -1 else "{}".format(val) # remove dN column headers_split = [ line_split[:min(len(line_split), 5)] for line_split in headers_split @@ -247,7 +241,7 @@ def pad_by(curr_string, pad_len, padding=" "): max(len(l[col]) for l in headers_split if col < len(l)) for col in range(max_num_columns) ] - headers_pretty = list() + headers_pretty = [] for line_split in headers_split: headers_pretty.append( @@ -271,7 +265,7 @@ def change_marker(self, marker_name, new_symbol, old_symbol=None): marker = self.get_marker(marker_name) marker.change_symbol(new_symbol) except ValueError: - logger.warning("Cannot change marker for unknown name: {}".format(marker_name)) + logger.warning(f"Cannot change marker for unknown name: {marker_name}") @@ -319,7 +313,7 @@ def get_summary_statistics_line_for_alignment(): ref_position = self.get_mark_position("ref") - is_lorf = len(set(self[0][0:ref_position])) <= 1 + is_lorf = len(set(self[0][:ref_position])) <= 1 def count_lorf_targets_near_position(position): # type: (int) -> int @@ -330,11 +324,11 @@ def count_lorf_targets_near_position(position): j = 0 while True: if position-j >= 0 and self[idx][position-j].isupper(): - if len(set(self[idx][0:position-j])) <= 1: + if len(set(self[idx][: position - j])) <= 1: count += 1 break if position+j < self.alignment_length() and self[idx][position+j].isupper(): - if len(set(self[idx][0:position+j])) <= 1: + if len(set(self[idx][: position + j])) <= 1: count += 1 break @@ -364,7 +358,8 @@ def to_string(self, begin=None, end=None, **kwargs): # add markers as sequence records seq_records = [ - SeqRecord(Seq(m.to_string(begin, end)), id="#{}".format(m.name)) for m in self.list_msa_markers + SeqRecord(Seq(m.to_string(begin, end)), id=f"#{m.name}") + for m in self.list_msa_markers ] if begin is not None or end is not None: diff --git a/code/python/lib/mg_container/shelf.py 
b/code/python/lib/mg_container/shelf.py index 941b050..62cdb15 100644 --- a/code/python/lib/mg_container/shelf.py +++ b/code/python/lib/mg_container/shelf.py @@ -15,16 +15,11 @@ def read_value_for_tag(words, position, **kwargs): stop_if_starts_with = get_value(kwargs, "stop_if_starts_with", {"$"}, valid_type=Set[str]) num_words = len(words) - result = list() + result = [] while position < num_words: curr_word = words[position] - should_stop = False - for s in stop_if_starts_with: - if curr_word.startswith(s): - should_stop = True - break - + should_stop = any(curr_word.startswith(s) for s in stop_if_starts_with) if should_stop: break @@ -70,11 +65,10 @@ def convert_to_matrix(words): break try: - float_word = float(curr_word) - - # number if key is None: raise ValueError(f"Reading value {curr_word} without key") + float_word = float(curr_word) result[key].append(float_word) except ValueError: @@ -83,7 +77,7 @@ if key in result: raise ValueError(f"Reading same key multiple times {key}") - result[key] = list() + result[key] = [] return result @@ -103,13 +97,6 @@ def gms2_model_matrix_to_string(value): def gms2_model_position_distribution_to_string(value): - # type: (List[float]) -> str - - out = "" - - for i, v in enumerate(value): - out += f"{i} {v}\n" - - return out + return "".join(f"{i} {v}\n" for i, v in enumerate(value)) diff --git a/code/python/lib/mg_container/taxonomy_tree.py b/code/python/lib/mg_container/taxonomy_tree.py index 1afe4a4..1eb9c6c 100644 --- a/code/python/lib/mg_container/taxonomy_tree.py +++ b/code/python/lib/mg_container/taxonomy_tree.py @@ -127,7 +127,7 @@ def init_from_file(cls, pf_input, pf_names): logger.info("Get names of nodes") dict_taxid_names = TaxonomyTree._get_names_per_taxid(df_names) - root_nodes = list() + root_nodes = [] # add each node to its parent's children for tax_id, node in tqdm(dict_tax_id_node.items(), "Building tree"): @@ -148,7 +148,7 @@ def init_from_file(cls, pf_input, pf_names): if len(root_nodes) > 1: raise ValueError("More than one root node available") - if len(root_nodes) == 0: + if not root_nodes: raise ValueError("No root node detected") return TaxonomyTree(root_nodes[0]) @@ -202,14 +202,10 @@ def to_string_current_level(node, depth, **kwargs): attribute_name = get_value(kwargs, "attribute_name", None) attribute_format = get_value(kwargs, "attribute_format", "{}", default_if_none=True) - output = "" - single_level = " |" depth_level = single_level * depth - if depth > 0: - output = depth_level + "__ " - + output = depth_level + "__ " if depth > 0 else "" # get tag tag_value = node.tax_id if tag_name is not None: @@ -218,7 +214,7 @@ def to_string_current_level(node, depth, **kwargs): output += str(tag_value) if attribute_name is not None: - output += "\t({})".format(attribute_format).format(node.attributes[attribute_name]) + output += f"\t({attribute_format})".format(node.attributes[attribute_name]) return output @@ -292,11 +288,9 @@ def get_node_with_tag(self, ancestor_tag, tag_type): current_node = self.root - lifo = list() - - lifo.append(current_node) + lifo = [current_node] - while len(lifo) > 0: + while lifo: p = lifo.pop() # if is node we're searching for @@ -304,20 +298,16 @@ def get_node_with_tag(self, ancestor_tag, tag_type): return p # otherwise add all children - for child in p.children(): - lifo.append(child) - + lifo.extend(iter(p.children())) return None @staticmethod def get_leaves_under_node(node): # type: (Node) -> Generator[Node] - lifo = list() - - lifo.append(node) + lifo = [node] -
while len(lifo) > 0: + while lifo: p = lifo.pop() # if is leaf @@ -325,8 +315,7 @@ def get_leaves_under_node(node): yield p # otherwise add all children - for child in p.children(): - lifo.append(child) + lifo.extend(iter(p.children())) def get_genomes_under_ancestor(self, ancestor_tag, tag_type): # type: (Union[str, int], str) -> Generator[Dict[str, Any]] @@ -334,7 +323,7 @@ def get_genomes_under_ancestor(self, ancestor_tag, tag_type): ancestor_node = self.get_node_with_tag(ancestor_tag, tag_type) if ancestor_tag is None: # empty generator - return (_ for _ in ()) + return iter(()) for curr_node in TaxonomyTree.get_leaves_under_node(ancestor_node): yield curr_node.attributes @@ -343,18 +332,15 @@ def get_genomes_under_ancestor(self, ancestor_tag, tag_type): def get_nodes_under_ancestor(node): # type: (Node) -> Generator[Node] - lifo = list() - - lifo.append(node) + lifo = [node] - while len(lifo) > 0: + while lifo: p = lifo.pop() yield p # otherwise add all children - for child in p.children(): - lifo.append(child) + lifo.extend(iter(p.children())) def get_possible_genomes_under_ancestor(self, ancestor_tag, tag_type): # type: (Union[str, int], str) -> Generator[Dict[str, Any]] @@ -362,7 +348,7 @@ def get_possible_genomes_under_ancestor(self, ancestor_tag, tag_type): ancestor_node = self.get_node_with_tag(ancestor_tag, tag_type) if ancestor_node is None: # empty generator - return (_ for _ in ()) + return iter(()) for curr_node in TaxonomyTree.get_nodes_under_ancestor(ancestor_node): if curr_node is not None: diff --git a/code/python/lib/mg_general/general.py b/code/python/lib/mg_general/general.py index 094581a..41caf4c 100644 --- a/code/python/lib/mg_general/general.py +++ b/code/python/lib/mg_general/general.py @@ -113,22 +113,14 @@ def next_name(pd_work, **kwargs): next_name.counters[ext] = -1 next_name.counters[ext] += 1 - return os_join(pd_work, "{}.{}".format(next_name.counters[ext], ext)) + return os_join(pd_work, f"{next_name.counters[ext]}.{ext}") def create_gene_key(genome=None, accession=None, left=None, right=None, strand=None, delimiter=";"): # type: (object, object, object, object, object, str) -> str - return "{}{}{}{}{}{}{}{}{}".format( - genome, delimiter, - accession, delimiter, - left, delimiter, - right, delimiter, - strand - ) + return f"{genome}{delimiter}{accession}{delimiter}{left}{delimiter}{right}{delimiter}{strand}" def fix_names(r): # type: (pd.Series) -> str - return "{}. {}".format( - r["Genome"][0], r["Genome"].split("_")[1] - ) + return f'{r["Genome"][0]}. 
{r["Genome"].split("_")[1]}' diff --git a/code/python/lib/mg_general/genome_splitter.py b/code/python/lib/mg_general/genome_splitter.py index ec022ee..a9174c2 100644 --- a/code/python/lib/mg_general/genome_splitter.py +++ b/code/python/lib/mg_general/genome_splitter.py @@ -56,7 +56,7 @@ def split_fasta_into_chunks(sequences, chunk_size_nt, **kwargs): if not allow_split_in_cds and labels: interval_labels = GenomeSplitter.split_labels_into_intervals(labels) - list_chunk_info = list() + list_chunk_info = [] counter = 0 for seqname, seqrecord in sequences.items(): @@ -68,7 +68,7 @@ def split_fasta_into_chunks(sequences, chunk_size_nt, **kwargs): right_excluded = min(offset + chunk_size_nt, len(seqrecord)) while interval_labels and left > 0 and interval_labels.overlaps_point(right_excluded-1) \ - and right_excluded < len(seqrecord): + and right_excluded < len(seqrecord): lab = interval_labels[right_excluded-1].pop().data right_excluded = lab.right() + 1 # skip diff --git a/code/python/lib/mg_general/labels.py b/code/python/lib/mg_general/labels.py index 3151c27..c4ab088 100644 --- a/code/python/lib/mg_general/labels.py +++ b/code/python/lib/mg_general/labels.py @@ -16,9 +16,7 @@ def to_string(self, field=None, shift_coordinates_by=0, delim="\t"): s = int(shift_coordinates_by) def stringify(element): - if element is None: - return "" - return str(element) + return "" if element is None else str(element) if field is None: return stringify(int(self.left) + s) + delim + stringify(int(self.right) + s) + delim + stringify( @@ -33,7 +31,7 @@ def stringify(element): if field == "strand": return stringify(self.strand) - raise ValueError("Unrecognized field: " + stringify(field)) + raise ValueError(f"Unrecognized field: {stringify(field)}") def get_5prime(self): return self.left if self.strand == "+" else self.right @@ -43,18 +41,9 @@ def get_3prime(self): @classmethod def from_fields(cls, fields): - # type: (dict) -> Coordinates - left = None - right = None - strand = None - - if "left" in fields: - left = fields["left"] - if "right" in fields: - right = fields["right"] - if "strand" in fields: - strand = fields["strand"] - + left = fields["left"] if "left" in fields else None + right = fields["right"] if "right" in fields else None + strand = fields["strand"] if "strand" in fields else None return cls(left, right, strand) @@ -165,10 +154,9 @@ def incomplete_at_3prime(self): def is_hypothetical(self): # type: () -> bool - if self.get_attribute_value("product") is not None and "hypothetical" in self.get_attribute_value("product"): - return True - - return False + return self.get_attribute_value( + "product" + ) is not None and "hypothetical" in self.get_attribute_value("product") def is_frameshifted(self): # type: () -> bool @@ -177,10 +165,7 @@ def is_frameshifted(self): return False length = self.coordinates().right - self.coordinates().left + 1 - if length % 3 != 0: - return True - - return False + return length % 3 != 0 @classmethod def minimum_set_of_field_names(cls): @@ -214,7 +199,7 @@ def default_for_key(cls, key): if key == "coordinates": return Coordinates() - raise ValueError("Unknown key: " + str(key)) + raise ValueError(f"Unknown key: {str(key)}") def get_3prime(self): if self._fields["coordinates"] is not None: @@ -250,7 +235,7 @@ def __init__(self, labels=None, name=None): # type: (List[Label], str) -> None if labels is None: - labels = list() + labels = [] self._labels = copy.copy(labels) # type: List[Label] self._labels_by_3p = {create_key_3prime_from_label(lab): lab for lab in labels} 
@@ -328,16 +313,17 @@ def update(self, new_labels): def to_string(self, shift_coordinates_by=0): - out = "" - for n in range(self._iter_max): - out += self._labels[n].to_string(shift_coordinates_by) + "\n" - - return out + return "".join( + self._labels[n].to_string(shift_coordinates_by) + "\n" + for n in range(self._iter_max) + ) def to_string_lst(self, shift_coordinates_by=0): - out = "# GeneMark.hmm-2 LST format\n" - out += "# GeneMark.hmm-2 prokaryotic version: 1.14\n" + out = ( + "# GeneMark.hmm-2 LST format\n" + + "# GeneMark.hmm-2 prokaryotic version: 1.14\n" + ) out += "# File with sequence: tmpseq.fna\n" out += "# File with native parameters: itr_1.mod\n" out += "# Native species name and build: gms2-training\n" @@ -348,7 +334,7 @@ def to_string_lst(self, shift_coordinates_by=0): seqname_to_labels = dict() for l in self._labels: # type: Label if l.seqname() not in seqname_to_labels: - seqname_to_labels[l.seqname()] = list() + seqname_to_labels[l.seqname()] = [] seqname_to_labels[l.seqname()].append(l) @@ -358,10 +344,10 @@ def to_string_lst(self, shift_coordinates_by=0): for counter, l in enumerate(seqname_labels): out += str(counter) - out += " " + str(l.strand()) - out += " " + str(l.left() + shift_coordinates_by) - out += " " + str(l.right() + shift_coordinates_by) - out += " " + str(l.right() - l.left() + 1) + out += f" {str(l.strand())}" + out += f" {str(l.left() + shift_coordinates_by)}" + out += f" {str(l.right() + shift_coordinates_by)}" + out += f" {str(l.right() - l.left() + 1)}" out += " " "nativebac" + " AGGAGG 6 1" out += " " + " ." @@ -452,21 +438,17 @@ def _split_by_overlap_helper(labels, labels_reference, strand): if l.strand() == "+": if index == 0: labels_with_no_overlap.add(l) + elif l.coordinates().left - labels_combined[index - 1].coordinates().right <= 0: + labels_with_overlap.add(l) else: - if l.coordinates().left - labels_combined[index - 1].coordinates().right <= 0: - labels_with_overlap.add(l) - else: - labels_with_no_overlap.add(l) + labels_with_no_overlap.add(l) - # negative strand + elif index == len(labels_combined) - 1: + labels_with_no_overlap.add(l) + elif labels_combined[index + 1].coordinates().left - l.coordinates().right <= 0: + labels_with_overlap.add(l) else: - if index == len(labels_combined) - 1: - labels_with_no_overlap.add(l) - else: - if labels_combined[index + 1].coordinates().left - l.coordinates().right <= 0: - labels_with_overlap.add(l) - else: - labels_with_no_overlap.add(l) + labels_with_no_overlap.add(l) return labels_with_overlap, labels_with_no_overlap @@ -522,9 +504,8 @@ def _compute_distance_to_upstream_genes_on_positive_strand(labels): if largest_right is None: largest_right = labels[index_of_previous].coordinates().right - else: - if largest_right < labels[index_of_previous].coordinates().right: - largest_right = labels[index_of_previous].coordinates().right + elif largest_right < labels[index_of_previous].coordinates().right: + largest_right = labels[index_of_previous].coordinates().right distance = lab.coordinates().left - largest_right if largest_right is not None else None @@ -558,9 +539,8 @@ def _compute_distance_to_upstream_genes_on_negative_strand(labels): if smallest_left is None: smallest_left = labels[index_of_previous].coordinates().left - else: - if smallest_left > labels[index_of_previous].coordinates().left: - smallest_left = labels[index_of_previous].coordinates().left + elif smallest_left > labels[index_of_previous].coordinates().left: + smallest_left = labels[index_of_previous].coordinates().left distance = 
smallest_left - lab.coordinates().right if smallest_left is not None else None @@ -591,11 +571,9 @@ def create_gene_key_from_label(label, genome_name=None): def create_key_3prime_from_label(label, genome_name=None): # type: (Label, Union[str, None]) -> str if label.strand() == "+": - return "{};{};{};{};{}".format(genome_name, label.seqname(), "", - label.coordinates().right, label.strand()) + return f"{genome_name};{label.seqname()};;{label.coordinates().right};{label.strand()}" else: - return "{};{};{};{};{}".format(genome_name, label.seqname(), label.coordinates().left, - "", label.strand()) + return f"{genome_name};{label.seqname()};{label.coordinates().left};;{label.strand()}" def shift_5prime(label, amount): @@ -610,7 +588,7 @@ def shift_5prime(label, amount): if abs(amount) % 3 != 0: import logging - logging.debug("Shifting 5prime by value ({}) not a multiple of 3".format(amount)) + logging.debug(f"Shifting 5prime by value ({amount}) not a multiple of 3") if label.strand() == "+": label.coordinates().left += amount @@ -623,6 +601,6 @@ def get_unique_gene_keys(*args): keys = set() for labels in args: - keys = keys.union(set(create_key_3prime_from_label(lab) for lab in labels)) + keys = keys.union({create_key_3prime_from_label(lab) for lab in labels}) return keys \ No newline at end of file diff --git a/code/python/lib/mg_general/labels_comparison_detailed.py b/code/python/lib/mg_general/labels_comparison_detailed.py index f48cf4e..d211c26 100644 --- a/code/python/lib/mg_general/labels_comparison_detailed.py +++ b/code/python/lib/mg_general/labels_comparison_detailed.py @@ -121,33 +121,26 @@ def _compare_labels_helper(labels_a, labels_b, **kwargs): "b": Labels([x[1] for x in compare_3p["match"].values()]) } comparison["labels"]["match-3p-not-5p"] = { - "a": Labels([x for x in compare_3p_5p["unique-a"].values()]), - "b": Labels([x for x in compare_3p_5p["unique-b"].values()]) + "a": Labels(list(compare_3p_5p["unique-a"].values())), + "b": Labels(list(compare_3p_5p["unique-b"].values())), } return comparison @staticmethod def _split_by_match_3prime(key_3prime_to_label_a, key_3prime_to_label_b): - # type: (Dict[str, Label], Dict[str, Label]) -> Dict[str, Dict[str, Any]] - result = { - "match": dict(), - "unique-a": dict(), - "unique-b": dict() - } - keys_match = set(key_3prime_to_label_a.keys()).intersection(set(key_3prime_to_label_b.keys())) keys_unique_a = set(key_3prime_to_label_a.keys()).difference(set(key_3prime_to_label_b.keys())) keys_unique_b = set(key_3prime_to_label_b.keys()).difference(set(key_3prime_to_label_a.keys())) - result["match"] = { - key: (key_3prime_to_label_a[key], key_3prime_to_label_b[key]) for key in keys_match + return { + "match": { + key: (key_3prime_to_label_a[key], key_3prime_to_label_b[key]) + for key in keys_match + }, + "unique-a": {key: key_3prime_to_label_a[key] for key in keys_unique_a}, + "unique-b": {key: key_3prime_to_label_b[key] for key in keys_unique_b}, } - result["unique-a"] = {key: key_3prime_to_label_a[key] for key in keys_unique_a} - result["unique-b"] = {key: key_3prime_to_label_b[key] for key in keys_unique_b} - - return result - @staticmethod def _split_by_match_5prime(key_to_pair_3p): # type: (Dict[str, Tuple(Label, Label)]) -> Dict[str, Dict[str, Any]] @@ -186,10 +179,10 @@ def _parse_compp_output(self, output): b = letter[1] # bad good_and_bad += [ - ("in-{}".format(g), "in_{}".format(b)), - ("long-in-{}".format(g), "long_in_{}".format(b)), - ("short-in-{}".format(g), "short_in_{}".format(b)), - ("unique-in-{}".format(g), 
"unique_in_{}".format(b)), + (f"in-{g}", f"in_{b}"), + (f"long-in-{g}", f"long_in_{b}"), + (f"short-in-{g}", f"short_in_{b}"), + (f"unique-in-{g}", f"unique_in_{b}"), ] self.stats = {goodname: d[badname] for goodname, badname in good_and_bad} diff --git a/code/python/lib/mg_general/shelf.py b/code/python/lib/mg_general/shelf.py index 8ccc73d..a245166 100644 --- a/code/python/lib/mg_general/shelf.py +++ b/code/python/lib/mg_general/shelf.py @@ -13,18 +13,14 @@ def test_log_level(): - log.debug(f"Test") - log.info(f"Test") - log.warning(f"Test") - log.critical(f"Test") + log.debug("Test") + log.info("Test") + log.warning("Test") + log.critical("Test") def list_find_first(a_list, a_filter): - # type: (List[Any], Callable) -> Any - for x in a_list: - if a_filter(x): - return x - return None + return next((x for x in a_list if a_filter(x)), None) def compute_gc(sequences, label=None): @@ -37,27 +33,26 @@ def compute_gc(sequences, label=None): for seqname, seqrecord in sequences.items(): for i in range(len(seqrecord)): l = seqrecord[i].upper() - if l == "G" or l == "C": + if l in ["G", "C"]: gc += 1 - elif l == "A" or l == "T": + elif l in ["A", "T"]: at += 1 total = gc + at if total != 0: gc_percent = 100.0 * gc / float(total) - else: - if label.seqname() in sequences.keys(): - seqrecord = sequences[label.seqname()] - gc = at = 0 - for i in range(label.left(), label.right()): - l = seqrecord[i].upper() - if l == "G" or l == "C": - gc += 1 - elif l == "A" or l == "T": - at += 1 - total = gc + at - if total != 0: - gc_percent = 100.0 * gc / float(total) + elif label.seqname() in sequences.keys(): + seqrecord = sequences[label.seqname()] + gc = at = 0 + for i in range(label.left(), label.right()): + l = seqrecord[i].upper() + if l in ["G", "C"]: + gc += 1 + elif l in ["A", "T"]: + at += 1 + total = gc + at + if total != 0: + gc_percent = 100.0 * gc / float(total) return gc_percent @@ -65,4 +60,8 @@ def compute_gc(sequences, label=None): def powerset(iterable, min_len=0): """powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)""" s = list(iterable) - return [x for x in chain.from_iterable(combinations(s, r) for r in range(min_len, len(s) + 1))] \ No newline at end of file + return list( + chain.from_iterable( + combinations(s, r) for r in range(min_len, len(s) + 1) + ) + ) \ No newline at end of file diff --git a/code/python/lib/mg_io/general.py b/code/python/lib/mg_io/general.py index 4671450..be8f685 100644 --- a/code/python/lib/mg_io/general.py +++ b/code/python/lib/mg_io/general.py @@ -15,9 +15,7 @@ def mkdir_p(path): try: os.makedirs(path) except OSError as exc: - if exc.errno == errno.EEXIST and os.path.isdir(path): - pass - else: + if exc.errno != errno.EEXIST or not os.path.isdir(path): raise @@ -49,5 +47,4 @@ def load_obj(name): name += ".pkl" with open(name, 'rb') as f: - loaded_data = dill.load(f) - return loaded_data + return dill.load(f) diff --git a/code/python/lib/mg_io/labels.py b/code/python/lib/mg_io/labels.py index 7aebba4..8a3cf3f 100644 --- a/code/python/lib/mg_io/labels.py +++ b/code/python/lib/mg_io/labels.py @@ -4,7 +4,7 @@ from typing import * from mg_general.general import get_value -sys.path.append(os.path.dirname(__file__) + "/..") # add custom library directory to path +sys.path.append(f"{os.path.dirname(__file__)}/..") from mg_general.labels import Label, Coordinates, Labels @@ -52,9 +52,7 @@ def read_labels_from_file(filename, shift=-1, name=None, **kwargs): line = line.strip() - m = pattern.match(line) - if m: - + if m := pattern.match(line): attributes 
= create_attribute_dict(m.group(9), key_value_delimiter=key_value_delimiter, delimiter=attribute_delimiter) attributes["score"] = m.group(6) @@ -84,7 +82,7 @@ def read_labels_from_file(filename, shift=-1, name=None, **kwargs): def read_lst(pf_labels, shift=-1): # type: (str, int) -> Labels - labels = list() + labels = [] # pattern = re.compile(r"([^\t]+)\t([^\t]+)\t(CDS)\t(\d+)\t(\d+)\t([^\t]+)\t([+-])\t([^\t]+)\t([^\t]+)") pattern = re.compile(r"([^\s]+)\s+([+-])\s+(\d+)\s+(\d+)\s+(\d+)\s+(.+)$") @@ -111,8 +109,7 @@ def read_lst(pf_labels, shift=-1): elif len(line.strip()) == 0 or seqname is None: continue - m = pattern.match(line) - if m: + if m := pattern.match(line): attributes = m.group(6) label = { @@ -141,7 +138,7 @@ def read_lst(pf_labels, shift=-1): def read_fgs_format(pf_labels, shift=-1): # type: (str, int) -> Labels - labels = list() + labels = [] # pattern = re.compile(r"([^\t]+)\t([^\t]+)\t(CDS)\t(\d+)\t(\d+)\t([^\t]+)\t([+-])\t([^\t]+)\t([^\t]+)") pattern = re.compile(r"\s*(\d+)\s+(\d+)\s+([+-])\s+.+$") @@ -168,8 +165,7 @@ def read_fgs_format(pf_labels, shift=-1): elif len(line.strip()) == 0 or seqname is None: continue - m = pattern.match(line) - if m: + if m := pattern.match(line): attributes = dict() label = { diff --git a/code/python/lib/mg_io/shelf.py b/code/python/lib/mg_io/shelf.py index b9dbeee..bb8b001 100644 --- a/code/python/lib/mg_io/shelf.py +++ b/code/python/lib/mg_io/shelf.py @@ -17,12 +17,7 @@ def write_sequence_list_to_fasta_file(sequences, pf_sequences): - # type: (List[Seq], str) -> None - - data = "" - for i in range(len(sequences)): - data += ">{}\n{}\n".format(i, sequences[i]) - + data = "".join(f">{i}\n{sequences[i]}\n" for i in range(len(sequences))) write_to_file(data, pf_sequences) diff --git a/code/python/lib/mg_models/building.py b/code/python/lib/mg_models/building.py index 8001bc1..69b6bec 100644 --- a/code/python/lib/mg_models/building.py +++ b/code/python/lib/mg_models/building.py @@ -24,9 +24,7 @@ def mat_to_dict(mat): # type: (np.ndarray) -> Dict[str, List[float]] - index_to_letter = { - i: x for i, x in enumerate(list("ACGT")) - } + index_to_letter = dict(enumerate(list("ACGT"))) result = dict() @@ -40,14 +38,12 @@ def mat_to_dict(mat): def get_average_zero_order_noncoding(df): # type: (pd.DataFrame) -> np.ndarray - list_arr = list() + list_arr = [] for idx in df.index: mod = GMS2Noncoding(df.at[idx, "Mod"].items["NON_MAT"]) list_arr.append(mod.pwm_to_array(0)) - avg = np.mean(list_arr, 0) - - return avg + return np.mean(list_arr, 0) def build_mgm_motif_model_for_gc(env, df, col, **kwargs): @@ -107,23 +103,21 @@ def build_mgm_motif_model_for_gc(env, df, col, **kwargs): try: for i in l.keys(): if i not in values.keys(): - values[i] = list() + values[i] = [] values[i].append(l[i]) except Exception: continue - for i in values.keys(): + for i in values: values[i] = np.mean(values[i]) total = sum(values.values()) - for i in values.keys(): + for i in values: values[i] /= total x = sorted(values.keys()) y = [values[a] for a in x] - position_distributions_by_shift[s] = { - a: b for a, b in zip(x, y) - } + position_distributions_by_shift[s] = dict(zip(x, y)) # compile into single model @@ -168,16 +162,14 @@ def build_mgm_motif_model_for_gc_v2(env, df, col, **kwargs): # Separate motifs per shift unique_shifts = sorted(set(update_shifts)) - array_per_shift = { - s: list() for s in unique_shifts - } + array_per_shift = {s: [] for s in unique_shifts} for i in range(len(update_shifts)): shift = update_shifts[i] array_per_shift[shift].append(array[i, 
shift:shift + original_width, :]) raw_array_per_shift = { - x: np.array(array_per_shift[x]) for x in array_per_shift.keys() + x: np.array(array_per_shift[x]) for x in array_per_shift } for s in unique_shifts: @@ -210,23 +202,21 @@ def build_mgm_motif_model_for_gc_v2(env, df, col, **kwargs): try: for i in l.keys(): if i not in values.keys(): - values[i] = list() + values[i] = [] values[i].append(l[i]) except Exception: continue - for i in values.keys(): + for i in values: values[i] = np.mean(values[i]) total = sum(values.values()) - for i in values.keys(): + for i in values: values[i] /= total x = sorted(values.keys()) y = [values[a] for a in x] - position_distributions_by_shift[s] = { - a: b for a, b in zip(x, y) - } + position_distributions_by_shift[s] = dict(zip(x, y)) # compile into single model diff --git a/code/python/lib/mg_models/mgm_motif_model.py b/code/python/lib/mg_models/mgm_motif_model.py index 4576068..9eda665 100644 --- a/code/python/lib/mg_models/mgm_motif_model.py +++ b/code/python/lib/mg_models/mgm_motif_model.py @@ -39,7 +39,7 @@ def score(self, fragment, **kwargs): if begin is None: begin = 0 - score_per_shift = list() + score_per_shift = [] for s in self._shift_prior: s = int(s) # shift prior @@ -55,11 +55,10 @@ def score(self, fragment, **kwargs): score += 0.25 else: score *= 0.25 + elif use_log: + score += math.log(self._motif[fragment[begin + i]][s + i]) else: - if use_log: - score += math.log(self._motif[fragment[begin + i]][s + i]) - else: - score *= self._motif[fragment[begin + i]][s + i] + score *= self._motif[fragment[begin + i]][s + i] # spacer if component != "motif": @@ -99,7 +98,7 @@ def _init_spacer(spacer=None): if isinstance(spacer[shift], dict): if len(spacer[shift].keys()) == 0: continue - max_position = max([int(x) for x in spacer[shift].keys()]) + max_position = max(int(x) for x in spacer[shift].keys()) result = [0] * (max_position + 1) for i in spacer[shift].keys(): result[int(i)] = spacer[shift][i] @@ -119,10 +118,8 @@ def pwm_to_df(self): keys = sorted(self._motif.keys()) - list_entries = list() - for p in range(len(next(iter(self._motif.values())))): - list_entries.append( - [self._motif[k][p] for k in keys] - ) - + list_entries = [ + [self._motif[k][p] for k in keys] + for p in range(len(next(iter(self._motif.values())))) + ] return pd.DataFrame(list_entries, columns=keys) diff --git a/code/python/lib/mg_models/mgm_motif_model_v2.py b/code/python/lib/mg_models/mgm_motif_model_v2.py index 0103097..65f5264 100644 --- a/code/python/lib/mg_models/mgm_motif_model_v2.py +++ b/code/python/lib/mg_models/mgm_motif_model_v2.py @@ -32,7 +32,7 @@ def _make_shifts_consistent(self): all_shifts = set(self._shift_prior.keys()).union(self._motif.keys()).union(self._spacer.keys()) - spacer_len = max([len(self._spacer[a]) for a in self._spacer.keys()]) + spacer_len = max(len(self._spacer[a]) for a in self._spacer.keys()) for s in all_shifts: if s not in self._shift_prior: @@ -61,7 +61,7 @@ def score(self, fragment, **kwargs): if begin is None: begin = 0 - score_per_shift = list() + score_per_shift = [] for s in self._shift_prior: s = int(s) # shift prior @@ -77,11 +77,10 @@ def score(self, fragment, **kwargs): score += 0.25 else: score *= 0.25 + elif use_log: + score += math.log(self._motif[s][fragment[begin + i]][i]) else: - if use_log: - score += math.log(self._motif[s][fragment[begin + i]][i]) - else: - score *= self._motif[s][fragment[begin + i]][i] + score *= self._motif[s][fragment[begin + i]][i] # spacer if component != "motif": @@ -121,7 +120,7 @@ def 
_init_spacer(spacer=None):
             if isinstance(spacer[shift], dict):
                 if len(spacer[shift].keys()) == 0:
                     continue
-                max_position = max([int(x) for x in spacer[shift].keys()])
+                max_position = max(int(x) for x in spacer[shift].keys())
                 result = [0] * (max_position + 1)
                 for i in spacer[shift].keys():
                     result[int(i)] = spacer[shift][i]
@@ -142,12 +141,10 @@ def pwm_to_df(self, shift):
 
         keys = sorted(self._motif[shift].keys())
 
-        list_entries = list()
-        for p in range(len(next(iter(self._motif[shift].values())))):
-            list_entries.append(
-                [self._motif[shift][k][p] for k in keys]
-            )
-
+        list_entries = [
+            [self._motif[shift][k][p] for k in keys]
+            for p in range(len(next(iter(self._motif[shift].values()))))
+        ]
         return pd.DataFrame(list_entries, columns=keys)
diff --git a/code/python/lib/mg_models/motif_model.py b/code/python/lib/mg_models/motif_model.py
index de79abb..e3cf7ee 100644
--- a/code/python/lib/mg_models/motif_model.py
+++ b/code/python/lib/mg_models/motif_model.py
@@ -14,7 +14,7 @@ def __init__(self, motif, spacer=None):
         # type: (Dict[str, List[float]], Union[None, Dict[int, float]]) -> None
 
         self._motif = motif  # type: Dict[str, List[float]]
-        self._motif_width = max([len(motif[x]) for x in self._motif.keys()])
+        self._motif_width = max(len(motif[x]) for x in self._motif.keys())
 
         self._spacer = MotifModel._init_spacer(spacer)  # type: Union[None, List[float]]
 
@@ -41,11 +41,10 @@ def score(self, fragment, **kwargs):
                     score += 0.25
                 else:
                     score *= 0.25
+            elif use_log:
+                score += self._motif[fragment[begin + i]][i]
             else:
-                if use_log:
-                    score += self._motif[fragment[begin + i]][i]
-                else:
-                    score *= self._motif[fragment[begin + i]][i]
+                score *= self._motif[fragment[begin + i]][i]
 
         if component != "motif":
             if self._spacer is not None:
@@ -61,9 +60,11 @@ def find_best_position_and_score(self, fragment, **kwargs):
         # type: (str, Dict[str, Any]) -> Tuple[int, float]
 
         return max(
-            [(pos, self.score(fragment, begin=pos, **kwargs))
-             for pos in range(len(fragment) - self._motif_width)],
-            key=lambda x: x[1]
+            (
+                (pos, self.score(fragment, begin=pos, **kwargs))
+                for pos in range(len(fragment) - self._motif_width)
+            ),
+            key=lambda x: x[1],
         )
 
     def motif_width(self):
@@ -78,7 +79,7 @@ def _init_spacer(spacer=None):
             return None
 
         if isinstance(spacer, dict):
-            max_position = max([int(x) for x in spacer.keys()])
+            max_position = max(int(x) for x in spacer.keys())
 
             result = [0] * (max_position + 1)
             for i in spacer.keys():
                 result[int(i)] = spacer[i]
@@ -95,17 +96,13 @@ def pwm_to_df(self):
 
         keys = sorted(self._motif.keys())
 
-        list_entries = list()
-        for p in range(self.motif_width()):
-            list_entries.append(
-                [self._motif[k][p] for k in keys]
-            )
-
+        list_entries = [
+            [self._motif[k][p] for k in keys] for p in range(self.motif_width())
+        ]
         return pd.DataFrame(list_entries, columns=keys)
 
     def to_string(self):
-        # type: () -> str
-        out = ""
-        for letter in sorted(self._motif.keys()):
-            out += letter + " ".join([str(x) for x in self._motif[letter]]) + "\n"
-        return out
+        return "".join(
+            letter + " " + " ".join([str(x) for x in self._motif[letter]]) + "\n"
+            for letter in sorted(self._motif.keys())
+        )
diff --git a/code/python/lib/mg_models/shelf.py b/code/python/lib/mg_models/shelf.py
index 0989349..6c40a20 100644
--- a/code/python/lib/mg_models/shelf.py
+++ b/code/python/lib/mg_models/shelf.py
@@ -34,7 +34,7 @@ def bin_by_gc(df, step=1, **kwargs):
     gc_feature = get_value(kwargs, "gc_feature", "GC", valid_type=str)
     gc_ranges = range(30, 71, step)
 
-    result = list()
+    result = []
     a = 0
     for b in gc_ranges:
         right = b if b != gc_ranges[-1] else 100
@@ -56,13 +56,9 @@ def get_consensus_sequence(dict_mat):
         best_val = None
 
         for letter in dict_mat.keys():
-            if best_letter is None:
+            if best_letter is None or dict_mat[letter][n] > best_val:
                 best_letter = letter
                 best_val = dict_mat[letter][n]
-            else:
-                if dict_mat[letter][n] > best_val:
-                    best_letter = letter
-                    best_val = dict_mat[letter][n]
 
         out += best_letter
 
@@ -78,7 +78,7 @@ def get_position_distributions_by_shift(df, col, shifts):
         s = shifts[n]
 
         if s not in result:
-            result[s] = list()
+            result[s] = []
 
         result[s].append(df.at[idx, "Mod"].items[col])
 
@@ -100,13 +100,13 @@ def first_non_gap(l_seq):
     for l in list_seqs:
         p = first_non_gap(l)
         if p not in pos_to_list_seqs.keys():
-            pos_to_list_seqs[p] = list()
+            pos_to_list_seqs[p] = []
 
         pos_to_list_seqs[p].append(l)
 
     # reappend into single list and sort per position
-    output = list()
-    output_counts = list()
+    output = []
+    output_counts = []
 
     for p in sorted(pos_to_list_seqs.keys()):
         # get counts per item
         counter = Counter(pos_to_list_seqs[p])
@@ -129,7 +129,7 @@ def print_reduced_msa(msa_t, sort_by_starting_position=False, n=None):
     out = ""
     counter = 0
     for s, c in zip(list_sequences, counts):
-        out += "{} {}\n".format(s, c)
+        out += f"{s} {c}\n"
         if n is not None and counter >= n:
             break
@@ -167,7 +167,7 @@ def run_msa_on_sequence_file(pf_fasta, pf_msa, **kwargs):
     num_processors = get_value(kwargs, "num_processors", None)
     output_order = get_value(kwargs, "outputorder", "input-order")
 
-    log.debug("Number of processors for MSA: {}".format(num_processors))
+    log.debug(f"Number of processors for MSA: {num_processors}")
     other_options = dict()
     if num_processors is not None:
         other_options["threads"] = num_processors
@@ -190,12 +190,12 @@ def run_msa_on_sequences(env, sequences, **kwargs):
     fn_tmp_prefix = get_value(kwargs, "fn_tmp_prefix", "", default_if_none=True)
 
     # write sequences to file
-    pf_fasta = os_join(pd_work, "{}tmp_sequences.fasta".format(fn_tmp_prefix))
+    pf_fasta = os_join(pd_work, f"{fn_tmp_prefix}tmp_sequences.fasta")
     remove_p(pf_fasta)
     write_sequence_list_to_fasta_file(sequences, pf_fasta)
 
     # run msa
-    pf_msa = os_join(pd_work, "{}tmp_msa.txt".format(fn_tmp_prefix))
+    pf_msa = os_join(pd_work, f"{fn_tmp_prefix}tmp_msa.txt")
     run_msa_on_sequence_file(pf_fasta, pf_msa, **kwargs)
 
     msa_t = MSAType.init_from_file(pf_msa)
@@ -208,7 +208,7 @@ def run_msa_on_sequences(env, sequences, **kwargs):
 
 def gather_consensus_sequences(env, df, col):
     # type: (Environment, pd.DataFrame, str) -> List[str]
 
-    sequences = list()
+    sequences = []
 
     for idx in df.index:
         d = df.at[idx, "Mod"].items[col]  # type: Dict[str, List[float]]
@@ -220,13 +220,9 @@ def gather_consensus_sequences(env, df, col):
             best_val = None
 
             for letter in d.keys():
-                if best_letter is None:
+                if best_letter is None or d[letter][n] > best_val:
                     best_letter = letter
                     best_val = d[letter][n]
-                else:
-                    if d[letter][n] > best_val:
-                        best_letter = letter
-                        best_val = d[letter][n]
 
             out += best_letter
         sequences.append(out)
@@ -248,13 +248,9 @@ def create_numpy_for_column_with_extended_motif(env, df, col, other=dict()):
     other["msa_t"] = msa_t
 
     # get position of shift
-    shifts = list()
+    shifts = []
     for s in msa_t.list_alignment_sequences:
-        p = 0
-        for pos in range(len(s)):
-            if s[pos] != "-":
-                p = pos
-                break
+        p = next((pos for pos in range(len(s)) if s[pos] != "-"), 0)
         shifts.append(p)
 
     msa_t = run_msa_on_sequences(env, consensus_seqs, outputorder="tree-order")
@@ -377,8 +373,10 @@ def
turn_off_components(pf_mod_original, pf_new_mod, components_off, native_codi mod_string = clean_up_start_context(mod_string, t, delete_sc) if native_coding_off: - mod_string = re.sub(r"\$TO_NATIVE" + r"\s+\d+\.\d+", f"$TO_NATIVE 0.0", mod_string) - mod_string = re.sub(r"\$TO_MGM" + r"\s+\d+\.\d+", f"$TO_MGM 1.0", mod_string) + mod_string = re.sub( + r"\$TO_NATIVE" + r"\s+\d+\.\d+", "$TO_NATIVE 0.0", mod_string + ) + mod_string = re.sub(r"\$TO_MGM" + r"\s+\d+\.\d+", "$TO_MGM 1.0", mod_string) with open(pf_new_mod, "w") as f_out: f_out.write(mod_string) @@ -432,10 +430,10 @@ def component_in_model_file(env, gi, component): with open(pf_mod, "r") as f: mod_string = f.read() - for t in key_to_gms2_tags(component): - if re.findall(r"\$" + t + r"[\s\n]", mod_string): - return True - return False + return any( + re.findall(r"\$" + t + r"[\s\n]", mod_string) + for t in key_to_gms2_tags(component) + ) def run_mgm(env, pf_sequence, pf_mgm, pf_prediction, **kwargs): @@ -474,7 +472,7 @@ def run_fgs(env, pf_sequence, pf_prediction, **kwargs): # prog=f"eval \"$(docker-machine env default)\"; docker run -v {env['pd-base']}:{env['pd-base']} quay.io/biocontainers/fraggenescan:1.31--h516909a_2 " \ # f"run_FragGeneScan.pl -genome={pf_sequence} -complete=0" \ # f" -out={pf_prediction} -train=illumina_10" - pf_mod = f"complete" + pf_mod = "complete" cmd = f"{prog} -s {pf_sequence} -o {pf_prediction} -w 1 -t complete" log.info(cmd) @@ -512,7 +510,7 @@ def convert_mga_output_to_gff(output_str, pf_output): end -= frame # if stop is partial, check if should be shifted - if partial == "10" or partial == "00": + if partial in ["10", "00"]: gene_length = end - start + 1 rem = gene_length % 3 if rem > 0: @@ -551,7 +549,7 @@ def convert_mga_output_to_gff(output_str, pf_output): def run_mga(env, pf_sequence, pf_prediction, **kwargs): # type: (Environment, str, str) -> None - prog = f"mga" + prog = "mga" cmd = f"{prog} -m {pf_sequence}" output = run_shell_cmd(cmd) convert_mga_output_to_gff(output, pf_prediction) @@ -606,9 +604,7 @@ def run_prodigal(env, pf_sequence, pf_prediction, **kwargs): pe_tool = os_join(env["pd-bin-external"], "prodigal", "prodigal") cmd_run = f"cd {env['pd-work']};\n" - cmd_run += "{} -i {} -g {} -o {} -f gff -q \n".format( - pe_tool, pf_sequence, gcode, pf_prediction - ) + cmd_run += f"{pe_tool} -i {pf_sequence} -g {gcode} -o {pf_prediction} -f gff -q \n" run_shell_cmd(cmd_run) @@ -621,9 +617,7 @@ def run_meta_prodigal(env, pf_sequence, pf_prediction, **kwargs): pe_tool = os_join(env["pd-bin-external"], "prodigal", "prodigal") cmd_run = f"cd {env['pd-work']};\n" - cmd_run += "{} -i {} -g {} -o {} -f gff -q -p meta \n".format( - pe_tool, pf_sequence, gcode, pf_prediction - ) + cmd_run += f"{pe_tool} -i {pf_sequence} -g {gcode} -o {pf_prediction} -f gff -q -p meta \n" run_shell_cmd(cmd_run) @@ -633,9 +627,7 @@ def run_meta_prodigal_autogcode(env, pf_sequence, pf_prediction, **kwargs): pe_tool = os_join(env["pd-bin-external"], "prodigal", "prodigal") cmd_run = f"cd {env['pd-work']};\n" - cmd_run += "{} -i {} -o {} -f gff -q -p meta \n".format( - pe_tool, pf_sequence, pf_prediction - ) + cmd_run += f"{pe_tool} -i {pf_sequence} -o {pf_prediction} -f gff -q -p meta \n" run_shell_cmd(cmd_run) @@ -700,9 +692,7 @@ def train_gms2_model(env, pf_new_seq, pf_labels_lst, pf_mod, **kwargs): run_shell_cmd( cmd ) - mod = GMS2Mod.init_from_file(pf_mod) - - return mod + return GMS2Mod.init_from_file(pf_mod) def relative_entropy(motif, background, component=None): @@ -736,11 +726,10 @@ def run_mgm2_autogcode(env, 
pf_sequence, pf_prediction, **kwargs):
     p4 = get_value(kwargs, "p4", 10)
     p11 = get_value(kwargs, "p11", 20)
 
-    pf_summary = get_value(kwargs, "pf_summary", None)
-    opt = ""
-    if pf_summary:
+    if pf_summary := get_value(kwargs, "pf_summary", None):
         opt = f" --pf-summary {pf_summary} "
-
+    else:
+        opt = ""
     cmd = f"{prog} --seq {pf_sequence} --out {pf_prediction} --clean --p4 {p4} --p11 {p11} {opt}"
 
     run_shell_cmd(cmd)
@@ -776,7 +765,7 @@ def count_mismatches(s1, s2):
     # type: (str, str) -> int
 
     assert(len(s1) == len(s2))
-    return sum([1 for i in range(len(s1)) if s1[i] != s2[i]])
+    return sum(1 for i in range(len(s1)) if s1[i] != s2[i])
 
 
 def helper_clusters_by_heuristic(env, df):
@@ -785,15 +774,14 @@
     clusters = [0] * len(seqs)
 
     freqs = df["CONSENSUS_RBS_MAT"].value_counts().to_dict()
-    unique_seqs_ordered = [s for s in sorted(freqs.keys(), key=lambda item: item[1], reverse=True)]
+    unique_seqs_ordered = sorted(freqs.keys(), key=lambda item: item[1], reverse=True)
 
     seq_to_cluster = dict()
     cluster_to_seqs = dict()  # type: (Dict[int, List[str]])
 
     cluster_id = 0
 
-    for i in range(len(unique_seqs_ordered)):
-        s = unique_seqs_ordered[i]
+    for s in unique_seqs_ordered:
 
         # try and find an existing cluster
         found_id = None
diff --git a/code/python/lib/mg_options/options.py b/code/python/lib/mg_options/options.py
index 4a8604a..6ec36e4 100644
--- a/code/python/lib/mg_options/options.py
+++ b/code/python/lib/mg_options/options.py
@@ -88,7 +88,7 @@ def read_from_file(pf_options):
             f = open(pf_options, "r")
             return yaml.load(f, Loader=yaml.FullLoader)
         except IOError:
-            logger.warning("Options File Not Found: {}".format(pf_options))
+            logger.warning(f"Options File Not Found: {pf_options}")
             return dict()
 
     @staticmethod
@@ -99,18 +99,14 @@ def read_from_defaults_file(pf_default):
         try:
             f = open(pf_default, "r")
             return yaml.load(f, Loader=yaml.FullLoader)
         except IOError:
-            logger.warning("Defaults File Not Found: {}".format(pf_default))
+            logger.warning(f"Defaults File Not Found: {pf_default}")
             return dict()
 
     @staticmethod
     def merge_custom_with_default(default, custom):
-        # type: (Dict[str, Any], Dict[str, Any]) -> Dict[str, Any]
-
-        if default is None and custom is None:
-            return dict()
-        if default is None:
-            return custom
+        if default is None:
+            return dict() if custom is None else custom
 
         if custom is None:
             return default
@@ -147,7 +142,7 @@ def _check_requirements(self):
 
         for r in requirements:
             if r not in self._options or self._options[r] is None:
-                raise ValueError("Option required: {}".format(r))
+                raise ValueError(f"Option required: {r}")
 
     def required(self):
         # type: () -> Union[Set[str], None]
diff --git a/code/python/lib/mg_parallelization/generic_threading.py b/code/python/lib/mg_parallelization/generic_threading.py
index a4ea679..24a6768 100644
--- a/code/python/lib/mg_parallelization/generic_threading.py
+++ b/code/python/lib/mg_parallelization/generic_threading.py
@@ -13,7 +13,7 @@ def wait_for_all(active_threads):
     # type: (List[threading.Thread]) -> List[threading.Thread]
 
-    done_threads = list()
+    done_threads = []
     while True:
         if len(active_threads) == 0:
             break
@@ -68,7 +68,7 @@ def __init__(self, func, list_func_kwargs, **kwargs):
         self._thread_id = get_value(kwargs, "thread_id", self.ident)
 
     def run(self):
-        list_outputs = list()
+        list_outputs = []
         for func_kwargs in self._list_func_kwargs:
             output = self._func(**func_kwargs)
 
@@ -84,18 +84,16 @@ def run_one_per_thread(data, func, data_arg_name, func_kwargs, **kwargs):
 
     simultaneous_runs = get_value(kwargs,
"simultaneous_runs", 8) - active_threads = list() - thread_id = 0 + active_threads = [] output = dict() # type: Dict[Any, List[Any]] - for dp in data: + for thread_id, dp in enumerate(data): # Create a thread for genome and run thread = GenericThread(func, {data_arg_name: dp, **func_kwargs}, output=output, thread_id=thread_id) thread.start() - thread_id += 1 active_threads.append(thread) logger.debug(f"Number of active threads: {len(active_threads)}") @@ -107,9 +105,7 @@ def run_one_per_thread(data, func, data_arg_name, func_kwargs, **kwargs): wait_for_all(active_threads) - return [ - l for l in output.values() - ] + return list(output.values()) def run_n_per_thread(data, func, data_arg_name, func_kwargs, **kwargs): @@ -125,7 +121,7 @@ def run_n_per_thread(data, func, data_arg_name, func_kwargs, **kwargs): if n * simultaneous_runs > len(data): n = math.ceil(len(data) / simultaneous_runs) - active_threads = list() + active_threads = [] thread_id = 0 thread_kwargs = dict() @@ -137,7 +133,7 @@ def run_n_per_thread(data, func, data_arg_name, func_kwargs, **kwargs): thread_id += 1 # get n datapoints - infos = list() + infos = [] counter = 0 while i < len(data): infos.append({ @@ -159,7 +155,7 @@ def run_n_per_thread(data, func, data_arg_name, func_kwargs, **kwargs): if len(active_threads) >= simultaneous_runs: wait_for_any(active_threads) - # time.sleep(5) + # time.sleep(5) wait_for_all(active_threads) @@ -181,7 +177,7 @@ def run_slice_per_thread(data, func, data_arg_name, func_kwargs, **kwargs): if n * simultaneous_runs > len(data): n = math.ceil(len(data) / simultaneous_runs) - active_threads = list() + active_threads = [] thread_id = 0 thread_kwargs = {} diff --git a/code/python/lib/mg_parallelization/pbs.py b/code/python/lib/mg_parallelization/pbs.py index 4a228ac..0dfbfcf 100644 --- a/code/python/lib/mg_parallelization/pbs.py +++ b/code/python/lib/mg_parallelization/pbs.py @@ -99,17 +99,11 @@ def run(self, data, func, func_kwargs, **kwargs): ) - # 4) Merge end-results - data_output = None - if not self._dry_run: - data_output = self.merge_output_package_files(list_pf_output_job_packages) - - # 5) Clean - #if self._prl_options.safe_get("pbs-clean"): - #remove_p(*list_pf_input_job) - # remove_p(*list_pf_output_job_packages) - - return data_output + return ( + self.merge_output_package_files(list_pf_output_job_packages) + if not self._dry_run + else None + ) def run_on_generator(self, gen_data, func, func_kwargs, **kwargs): # type: (Generator, Callable, Dict[str, Any], Dict[str, Any]) -> Any @@ -151,19 +145,13 @@ def run_on_generator(self, gen_data, func, func_kwargs, **kwargs): merge_kwargs = get_value(kwargs, "merge_kwargs", dict()) - # 4) Merge end-results - - data_output = None - if not self._dry_run: - data_output = self.merge_output_package_files(list_pf_output_job_packages, as_generator=True, - **merge_kwargs) - - # 5) Clean - #if self._prl_options.safe_get("pbs-clean"): - #remove_p(*[f"{x}.pkl" for x in list_pf_input_job]) - # remove_p(*[f"{x}.pkl" for x in list_pf_output_job_packages]) - - return data_output + return ( + self.merge_output_package_files( + list_pf_output_job_packages, as_generator=True, **merge_kwargs + ) + if not self._dry_run + else None + ) def create_input_package_files(self, data, func, func_kwargs, num_splits, **kwargs): @@ -202,12 +190,9 @@ def create_input_package_files(self, data, func, func_kwargs, num_splits, **kwar for d in list_split_data: split_collector.append(d) - # Write package to disk - list_pf_data = 
self._package_and_save_list_data(list_split_data, func, func_kwargs, - pf_package_template_formatted) - - # return list of filenames - return list_pf_data + return self._package_and_save_list_data( + list_split_data, func, func_kwargs, pf_package_template_formatted + ) def create_input_package_files_from_generator(self, data, func, func_kwargs, num_splits, **kwargs): # type: (Dict, Callable, Dict[str, Any], int, Dict[str, Any]) -> List[str] @@ -237,17 +222,10 @@ def create_input_package_files_from_generator(self, data, func, func_kwargs, num kwargs, "pf_package_template_formatted", os.path.join(pd_work_pbs, "input_package_{}") ) - # Split data - # list_split_data = self._splitter(data, num_splits, **split_kwargs) - - # Write package to disk - list_pf_data = self._package_and_save_list_data_from_generator( + return self._package_and_save_list_data_from_generator( data, func, func_kwargs, pf_package_template_formatted, **split_kwargs ) - # return list of filenames - return list_pf_data - def execute_function_on_input_packages(self, pf_input_package_template_formatted, job_name, num_jobs): # type: (str, str, int) -> List[str] """ @@ -265,7 +243,7 @@ def execute_function_on_input_packages(self, pf_input_package_template_formatted pf_input_package_template = pf_input_package_template_formatted.format("${PBS_ARRAYID}") # create pbs file - pf_output_package_template = "{}_output".format(pf_input_package_template) + pf_output_package_template = f"{pf_input_package_template}_output" self._create_pbs_file(job_name, num_jobs, pf_pbs, pf_input_package_template, pf_output_package_template) # run @@ -275,12 +253,15 @@ def execute_function_on_input_packages(self, pf_input_package_template_formatted # wait for jobs to end self._wait_for_job_array(array_job_name, pd_head) - # collect all output files - list_pf_outputs = [] - for x in range(1, num_jobs + 1): - if os.path.isfile(PBS.create_concrete_from_template(pf_output_package_template + ".pkl", x)): - list_pf_outputs.append(PBS.create_concrete_from_template(pf_output_package_template, x)) - + list_pf_outputs = [ + PBS.create_concrete_from_template(pf_output_package_template, x) + for x in range(1, num_jobs + 1) + if os.path.isfile( + PBS.create_concrete_from_template( + f"{pf_output_package_template}.pkl", x + ) + ) + ] # write summary file pf_pbs_summary = os.path.join(self._prl_options["pbs-pd-head"], self._prl_options["pbs-fn-summary"]) write_to_file("\n".join(list_pf_outputs), pf_pbs_summary) @@ -290,7 +271,7 @@ def execute_function_on_input_packages(self, pf_input_package_template_formatted @staticmethod def _qsub(pf_pbs): # type: (str) -> str - return run_shell_cmd("qsub -V " + pf_pbs, do_not_log=True).strip() + return run_shell_cmd(f"qsub -V {pf_pbs}", do_not_log=True).strip() def _read_data_from_output_packages(self, list_pf_output_packages, as_generator=False): @@ -312,12 +293,9 @@ def merge_output_package_files(self, list_pf_output_packages, **kwargs): list_output_data = self._read_data_from_output_packages(list_pf_output_packages, as_generator) if not as_generator: - list_output_data = [x for x in list_output_data] + list_output_data = list(list_output_data) - # 4-a) Merge data while loading packages one by one - data_output = self._merger(list_output_data, **kwargs) - - return data_output + return self._merger(list_output_data, **kwargs) def _package_and_save_data(self, data, func, func_kwargs, pf_package): # type: (Dict[str, Any], Callable, Dict[str, Any], str) -> None @@ -335,17 +313,13 @@ def _package_and_save_data(self, data, func, 
func_kwargs, pf_package): def _package_and_save_list_data(self, list_data, func, func_kwargs, pf_package_template_formatted): # type: (List[Dict[str, Any]], Callable, Dict[str, Any], str) -> List[str] - list_pf = list() - file_number = 1 - - for data in list_data: + list_pf = [] + for file_number, data in enumerate(list_data, start=1): pf_save = pf_package_template_formatted.format(file_number) self._package_and_save_data(data, func, func_kwargs, pf_save) list_pf.append(pf_save) - file_number += 1 - return list_pf def _package_and_save_list_data_from_generator(self, gen_data, func, func_kwargs, pf_package_template_formatted, @@ -353,17 +327,13 @@ def _package_and_save_list_data_from_generator(self, gen_data, func, func_kwargs # type: (Generator, Callable, Dict[str, Any], str, Dict[str, Any]) -> List[str] arg_name_data = get_value(kwargs, "arg_name_data", "data") - list_pf = list() - file_number = 1 - - for data in gen_data: + list_pf = [] + for file_number, data in enumerate(gen_data, start=1): pf_save = pf_package_template_formatted.format(file_number) self._package_and_save_data({arg_name_data: data}, func, func_kwargs, pf_save) list_pf.append(pf_save) - file_number += 1 - return list_pf def _create_pbs_file(self, jobname, num_jobs, pf_pbs, pf_input_package_template, pf_output_package_template): @@ -401,15 +371,7 @@ def _generate_call_command(env, pf_job_input, pf_job_output, prl_options, pd_com pd_compute = os.path.abspath(os.path.join(prl_options["pbs-pd-root-compute"], prl_options["pbs-dn-compute"])) pd_job_template = os.path.join(pd_compute, "job_${PBS_ARRAYID}") - cmd = "{} --pf-job-input {} --pf-job-output {} --pd-work {} -l {}".format( - "python {}".format(os.path.join(env["pd-code"], "python/driver", "run-pbs-job.py")), - pf_job_input, - pf_job_output, - pd_job_template, - "DEBUG" # log.level - ) - - return cmd + return f'python {os.path.join(env["pd-code"], "python/driver", "run-pbs-job.py")} --pf-job-input {pf_job_input} --pf-job-output {pf_job_output} --pd-work {pd_job_template} -l DEBUG' @staticmethod def create_concrete_from_template(pf_template, file_number): @@ -463,15 +425,15 @@ def _cmd_run_dummy_and_wait(pf_dummy, jobname_dummy, jobname_array): def generate_pbs_header(job_name, working_dir=".", num_nodes=1, ppn=1, walltime="00:30:00"): pbs_text = "" - pbs_text += "#PBS -N " + str(job_name) + "\n" - pbs_text += "#PBS -o " + str(working_dir) + "\n" + pbs_text += f"#PBS -N {str(job_name)}" + "\n" + pbs_text += f"#PBS -o {str(working_dir)}" + "\n" pbs_text += "#PBS -j oe" + "\n" - pbs_text += "#PBS -l nodes=" + str(num_nodes) + ":ppn=" + str(ppn) + "\n" - pbs_text += "#PBS -l walltime=" + str(walltime) + "\n" + pbs_text += f"#PBS -l nodes={str(num_nodes)}:ppn={str(ppn)}" + "\n" + pbs_text += f"#PBS -l walltime={str(walltime)}" + "\n" pbs_text += "#PBS -W umask=002" + "\n" - pbs_text += "set PBS_O_WORKDIR = " + str(working_dir) + "\n" + pbs_text += f"set PBS_O_WORKDIR = {str(working_dir)}" + "\n" pbs_text += "cd $PBS_O_WORKDIR \n" pbs_text += "echo The working directory is `echo $PBS_O_WORKDIR`" + "\n" @@ -503,34 +465,32 @@ def _generate_pbs_header_array(num_jobs, job_name, prl_options, pd_compute): mkdir_p(pd_pbs_logs) node_property = prl_options.safe_get("pbs-node-property") - if node_property is not None: - node_property = ":" + node_property - else: - node_property = "" - + node_property = f":{node_property}" if node_property is not None else "" pbs_text = "" - pbs_text += "#PBS -N " + str(job_name) + "\n" + pbs_text += f"#PBS -N {str(job_name)}" + "\n" pbs_text += "#PBS 
-o " + "{}/{}".format(pd_pbs_logs, "error_${PBS_ARRAYID}") + "\n" pbs_text += "#PBS -j oe" + "\n" - pbs_text += "#PBS -l nodes=" + str(num_nodes) + ":ppn=" + str(ppn) + "{}\n".format(node_property) - pbs_text += "#PBS -l walltime=" + str(walltime) + "\n" + pbs_text += ( + f"#PBS -l nodes={str(num_nodes)}:ppn={str(ppn)}" + f"{node_property}\n" + ) + pbs_text += f"#PBS -l walltime={str(walltime)}" + "\n" if prl_options: - array_param = "1-{}".format(num_jobs) + array_param = f"1-{num_jobs}" if prl_options["pbs-concurrent-nodes"]: total_concurrent_jobs = prl_options["pbs-concurrent-nodes"] * int(8 / ppn) - array_param = "{}%{}".format(array_param, total_concurrent_jobs) + array_param = f"{array_param}%{total_concurrent_jobs}" - pbs_text += "#PBS -t {}".format(array_param) + "\n" + pbs_text += f"#PBS -t {array_param}" + "\n" pbs_text += "#PBS -W umask=002" + "\n" #pbs_text += "export PATH=\"/home/karl/anaconda/envs/biogem_sbsp/bin:$PATH\"\n" - pbs_text += "mkdir -p {}".format(pd_job_template) + "\n" + pbs_text += f"mkdir -p {pd_job_template}" + "\n" - pbs_text += "PBS_O_WORKDIR=" + pd_job_template + "\n" + pbs_text += f"PBS_O_WORKDIR={pd_job_template}" + "\n" pbs_text += "cd $PBS_O_WORKDIR \n" pbs_text += "sleep 10\n" diff --git a/code/python/lib/mg_pbs_data/mergers.py b/code/python/lib/mg_pbs_data/mergers.py index 24d4ade..85f098d 100644 --- a/code/python/lib/mg_pbs_data/mergers.py +++ b/code/python/lib/mg_pbs_data/mergers.py @@ -33,10 +33,8 @@ def merge_dataframes_to_file(dfs, **kwargs): counter = 0 # counter == 0 means write header if not append: remove_p(pf_output) - else: - # disable header if exists - if os.path.isfile(pf_output) and file_not_empty(pf_output): - counter = 1 + elif os.path.isfile(pf_output) and file_not_empty(pf_output): + counter = 1 header = None for df in dfs: @@ -44,16 +42,14 @@ def merge_dataframes_to_file(dfs, **kwargs): #print(list(df.columns.values)) if header is None: header = list(df.columns.values) - else: - - if header != list(df.columns.values): - log.warning("Could not append dataframe to file. Header inconsistent") - print (header) - print(list(df.columns.values)) - print (" ") + elif header != list(df.columns.values): + log.warning("Could not append dataframe to file. 
Header inconsistent") + print (header) + print(list(df.columns.values)) + print (" ") # import pdb # pdb.set_trace() - continue + continue df.to_csv(pf_output, index=False, mode="a", header=counter==0) counter += 1 @@ -61,7 +57,7 @@ def merge_dataframes_to_file(dfs, **kwargs): def merge_lists(list_lists): # type: (List[List[T]]) -> List[T] - merged = list() + merged = [] for l in list_lists: merged += l diff --git a/code/python/lib/mg_pbs_data/splitters.py b/code/python/lib/mg_pbs_data/splitters.py index d60d0ad..d490df8 100644 --- a/code/python/lib/mg_pbs_data/splitters.py +++ b/code/python/lib/mg_pbs_data/splitters.py @@ -43,16 +43,13 @@ def split_gil(data, num_splits, **kwargs): gil = data - list_of_list_of_gi = list() - for i in range(num_splits): - list_of_list_of_gi.append(list()) - + list_of_list_of_gi = [[] for _ in range(num_splits)] for index, gi in enumerate(gil): index_of_list = index % num_splits list_of_list_of_gi[index_of_list].append(gi) - list_output = list() + list_output = [] for i in range(len(list_of_list_of_gi)): val = { arg_name_gil: GenomeInfoList(list_of_list_of_gi[i]), @@ -76,16 +73,13 @@ def split_list(data, num_splits, **kwargs): arg_name_jobid = get_value(kwargs, "arg_name_jobid", None, value_type=str) - list_of_lists = list() - for i in range(num_splits): - list_of_lists.append(list()) - + list_of_lists = [[] for _ in range(num_splits)] for index, item in enumerate(data): index_of_list = index % num_splits list_of_lists[index_of_list].append(item) - list_output = list() + list_output = [] for i in range(len(list_of_lists)): val = { arg_name_data: list_of_lists[i], @@ -108,16 +102,13 @@ def split_generator(data, num_splits, **kwargs): arg_name_jobid = get_value(kwargs, "arg_name_jobid", None, value_type=str) - list_of_lists = list() - for i in range(num_splits): - list_of_lists.append(list()) - + list_of_lists = [[] for _ in range(num_splits)] for index, item in enumerate(data): index_of_list = index % num_splits list_of_lists[index_of_list].append(item) - list_output = list() + list_output = [] for i in range(len(list_of_lists)): val = { arg_name_data: list_of_lists[i], @@ -147,14 +138,10 @@ def split_list_DEPRECATED(data, num_splits, pd_work, **kwargs): list_pf_data = data["list_pf_data"] pf_output_template = data["pf_output_template"] - list_splits = list() - - split_number = 1 - for v in list_pf_data: - list_splits.append({"pf_data": v, "pf_output": pf_output_template.format(split_number)}) - split_number += 1 - - return list_splits + return [ + {"pf_data": v, "pf_output": pf_output_template.format(split_number)} + for split_number, v in enumerate(list_pf_data, start=1) + ] def split_dict(data, num_splits, pd_work, **kwargs): @@ -163,18 +150,14 @@ def split_dict(data, num_splits, pd_work, **kwargs): a_dict = data["dict"] # type: Dict[str, Any] pf_output_template = data["pf_output_template"] - list_splits = list() + list_splits = [] num_splits = min(num_splits, len(a_dict)) - for i in range(num_splits): + for _ in range(num_splits): list_splits.append(dict()) list_splits[-1]["data"] = dict() - index = 0 - - for k, v in a_dict.items(): + for index, (k, v) in enumerate(a_dict.items()): list_splits[index % num_splits]["data"][k] = v - index += 1 - for split_number in range(1, len(list_splits) + 1): list_splits[split_number - 1]["pf_output"] = pf_output_template.format(split_number) list_splits[split_number - 1]["msa_output_start"] = split_number @@ -190,13 +173,8 @@ def split_genome_info_list(data, num_splits, pd_work, **kwargs): pf_output_template = 
get_value(data, "pf_output_template", "") - if num_splits > len(genome_info_list): - num_splits = len(genome_info_list) - - list_of_list_of_gi = list() - for i in range(num_splits): - list_of_list_of_gi.append(list()) - + num_splits = min(num_splits, len(genome_info_list)) + list_of_list_of_gi = [[] for _ in range(num_splits)] for index, gi in enumerate(genome_info_list): index_of_list = index % num_splits @@ -211,22 +189,16 @@ def split_genome_info_list(data, num_splits, pd_work, **kwargs): def split_q3prime_files(data, num_splits, pd_work, **kwargs): - # type: (Dict[str, Any], int, str, Dict[str, Any]) -> List[Dict[str, str]] - - file_number = 1 - - list_splits = list() + list_splits = [] q3prime_to_list_pf = data["q3prime_to_list_pf"] pf_output_template = data["pf_output_template"] - for q3prime_key in q3prime_to_list_pf.keys(): + for file_number, q3prime_key in enumerate(q3prime_to_list_pf.keys(), start=1): list_pf = q3prime_to_list_pf[q3prime_key] list_splits.append({"list_pf_data": list_pf, "pf_output": pf_output_template.format(file_number), "q3prime": q3prime_key, "msa_output_start": file_number}) - file_number += 1 - return list_splits @@ -247,7 +219,7 @@ def split_list_and_remerge_by_key(data, num_splits, pd_work, **kwargs): group_key = data["group_key"] pf_output_template = data["pf_output_template"] - list_pf_new = list() + list_pf_new = [] for pf_old in list_pf_data: @@ -295,7 +267,7 @@ def split_query_genomes_target_genomes_one_vs_group(data, num_splits, pd_work, * pf_t_list = data["pf_t_list"] pf_output_template = data["pf_output_template"] - list_pf_splits = list() + list_pf_splits = [] q_list = GenomeInfoList.init_from_file(pf_q_list) t_list = GenomeInfoList.init_from_file(pf_t_list) diff --git a/code/python/lib/mg_stats/shelf.py b/code/python/lib/mg_stats/shelf.py index 77c080e..d136a4d 100644 --- a/code/python/lib/mg_stats/shelf.py +++ b/code/python/lib/mg_stats/shelf.py @@ -20,11 +20,10 @@ def all_columns_equal(df, columns=None): if columns is None: columns = df.columns.values - # create condition list - conditions = list() - for i in range(1, len(columns)): - conditions.append(f"(df[{columns[i-1]}] == df[{columns[i]}])") - + conditions = [ + f"(df[{columns[i - 1]}] == df[{columns[i]}])" + for i in range(1, len(columns)) + ] return eval(" & ".join(conditions)) @@ -41,18 +40,22 @@ def create_joint_reference_from_list(df, list_reference): reference_values = df.loc[reference_rows, f"3p-{list_reference[0]}"] df.loc[reference_rows, f"3p-{reference}"] = reference_values - list_partial = [f"'Partial3p-{r}'" for r in list_reference if f"Partial3p-{r}" in df.columns.values] - if len(list_partial) > 0: - + if list_partial := [ + f"'Partial3p-{r}'" + for r in list_reference + if f"Partial3p-{r}" in df.columns.values + ]: reference_rows = all_columns_equal(df, list_partial) reference_values = df.loc[reference_rows, f"Partial3p-{list_reference[0]}"] df.loc[reference_rows, f"Partial3p-{reference}"] = reference_values - list_partial = [f"'Partial5p-{r}'" for r in list_reference if f"Partial5p-{r}" in df.columns.values] - if len(list_partial) > 0: - + if list_partial := [ + f"'Partial5p-{r}'" + for r in list_reference + if f"Partial5p-{r}" in df.columns.values + ]: reference_rows = all_columns_equal(df, list_partial) reference_values = df.loc[reference_rows, f"Partial5p-{list_reference[0]}"] df.loc[reference_rows, f"Partial5p-{reference}"] = reference_values @@ -103,12 +106,12 @@ def tidy_genome_level(env, df): "Number of Comp Match", "Number of Comp Found", "Precision", "Recall", 
"WR", "Number of Missed", "Sensitivity", "Specificity", "Error Rate", "IC3p Match", "IC5p Match", "Comp Match"] - df_total = list() + df_total = [] list_index = [x for x in ["Genome", "Clade", "Chunk Size", "Genome GC", "Number in Reference"] if x in df.columns] for v in values_to_melt: value_vars = [x for x in df.columns if v == x.split("(")[0].strip()] - if len(value_vars) == 0: + if not value_vars: continue df_curr = pd.melt(df, id_vars=list_index, value_vars=value_vars, @@ -136,7 +139,7 @@ def check_tools_and_reference_lists(df, tools, ref_5p, ref_3p): # get tools list # If not provided, extract from df # Make sure it doesn't contain any references - all_tools = sorted(set([x.split("-")[1] for x in df.columns if "5p-" in x])) + all_tools = sorted({x.split("-")[1] for x in df.columns if "5p-" in x}) # check that references exist for list_ref in [ref_5p, ref_3p]: diff --git a/code/python/lib/mg_stats/small.py b/code/python/lib/mg_stats/small.py index a139278..f058d7d 100644 --- a/code/python/lib/mg_stats/small.py +++ b/code/python/lib/mg_stats/small.py @@ -16,7 +16,7 @@ def get_stats_at_gcfid_level_with_reference(df, tools, reference): # type: (pd.DataFrame, List[str], str) -> pd.DataFrame - list_entries = list() + list_entries = [] for gcfid, df_group in df.groupby("Genome", as_index=False): @@ -62,10 +62,10 @@ def _helper_join_reference_and_tidy_data(env, df_per_gene, tools, list_ref): reference = _helper_df_joint_reference(df_per_gene, list_ref) df_per_gene = update_dataframe_with_stats(df_per_gene, tools, reference).copy() - #### Genome Level: compute stats per genome - df_stats_gcfid = list() - for _, df_group in df_per_gene.groupby("Genome", as_index=False): - df_stats_gcfid.append(get_stats_at_gcfid_level_with_reference(df_group, tools, reference)) + df_stats_gcfid = [ + get_stats_at_gcfid_level_with_reference(df_group, tools, reference) + for _, df_group in df_per_gene.groupby("Genome", as_index=False) + ] df_per_genome = pd.concat(df_stats_gcfid, ignore_index=True, sort=False) ### Tidy Data and filter out those not present in tools or reference diff --git a/code/python/lib/mg_viz/colormap.py b/code/python/lib/mg_viz/colormap.py index 501c8d8..ad0fbc1 100644 --- a/code/python/lib/mg_viz/colormap.py +++ b/code/python/lib/mg_viz/colormap.py @@ -14,19 +14,6 @@ def _init_mapping_ancestors(): palette = seaborn.xkcd_palette(colors) return {x[0]: x[1] for x in zip(ancestors, palette)} -# def _init_mapping_ancestors(): -# colors = ["windows blue", "amber", "faded green", "dusty purple"] -# ancestors = ["Archaea", "Actinobacteria", "Enterobacterales", "FCB group"] -# -# color_pal = seaborn.color_palette("colorblind", 6).as_hex() -# colors = ','.join(color_pal) -# palette = seaborn.color_palette(color_pal) -# - - - - return {x[0]: x[1] for x in zip(ancestors, palette)} - def _init_mapping_verified(): colors = ["windows blue", "amber", "faded green", "dusty purple", "pale red"] ancestors = ["E. coli", "H. salinarum", "N. pharaonis", "M. tuberculosis", "R. 
denitrificans"] @@ -96,7 +83,7 @@ def get_map(name): # type: (str) -> Dict[str, Any] if name not in ColorMap._mappings: - raise ValueError("Unknown color mapping for: {}".format(name)) + raise ValueError(f"Unknown color mapping for: {name}") return ColorMap._mappings[name] diff --git a/code/python/lib/mg_viz/general.py b/code/python/lib/mg_viz/general.py index 4568a9a..4f8c407 100644 --- a/code/python/lib/mg_viz/general.py +++ b/code/python/lib/mg_viz/general.py @@ -216,8 +216,7 @@ def plot_scatter_matrix(df_data, column_names, color_by, figure_options=None, ** else: df_features = df_data[column_names] - should_jitter = get_value(kwargs, "jitter", False) - if should_jitter: + if should_jitter := get_value(kwargs, "jitter", False): jitter(df_features, column_names) fig, ax = plt.subplots() @@ -229,11 +228,6 @@ def plot_scatter_matrix(df_data, column_names, color_by, figure_options=None, ** else: ax = sns.pairplot(df_features, plot_kws={"s": 10}) - # for lh in ax._legend.legendHandles: - # lh.set_alpha(1) - # lh._sizes = [50] - - # sm = scatter_matrix(df_features, diagonal="kde", figsize=(10, 10)) # # Change label rotation # # [s.xaxis.label.set_rotation(45) for s in sm.reshape(-1)] diff --git a/code/python/lib/mg_viz/mgm_motif_model.py b/code/python/lib/mg_viz/mgm_motif_model.py index 2b56f79..5d13700 100644 --- a/code/python/lib/mg_viz/mgm_motif_model.py +++ b/code/python/lib/mg_viz/mgm_motif_model.py @@ -154,7 +154,7 @@ def visualize(mgm_mm, title="", **kwargs): MGMMotifModelVisualizer._viz_logo(mgm_mm, ax_logo) MGMMotifModelVisualizer._viz_msa(msa_t, ax_text) - plt.suptitle("Gc range: {}".format(title)) + plt.suptitle(f"Gc range: {title}") plt.tight_layout() plt.subplots_adjust(top=0.9) @@ -173,14 +173,17 @@ def _viz_motif_pwm_from_raw_data(raw_motif_data, axes, motif_width): for l, ax in zip(letters, axes): # for each position in motif # go through df and accumulate values - all_positions = list() - all_probs = list() + all_positions = [] + all_probs = [] for w_pos in range(array.shape[1]): for index in range(len(shifts)): shifted_position = w_pos - if w_pos < shifts[index] or w_pos >= shifts[index] + motif_width: + if ( + shifted_position < shifts[index] + or shifted_position >= shifts[index] + motif_width + ): continue all_positions.append(shifted_position) @@ -190,8 +193,8 @@ def _viz_motif_pwm_from_raw_data(raw_motif_data, axes, motif_width): raise ValueError("Something's up") all_probs.append(array[index, shifted_position, letter_to_idx[l]]) - # ax.scatter(all_gc, all_probs, marker="+") - # seaborn.regplot(all_gc, all_probs, ax=ax, lowess=True, scatter_kws={"s": 5, "alpha": 0.3}) + # ax.scatter(all_gc, all_probs, marker="+") + # seaborn.regplot(all_gc, all_probs, ax=ax, lowess=True, scatter_kws={"s": 5, "alpha": 0.3}) ax.set_title(f"{l}") df = pd.DataFrame({"Position": all_positions, "Probability": all_probs}) diff --git a/code/python/lib/mg_viz/mgm_motif_model_v2.py b/code/python/lib/mg_viz/mgm_motif_model_v2.py index d5640d0..f34ba92 100644 --- a/code/python/lib/mg_viz/mgm_motif_model_v2.py +++ b/code/python/lib/mg_viz/mgm_motif_model_v2.py @@ -171,7 +171,7 @@ def visualize(mgm_mm, title="", **kwargs): MGMMotifModelVisualizerV2._viz_heuristic(msa_t, ax_text) - plt.suptitle("Gc range: {}".format(title)) + plt.suptitle(f"Gc range: {title}") plt.tight_layout() plt.subplots_adjust(top=0.9) @@ -192,8 +192,8 @@ def _viz_motif_pwm_from_raw_data(raw_motif_data, axes, motif_width): for l, ax in zip(letters, axes): # for each position in motif # go through df and accumulate values - 
all_positions = list() - all_probs = list() + all_positions = [] + all_probs = [] for w_pos in range(array.shape[1]): # go through all motifs at current position diff --git a/code/python/lib/mg_viz/shelf.py b/code/python/lib/mg_viz/shelf.py index 382a4af..babf141 100644 --- a/code/python/lib/mg_viz/shelf.py +++ b/code/python/lib/mg_viz/shelf.py @@ -36,7 +36,7 @@ def update_tool_names_to_full(names): "prodigal": "Prodigal" } - updated = list() + updated = [] for n in names: n = n.lower() if n in d: diff --git a/code/python/lib/mg_viz/stats_large.py b/code/python/lib/mg_viz/stats_large.py index f13b60a..f97d6ac 100644 --- a/code/python/lib/mg_viz/stats_large.py +++ b/code/python/lib/mg_viz/stats_large.py @@ -45,11 +45,7 @@ def plot_gc_stats_side_by_side(env, df_tidy, columns, tool_order, reference, **k from collections import abc axes_unr = axes - if not isinstance(axes, abc.Iterable): - axes = [axes] - else: - axes = axes.ravel() - + axes = [axes] if not isinstance(axes, abc.Iterable) else axes.ravel() ax = None i = j = 0 fontsize="small" @@ -105,14 +101,9 @@ def plot_gc_stats_side_by_side(env, df_tidy, columns, tool_order, reference, **k # }[l.lower()] for l in labels] labels = update_tool_names_to_full(labels) - if legend_pos == "bottom" or True: - leg = fig.legend(handles, labels, bbox_to_anchor=(0.5, 0.1), loc='upper center', ncol=legend_cols, - bbox_transform=fig.transFigure, frameon=False, - fontsize="xx-small") - else: - leg = fig.legend(handles, labels, bbox_to_anchor=(1.05, 0.5), loc='center left', - frameon=False, - fontsize=18) + leg = fig.legend(handles, labels, bbox_to_anchor=(0.5, 0.1), loc='upper center', ncol=legend_cols, + bbox_transform=fig.transFigure, frameon=False, + fontsize="xx-small") for lh in leg.legendHandles: lh.set_alpha(1) lh.set_sizes([18] * (len(tool_order))) @@ -121,11 +112,10 @@ def plot_gc_stats_side_by_side(env, df_tidy, columns, tool_order, reference, **k for i in range(col_wrap): fig.align_ylabels(axes_unr[:,i]) - if legend_pos == "bottom" or True: - if num_rows == 1: - fig.tight_layout(rect=[0,0.05,1,1]) - else: - fig.tight_layout(rect=[0,0.1,1,1]) + if num_rows == 1: + fig.tight_layout(rect=[0,0.05,1,1]) + else: + fig.tight_layout(rect=[0,0.1,1,1]) # else: # fig.tight_layout(rect=[0, 0, 1, 1]) fig.savefig(next_name(env["pd-work"]), bbox_extra_artists=(leg,)) #bbox_inches='tight' @@ -271,11 +261,7 @@ def viz_stats_large_5p_error_vs_gc_by_clade(env, df_tidy, reference, **kwargs): from collections import abc axes_unr = axes - if not isinstance(axes, abc.Iterable): - axes = [axes] - else: - axes = axes.ravel() - + axes = [axes] if not isinstance(axes, abc.Iterable) else axes.ravel() ax = None fontsize = "xx-small" counter = 0 diff --git a/code/python/lib/mg_viz/stats_small.py b/code/python/lib/mg_viz/stats_small.py index 339aec4..55ff6bd 100644 --- a/code/python/lib/mg_viz/stats_small.py +++ b/code/python/lib/mg_viz/stats_small.py @@ -56,7 +56,7 @@ def stats_small_3p_missed_vs_length(env, df_per_gene, tools, reference): # collect in bins bins = [[0, 150], [150, 300], [300, 600], [600, 900], [900, float('inf')]] - list_entries = list() + list_entries = [] for t in tools + [reference]: df_tool = df_per_gene[~df_per_gene[f"5p-{t}"].isna()] for b in bins:
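
A closing note that applies to several hunks in this patch (`read_labels_from_file`, `read_lst`, `read_fgs_format`, `run_mgm2_autogcode`, `plot_scatter_matrix`, and the `list_partial` rewrites in mg_stats/shelf.py): the assignment expressions introduced there require Python 3.8 or newer, so the patch raises the minimum supported interpreter version. A minimal sketch of the rewrite pattern, with a made-up regex and input line chosen only for illustration:

    import re

    # Hypothetical pattern and input; not taken from the codebase.
    pattern = re.compile(r"(\d+)\s+([+-])")
    line = "42 +"

    # Before: m = pattern.match(line); if m: ...
    # After (Python >= 3.8): bind and test in a single expression.
    if m := pattern.match(line):
        position, strand = int(m.group(1)), m.group(2)
        print(position, strand)
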