-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Sourcery refactored master branch #1
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -87,20 +87,19 @@ def get_features_from_prediction(tool, pf_prediction, gcode_true, tag): | |
labels_per_seqname = dict() | ||
for lab in labels: | ||
if lab.seqname() not in labels_per_seqname: | ||
labels_per_seqname[lab.seqname()] = list() | ||
labels_per_seqname[lab.seqname()] = [] | ||
|
||
labels_per_seqname[lab.seqname()].append(lab) | ||
|
||
counter = 0 | ||
for seqname in labels_per_seqname: | ||
for counter, (seqname, value) in enumerate(labels_per_seqname.items()): | ||
Comment on lines
-90
to
+94
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
entries[seqname] = dict() | ||
|
||
total_score = 0 | ||
avg_gene_length = 0 | ||
avg_gc = 0 | ||
num_genes = 0 | ||
|
||
for lab in labels_per_seqname[seqname]: | ||
for lab in value: | ||
score = lab.get_attribute_value("score") | ||
try: | ||
score = float(score) | ||
|
@@ -117,17 +116,16 @@ def get_features_from_prediction(tool, pf_prediction, gcode_true, tag): | |
avg_gene_length += abs(lab.right() - lab.left() + 1) | ||
|
||
|
||
avg_gene_length /= num_genes if num_genes > 0 else 0 | ||
avg_gc /= num_genes if num_genes > 0 else 0 | ||
avg_gene_length /= max(num_genes, 0) | ||
avg_gc /= max(num_genes, 0) | ||
entries[seqname] = { | ||
f"{tag}: Total Score": total_score, | ||
f"{tag}: Average Gene Length": avg_gene_length, | ||
f"{tag}: Average Gene GC": avg_gc, | ||
f"{tag}: Number of Genes": num_genes | ||
} | ||
counter += 1 | ||
# if counter > 5: | ||
# break | ||
# if counter > 5: | ||
# break | ||
return entries | ||
|
||
|
||
|
@@ -184,9 +182,6 @@ def build_gcode_features_for_gi_for_chunk(env, gi, tool, chunk, **kwargs): | |
pf_chunks = mkstemp_closed(dir=env["pd-work"], suffix=".fasta") | ||
gs.write_to_file(pf_chunks) | ||
|
||
list_entries = list() | ||
|
||
|
||
Comment on lines
-187
to
-189
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
pd_run = os_join(env["pd-work"], gi.name, f"{dn_prefix}{dn}_{chunk}") | ||
mkdir_p(pd_run) | ||
|
||
|
@@ -195,8 +190,7 @@ def build_gcode_features_for_gi_for_chunk(env, gi, tool, chunk, **kwargs): | |
gcode_true=gcode_true, **kwargs) | ||
|
||
results["Genome"] = gi.name | ||
list_entries.append(results) | ||
|
||
list_entries = [results] | ||
remove_p(pf_prediction) | ||
remove_p(pf_chunks) | ||
|
||
|
@@ -205,7 +199,7 @@ def build_gcode_features_for_gi_for_chunk(env, gi, tool, chunk, **kwargs): | |
|
||
def build_gcode_features_for_gi(env, gi, tool, chunks, **kwargs): | ||
# type: (Environment, GenomeInfo, str, List[int], Dict[str, Any]) -> pd.DataFrame | ||
list_df = list() | ||
list_df = [] | ||
Comment on lines
-208
to
+202
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
num_processors = get_value(kwargs, "num_processors", 1, valid_type=int) | ||
|
||
if num_processors > 1: | ||
|
@@ -217,7 +211,7 @@ def build_gcode_features_for_gi(env, gi, tool, chunks, **kwargs): | |
) | ||
|
||
else: | ||
list_df = list() | ||
list_df = [] | ||
for chunk in chunks: | ||
logger.debug(f"{gi.name};{chunk}") | ||
curr = build_gcode_features_for_gi_for_chunk(env, gi, tool, chunk, **kwargs) | ||
|
@@ -227,14 +221,10 @@ def build_gcode_features_for_gi(env, gi, tool, chunks, **kwargs): | |
|
||
|
||
def build_gcode_features(env, gil, tool, chunks, **kwargs): | ||
# type: (Environment, GenomeInfoList, str, List[int], Dict[str, Any]) -> pd.DataFrame | ||
list_df = list() | ||
|
||
for gi in gil: | ||
list_df.append( | ||
build_gcode_features_for_gi(env, gi, tool, chunks, **kwargs) | ||
) | ||
|
||
list_df = [ | ||
build_gcode_features_for_gi(env, gi, tool, chunks, **kwargs) | ||
for gi in gil | ||
] | ||
Comment on lines
-230
to
+227
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
This removes the following comments ( why? ):
|
||
return pd.concat(list_df, ignore_index=True, sort=False) | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -142,7 +142,7 @@ def build_mgm_motif_models_for_all_gc(env, df, name, **kwargs): | |
binned_dfs = bin_by_gc(df, step=bin_size) | ||
|
||
# for each binned dataframe, build specific model | ||
list_mgm_models = list() # type: List[Tuple[float, float, MGMMotifModel]] | ||
list_mgm_models = [] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
This removes the following comments ( why? ):
|
||
for info in binned_dfs: | ||
lower, upper, df_gc = info | ||
|
||
|
@@ -158,7 +158,7 @@ def build_mgm_motif_models_for_all_gc(env, df, name, **kwargs): | |
|
||
if mgm_mm is None: | ||
# use previous model | ||
if len(list_mgm_models) > 0: | ||
if list_mgm_models: | ||
prev = list_mgm_models[-1][2] | ||
list_mgm_models.append([lower, upper, prev]) | ||
else: | ||
|
@@ -190,7 +190,7 @@ def build_mgm_models(env, df, pf_output): | |
} | ||
|
||
name_to_models = dict() # type: Dict[str, Dict[str, Dict[str, MGMMotifModelAllGC]]] | ||
for species_type in type_model_group.keys(): | ||
for species_type in type_model_group: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
name_to_models[species_type] = dict() # type: Dict[str, Dict[str, MGMMotifModelAllGC]] | ||
for name in type_model_group[species_type].keys(): | ||
name_to_models[species_type][name] = dict() | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -127,7 +127,7 @@ def get_loess(local_x, local_y): | |
def visualize_start_codons(env, viz_collector): | ||
# type: (Environment, Dict[str, Dict[str, Dict[str, Any]]]) -> None | ||
|
||
list_entries = list() | ||
list_entries = [] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
||
for genome_type in viz_collector: | ||
for group in viz_collector[genome_type]: | ||
|
@@ -139,15 +139,19 @@ def visualize_start_codons(env, viz_collector): | |
y = vals["y"] | ||
y_fit = vals["y_fit"] | ||
|
||
for i in range(len(x)): | ||
list_entries.append({ | ||
list_entries.extend( | ||
{ | ||
"Genome Type": genome_type, | ||
"Group": group if genome_type == "Bacteria" else f"A*,D*", | ||
"Group": group | ||
if genome_type == "Bacteria" | ||
else "A*,D*", | ||
"Codon": codon, | ||
"x": x[i], | ||
"y": y[i], | ||
"y_fit": y_fit[i] | ||
}) | ||
"y_fit": y_fit[i], | ||
} | ||
for i in range(len(x)) | ||
) | ||
if genome_type == "Archaea": | ||
break | ||
|
||
|
@@ -198,7 +202,7 @@ def add_codon_probabilities(env, df, mgm, codons, gms2_group, **kwargs): | |
|
||
df = df[df["Type"] == genome_type].copy() | ||
|
||
list_entries = list() | ||
list_entries = [] | ||
Comment on lines
-201
to
+205
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
||
fig, ax = plt.subplots() | ||
# values_per_codon = dict() | ||
|
@@ -302,8 +306,8 @@ def add_stop_codon_probabilities(env, df, mgm, **kwargs): | |
def compute_bin_averages(x, y, x_min, x_max, x_step): | ||
# type: (List[float], List[float], float, float, float) -> [List[float], List[float]] | ||
|
||
x_out = list() | ||
y_out = list() | ||
x_out = [] | ||
y_out = [] | ||
Comment on lines
-305
to
+310
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
|
||
current = 0 | ||
for x_tag in np.arange(x_min, x_max, x_step): | ||
|
@@ -319,7 +323,7 @@ def compute_bin_averages(x, y, x_min, x_max, x_step): | |
total += 1 | ||
current += 1 | ||
|
||
if total == 0 and len(y_out) == 0: | ||
if total == 0 and not y_out: | ||
continue | ||
avg = y_out[-1] if total == 0 else acc / float(total) | ||
x_out.append(x_tag) | ||
|
@@ -422,7 +426,7 @@ def add_start_context_probabilities(env, df, mgm, input_tag, output_tag, **kwarg | |
# add gc models to mgm | ||
# for genome_tag in ["A", "B"]: # genome_type[0] FIXME | ||
genome_tag = genome_type[0] | ||
for gc_tag in sc_gc.keys(): | ||
for gc_tag in sc_gc: | ||
Comment on lines
-425
to
+429
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
mgm.items_by_species_and_gc[genome_tag][str(gc_tag)].items[output_tag + "_MAT"] = sc_gc[gc_tag] | ||
mgm.items_by_species_and_gc[genome_tag][str(gc_tag)].items[f"{output_tag}"] = 1 | ||
mgm.items_by_species_and_gc[genome_tag][str(gc_tag)].items[f"{output_tag}_ORDER"] = 2 | ||
|
@@ -515,7 +519,7 @@ def build_mgm_motif_models_for_all_gc(env, df, name, **kwargs): | |
binned_dfs = bin_by_gc(df, step=bin_size, gc_feature=gc_feature) | ||
|
||
# for each binned dataframe, build specific model | ||
list_mgm_models = list() # type: List[List[float, float, MGMMotifModelV2]] | ||
list_mgm_models = [] | ||
Comment on lines
-518
to
+522
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
This removes the following comments ( why? ):
|
||
for info in binned_dfs: | ||
lower, upper, df_gc = info | ||
# | ||
|
@@ -528,7 +532,7 @@ def build_mgm_motif_models_for_all_gc(env, df, name, **kwargs): | |
|
||
if mgm_mm is None: | ||
# use previous model | ||
if len(list_mgm_models) > 0: | ||
if list_mgm_models: | ||
prev = list_mgm_models[-1][2] | ||
list_mgm_models.append([lower, upper, prev]) | ||
else: | ||
|
@@ -560,37 +564,24 @@ def add_motif_probabilities(env, df, mgm, input_tag, output_tag, genome_type, ** | |
|
||
motif = motif_by_gc.get_model_by_gc(gc) | ||
|
||
if True or "RBS" in output_tag: | ||
# create a label for each shift | ||
for shift, prob in motif._shift_prior.items(): | ||
prob /= 100.0 | ||
output_tag_ws = f"{output_tag}_{int(shift)}" | ||
try: | ||
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_MAT"] = motif._motif[shift] | ||
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_POS_DISTR"] = \ | ||
motif._spacer[ | ||
shift] | ||
except KeyError: | ||
pass | ||
|
||
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}"] = 1 | ||
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_ORDER"] = 0 | ||
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_WIDTH"] = width | ||
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_MARGIN"] = 0 | ||
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_MAX_DUR"] = dur | ||
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_SHIFT"] = prob | ||
else: | ||
# promoter aren't shifted (for now) | ||
best_shift = max(motif._shift_prior.items(), key=operator.itemgetter(1))[0] | ||
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag}_MAT"] = motif._motif[best_shift] | ||
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag}_POS_DISTR"] = motif._spacer[ | ||
best_shift] | ||
|
||
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag}"] = 1 | ||
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag}_ORDER"] = 0 | ||
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag}_WIDTH"] = width | ||
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag}_MARGIN"] = 0 | ||
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag}_MAX_DUR"] = dur | ||
# create a label for each shift | ||
for shift, prob in motif._shift_prior.items(): | ||
prob /= 100.0 | ||
output_tag_ws = f"{output_tag}_{int(shift)}" | ||
try: | ||
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_MAT"] = motif._motif[shift] | ||
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_POS_DISTR"] = \ | ||
motif._spacer[ | ||
shift] | ||
except KeyError: | ||
pass | ||
|
||
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}"] = 1 | ||
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_ORDER"] = 0 | ||
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_WIDTH"] = width | ||
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_MARGIN"] = 0 | ||
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_MAX_DUR"] = dur | ||
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_SHIFT"] = prob | ||
Comment on lines
-563
to
+584
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
This removes the following comments ( why? ):
|
||
|
||
|
||
def _build_start_or_stop_codons(env, df, mgm, genome_type, codons, **kwargs): | ||
|
@@ -666,9 +657,9 @@ def _build_motifs(env, df, mgm, genome_type, tag, **kwargs): | |
|
||
learn_from_component = learn_from[tag] # get for component | ||
|
||
df_type = df[df["Type"] == genome_type] | ||
if genome_type == "Archaea": | ||
|
||
df_type = df[df["Type"] == genome_type] | ||
Comment on lines
+660
to
-671
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Function
|
||
for o, l in learn_from_component[genome_type].items(): | ||
if "PROMOTER" in tag and o != "D": | ||
continue # promoters are only in group D | ||
|
@@ -681,7 +672,6 @@ def _build_motifs(env, df, mgm, genome_type, tag, **kwargs): | |
) | ||
else: | ||
|
||
df_type = df[df["Type"] == genome_type] | ||
for o, l in learn_from_component[genome_type].items(): | ||
if "PROMOTER" in tag and o != "C": | ||
continue # promoters are only in group C | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Function
main
refactored with the following changes: remove-unreachable-code
)