Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sourcery refactored master branch #1

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 0 additions & 22 deletions code/python/driver/analyze_gcode_parameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,28 +89,6 @@ def main(env, args):
# plt.show()
return

fig = plt.figure()
ax = fig.gca(projection='3d')
ax.plot_trisurf(df['Chunk Size'], df['p4,p11'], df['Match Rate'], linewidth=0.2)
ax.set_xlabel("Chunk Size")
ax.set_ylabel("Parameters")
ax.set_zlabel("Match Rate")
plt.show()

df2 = df[df["Tool"] == "mgm2"].groupby(["p4", "p11"], as_index=False).mean()

idx = df2["Match Rate"].argmax()
p4 = df2.at[idx, "p4"]
p11 = df2.at[idx, "p11"]
df_best = df[(df["p4"] == p4) & (df["p11"] == p11)]
df_alex = df[(df["p4"] == 10) & (df["p11"] == 20)]
fig, ax = plt.subplots()
sns.lineplot("Chunk Size", "Match Rate", data=df_best, label="Optimized")
sns.lineplot("Chunk Size", "Match Rate", data=df[df["Tool"] == "mprodigal"], label="MProdigal")
sns.lineplot("Chunk Size", "Match Rate", data=df_alex, label="Original")
ax.set_ylim(0, 1)
plt.show()
Comment on lines 91 to -112
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function main refactored with the following changes: the plotting code that followed the early `return` (22 lines, apparently unreachable) is removed.





Expand Down
38 changes: 14 additions & 24 deletions code/python/driver/build_gcode_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,20 +87,19 @@ def get_features_from_prediction(tool, pf_prediction, gcode_true, tag):
labels_per_seqname = dict()
for lab in labels:
if lab.seqname() not in labels_per_seqname:
labels_per_seqname[lab.seqname()] = list()
labels_per_seqname[lab.seqname()] = []

labels_per_seqname[lab.seqname()].append(lab)

counter = 0
for seqname in labels_per_seqname:
for counter, (seqname, value) in enumerate(labels_per_seqname.items()):
Comment on lines -90 to +94
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function get_features_from_prediction refactored with the following changes:

entries[seqname] = dict()

total_score = 0
avg_gene_length = 0
avg_gc = 0
num_genes = 0

for lab in labels_per_seqname[seqname]:
for lab in value:
score = lab.get_attribute_value("score")
try:
score = float(score)
Expand All @@ -117,17 +116,16 @@ def get_features_from_prediction(tool, pf_prediction, gcode_true, tag):
avg_gene_length += abs(lab.right() - lab.left() + 1)


avg_gene_length /= num_genes if num_genes > 0 else 0
avg_gc /= num_genes if num_genes > 0 else 0
avg_gene_length /= max(num_genes, 0)
avg_gc /= max(num_genes, 0)
entries[seqname] = {
f"{tag}: Total Score": total_score,
f"{tag}: Average Gene Length": avg_gene_length,
f"{tag}: Average Gene GC": avg_gc,
f"{tag}: Number of Genes": num_genes
}
counter += 1
# if counter > 5:
# break
# if counter > 5:
# break
return entries


Expand Down Expand Up @@ -184,9 +182,6 @@ def build_gcode_features_for_gi_for_chunk(env, gi, tool, chunk, **kwargs):
pf_chunks = mkstemp_closed(dir=env["pd-work"], suffix=".fasta")
gs.write_to_file(pf_chunks)

list_entries = list()


Comment on lines -187 to -189
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function build_gcode_features_for_gi_for_chunk refactored with the following changes:

pd_run = os_join(env["pd-work"], gi.name, f"{dn_prefix}{dn}_{chunk}")
mkdir_p(pd_run)

Expand All @@ -195,8 +190,7 @@ def build_gcode_features_for_gi_for_chunk(env, gi, tool, chunk, **kwargs):
gcode_true=gcode_true, **kwargs)

results["Genome"] = gi.name
list_entries.append(results)

list_entries = [results]
remove_p(pf_prediction)
remove_p(pf_chunks)

Expand All @@ -205,7 +199,7 @@ def build_gcode_features_for_gi_for_chunk(env, gi, tool, chunk, **kwargs):

def build_gcode_features_for_gi(env, gi, tool, chunks, **kwargs):
# type: (Environment, GenomeInfo, str, List[int], Dict[str, Any]) -> pd.DataFrame
list_df = list()
list_df = []
Comment on lines -208 to +202
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function build_gcode_features_for_gi refactored with the following changes:

num_processors = get_value(kwargs, "num_processors", 1, valid_type=int)

if num_processors > 1:
Expand All @@ -217,7 +211,7 @@ def build_gcode_features_for_gi(env, gi, tool, chunks, **kwargs):
)

else:
list_df = list()
list_df = []
for chunk in chunks:
logger.debug(f"{gi.name};{chunk}")
curr = build_gcode_features_for_gi_for_chunk(env, gi, tool, chunk, **kwargs)
Expand All @@ -227,14 +221,10 @@ def build_gcode_features_for_gi(env, gi, tool, chunks, **kwargs):


def build_gcode_features(env, gil, tool, chunks, **kwargs):
# type: (Environment, GenomeInfoList, str, List[int], Dict[str, Any]) -> pd.DataFrame
list_df = list()

for gi in gil:
list_df.append(
build_gcode_features_for_gi(env, gi, tool, chunks, **kwargs)
)

list_df = [
build_gcode_features_for_gi(env, gi, tool, chunks, **kwargs)
for gi in gil
]
Comment on lines -230 to +227
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function build_gcode_features refactored with the following changes:

This removes the following comments (why?):

# type: (Environment, GenomeInfoList, str, List[int], Dict[str, Any]) -> pd.DataFrame

return pd.concat(list_df, ignore_index=True, sort=False)


Expand Down
6 changes: 3 additions & 3 deletions code/python/driver/build_mgm_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def build_mgm_motif_models_for_all_gc(env, df, name, **kwargs):
binned_dfs = bin_by_gc(df, step=bin_size)

# for each binned dataframe, build specific model
list_mgm_models = list() # type: List[Tuple[float, float, MGMMotifModel]]
list_mgm_models = []
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function build_mgm_motif_models_for_all_gc refactored with the following changes:

This removes the following comments (why?):

# type: List[Tuple[float, float, MGMMotifModel]]

for info in binned_dfs:
lower, upper, df_gc = info

Expand All @@ -158,7 +158,7 @@ def build_mgm_motif_models_for_all_gc(env, df, name, **kwargs):

if mgm_mm is None:
# use previous model
if len(list_mgm_models) > 0:
if list_mgm_models:
prev = list_mgm_models[-1][2]
list_mgm_models.append([lower, upper, prev])
else:
Expand Down Expand Up @@ -190,7 +190,7 @@ def build_mgm_models(env, df, pf_output):
}

name_to_models = dict() # type: Dict[str, Dict[str, Dict[str, MGMMotifModelAllGC]]]
for species_type in type_model_group.keys():
for species_type in type_model_group:
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function build_mgm_models refactored with the following changes:

name_to_models[species_type] = dict() # type: Dict[str, Dict[str, MGMMotifModelAllGC]]
for name in type_model_group[species_type].keys():
name_to_models[species_type][name] = dict()
Expand Down
82 changes: 36 additions & 46 deletions code/python/driver/build_mgm_models_from_gms2_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def get_loess(local_x, local_y):
def visualize_start_codons(env, viz_collector):
# type: (Environment, Dict[str, Dict[str, Dict[str, Any]]]) -> None

list_entries = list()
list_entries = []
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function visualize_start_codons refactored with the following changes:


for genome_type in viz_collector:
for group in viz_collector[genome_type]:
Expand All @@ -139,15 +139,19 @@ def visualize_start_codons(env, viz_collector):
y = vals["y"]
y_fit = vals["y_fit"]

for i in range(len(x)):
list_entries.append({
list_entries.extend(
{
"Genome Type": genome_type,
"Group": group if genome_type == "Bacteria" else f"A*,D*",
"Group": group
if genome_type == "Bacteria"
else "A*,D*",
"Codon": codon,
"x": x[i],
"y": y[i],
"y_fit": y_fit[i]
})
"y_fit": y_fit[i],
}
for i in range(len(x))
)
if genome_type == "Archaea":
break

Expand Down Expand Up @@ -198,7 +202,7 @@ def add_codon_probabilities(env, df, mgm, codons, gms2_group, **kwargs):

df = df[df["Type"] == genome_type].copy()

list_entries = list()
list_entries = []
Comment on lines -201 to +205
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function add_codon_probabilities refactored with the following changes:


fig, ax = plt.subplots()
# values_per_codon = dict()
Expand Down Expand Up @@ -302,8 +306,8 @@ def add_stop_codon_probabilities(env, df, mgm, **kwargs):
def compute_bin_averages(x, y, x_min, x_max, x_step):
# type: (List[float], List[float], float, float, float) -> [List[float], List[float]]

x_out = list()
y_out = list()
x_out = []
y_out = []
Comment on lines -305 to +310
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function compute_bin_averages refactored with the following changes:


current = 0
for x_tag in np.arange(x_min, x_max, x_step):
Expand All @@ -319,7 +323,7 @@ def compute_bin_averages(x, y, x_min, x_max, x_step):
total += 1
current += 1

if total == 0 and len(y_out) == 0:
if total == 0 and not y_out:
continue
avg = y_out[-1] if total == 0 else acc / float(total)
x_out.append(x_tag)
Expand Down Expand Up @@ -422,7 +426,7 @@ def add_start_context_probabilities(env, df, mgm, input_tag, output_tag, **kwarg
# add gc models to mgm
# for genome_tag in ["A", "B"]: # genome_type[0] FIXME
genome_tag = genome_type[0]
for gc_tag in sc_gc.keys():
for gc_tag in sc_gc:
Comment on lines -425 to +429
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function add_start_context_probabilities refactored with the following changes:

mgm.items_by_species_and_gc[genome_tag][str(gc_tag)].items[output_tag + "_MAT"] = sc_gc[gc_tag]
mgm.items_by_species_and_gc[genome_tag][str(gc_tag)].items[f"{output_tag}"] = 1
mgm.items_by_species_and_gc[genome_tag][str(gc_tag)].items[f"{output_tag}_ORDER"] = 2
Expand Down Expand Up @@ -515,7 +519,7 @@ def build_mgm_motif_models_for_all_gc(env, df, name, **kwargs):
binned_dfs = bin_by_gc(df, step=bin_size, gc_feature=gc_feature)

# for each binned dataframe, build specific model
list_mgm_models = list() # type: List[List[float, float, MGMMotifModelV2]]
list_mgm_models = []
Comment on lines -518 to +522
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function build_mgm_motif_models_for_all_gc refactored with the following changes:

This removes the following comments (why?):

# type: List[List[float, float, MGMMotifModelV2]]

for info in binned_dfs:
lower, upper, df_gc = info
#
Expand All @@ -528,7 +532,7 @@ def build_mgm_motif_models_for_all_gc(env, df, name, **kwargs):

if mgm_mm is None:
# use previous model
if len(list_mgm_models) > 0:
if list_mgm_models:
prev = list_mgm_models[-1][2]
list_mgm_models.append([lower, upper, prev])
else:
Expand Down Expand Up @@ -560,37 +564,24 @@ def add_motif_probabilities(env, df, mgm, input_tag, output_tag, genome_type, **

motif = motif_by_gc.get_model_by_gc(gc)

if True or "RBS" in output_tag:
# create a label for each shift
for shift, prob in motif._shift_prior.items():
prob /= 100.0
output_tag_ws = f"{output_tag}_{int(shift)}"
try:
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_MAT"] = motif._motif[shift]
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_POS_DISTR"] = \
motif._spacer[
shift]
except KeyError:
pass

mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}"] = 1
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_ORDER"] = 0
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_WIDTH"] = width
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_MARGIN"] = 0
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_MAX_DUR"] = dur
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_SHIFT"] = prob
else:
# promoter aren't shifted (for now)
best_shift = max(motif._shift_prior.items(), key=operator.itemgetter(1))[0]
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag}_MAT"] = motif._motif[best_shift]
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag}_POS_DISTR"] = motif._spacer[
best_shift]

mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag}"] = 1
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag}_ORDER"] = 0
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag}_WIDTH"] = width
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag}_MARGIN"] = 0
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag}_MAX_DUR"] = dur
# create a label for each shift
for shift, prob in motif._shift_prior.items():
prob /= 100.0
output_tag_ws = f"{output_tag}_{int(shift)}"
try:
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_MAT"] = motif._motif[shift]
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_POS_DISTR"] = \
motif._spacer[
shift]
except KeyError:
pass

mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}"] = 1
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_ORDER"] = 0
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_WIDTH"] = width
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_MARGIN"] = 0
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_MAX_DUR"] = dur
mgm.items_by_species_and_gc[genome_tag][str(gc)].items[f"{output_tag_ws}_SHIFT"] = prob
Comment on lines -563 to +584
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

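Function add_motif_probabilities refactored with the following changes: the always-true `if True or "RBS" in output_tag:` guard is removed, its body is kept, and the unreachable `else` branch is dropped.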

This removes the following comments (why?):

# promoter aren't shifted (for now)



def _build_start_or_stop_codons(env, df, mgm, genome_type, codons, **kwargs):
Expand Down Expand Up @@ -666,9 +657,9 @@ def _build_motifs(env, df, mgm, genome_type, tag, **kwargs):

learn_from_component = learn_from[tag] # get for component

df_type = df[df["Type"] == genome_type]
if genome_type == "Archaea":

df_type = df[df["Type"] == genome_type]
Comment on lines +660 to -671
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Function _build_motifs refactored with the following changes:

for o, l in learn_from_component[genome_type].items():
if "PROMOTER" in tag and o != "D":
continue # promoters are only in group D
Expand All @@ -681,7 +672,6 @@ def _build_motifs(env, df, mgm, genome_type, tag, **kwargs):
)
else:

df_type = df[df["Type"] == genome_type]
for o, l in learn_from_component[genome_type].items():
if "PROMOTER" in tag and o != "C":
continue # promoters are only in group C
Expand Down
Loading