Commit

output metrics with and without filter
fernandomeyer committed Feb 16, 2021
1 parent 3ad737d commit e6addaa
Showing 4 changed files with 39 additions and 37 deletions.
28 changes: 13 additions & 15 deletions opal.py
@@ -70,11 +70,11 @@ def print_by_rank(output_dir, labels, pd_metrics):
order_rows = labels
# define ordering of columns, hard coded
order_columns = [c.UNIFRAC, c.UNW_UNIFRAC, c.L1NORM, c.RECALL, c.PRECISION, c.F1_SCORE, c.TP, c.FP, c.FN, c.OTUS, c.JACCARD, c.SHANNON_DIVERSITY, c.SHANNON_EQUIT, c.BRAY_CURTIS]
if c.FP_UNFILTERED in pd_metrics['metric'].values:
order_columns += [c.PRECISION_UNFILTERED, c.F1_SCORE_UNFILTERED, c.TP_UNFILTERED, c.FP_UNFILTERED]
if c.FP + c.UNFILTERED_SUF in pd_metrics['metric'].values:
order_columns += [metric + c.UNFILTERED_SUF for metric in order_columns]
for rank in c.ALL_RANKS:
# subset to those information that either belong to the given rank or are rank independent, i.e. are unifrac values
table = pd_metrics[(pd_metrics['rank'] == rank) | (pd_metrics['metric'].isin([c.UNIFRAC, c.UNW_UNIFRAC]))]
table = pd_metrics[(pd_metrics['rank'] == rank) | (pd_metrics['metric'].isin([c.UNIFRAC, c.UNW_UNIFRAC, c.UNIFRAC + c.UNFILTERED_SUF, c.UNW_UNIFRAC + c.UNFILTERED_SUF]))]
# reformat the table with a pivot_table
table = table.pivot_table(index=['tool', 'sample'], columns='metric', values='value')
# select only tools in labels and get rid of gold standard
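In the hunk above, the per-metric *_UNFILTERED constants are replaced by a single suffix check: if any suffixed metric (e.g. c.FP + c.UNFILTERED_SUF) appears in the frame, every column gets an unfiltered twin appended. A minimal sketch of that doubling, using literal stand-ins for the constants defined in src/utils/constants.py:

```python
# Sketch only: literal stand-ins for c.FP, c.PRECISION and c.UNFILTERED_SUF.
UNFILTERED_SUF = ' (unfiltered)'
FP, PRECISION = 'False positives', 'Purity'

order_columns = [PRECISION, FP]
metrics_present = {PRECISION, FP, PRECISION + UNFILTERED_SUF, FP + UNFILTERED_SUF}

# One suffixed metric in the frame signals that all metrics were also
# computed without the tail filter, so the column order is doubled.
if FP + UNFILTERED_SUF in metrics_present:
    order_columns += [metric + UNFILTERED_SUF for metric in order_columns]

print(order_columns)
# ['Purity', 'False positives', 'Purity (unfiltered)', 'False positives (unfiltered)']
```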
@@ -92,14 +92,16 @@ def print_by_tool(output_dir, pd_metrics):
make_sure_path_exists(os.path.join(output_dir, "by_tool"))
# define ordering of columns, hard coded
order_columns = [c.UNIFRAC, c.UNW_UNIFRAC, c.L1NORM, c.RECALL, c.PRECISION, c.F1_SCORE, c.TP, c.FP, c.FN, c.OTUS, c.JACCARD, c.SHANNON_DIVERSITY, c.SHANNON_EQUIT, c.BRAY_CURTIS]
if c.FP_UNFILTERED in pd_metrics['metric'].values:
order_columns += [c.PRECISION_UNFILTERED, c.F1_SCORE_UNFILTERED, c.TP_UNFILTERED, c.FP_UNFILTERED]
unifrac_list = [c.UNIFRAC, c.UNW_UNIFRAC]
if c.FP + c.UNFILTERED_SUF in pd_metrics['metric'].values:
order_columns += [metric + c.UNFILTERED_SUF for metric in order_columns]
unifrac_list += [c.UNIFRAC + c.UNFILTERED_SUF, c.UNW_UNIFRAC + c.UNFILTERED_SUF]
for toolname, pd_metrics_tool in pd_metrics.groupby('tool'):
if toolname == c.GS:
continue
table = pd_metrics_tool.pivot_table(index=['rank', 'sample'], columns='metric', values='value')
# little hack to carry unifrac over to every rank
for unifrac_col in order_columns[:2]:
for unifrac_col in unifrac_list:
table[unifrac_col] = pd_metrics_tool[pd_metrics_tool['metric'] == unifrac_col]['value'].values[0]
# order table
table['rank_cat'] = pd.Categorical(table.index.get_level_values('rank'), categories=c.ALL_RANKS, ordered=True)
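The loop above (the "little hack") copies the rank-independent UniFrac values, now both filtered and unfiltered, into every row of the per-tool table; assigning a single value to a DataFrame column broadcasts it to all rows. A small illustration with made-up numbers:

```python
import pandas as pd

# Per-tool table indexed by (rank, sample); made-up purity values.
table = pd.DataFrame(
    {'Purity': [0.9, 0.8]},
    index=pd.MultiIndex.from_tuples([('species', 'S1'), ('genus', 'S1')],
                                    names=['rank', 'sample']))

# UniFrac is computed once per tool, so a scalar assignment fills every rank row.
unifrac_value = 3.2  # hypothetical value taken from the tool's rank-independent row
table['Unweighted UniFrac error (unfiltered)'] = unifrac_value
print(table)
```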
@@ -167,7 +169,7 @@ def evaluate(gs_samples_list, profiles_list_to_samples_list, labels, normalize,
if filter_tail_percentage:
metrics_list = pd_metrics['metric'].unique().tolist()
pd_metrics_copy = pd_metrics.copy()
pd_metrics_copy['metric'].replace(metrics_list, [metric + ' (unfiltered)' for metric in metrics_list], inplace=True)
pd_metrics_copy['metric'].replace(metrics_list, [metric + c.UNFILTERED_SUF for metric in metrics_list], inplace=True)
pd_metrics = pd.concat([pd_metrics, pd_metrics_copy], ignore_index=True)

one_profile_assessed = False
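When a tail-filter percentage is given, the hunk above snapshots the still-unfiltered results: the metrics frame is copied, every metric name in the copy gets the shared suffix, and the copy is concatenated back, so both variants travel through the rest of the pipeline. A hedged sketch of that copy-rename-concat step with toy data:

```python
import pandas as pd

UNFILTERED_SUF = ' (unfiltered)'  # mirrors src/utils/constants.py

# Toy metrics frame standing in for pd_metrics before the filter is applied.
pd_metrics = pd.DataFrame({'metric': ['Purity', 'False positives'],
                           'value': [0.9, 4.0]})

metrics_list = pd_metrics['metric'].unique().tolist()
pd_metrics_copy = pd_metrics.copy()
# Rename every metric in the copy by appending the shared suffix.
pd_metrics_copy['metric'] = pd_metrics_copy['metric'].replace(
    metrics_list, [m + UNFILTERED_SUF for m in metrics_list])
pd_metrics = pd.concat([pd_metrics, pd_metrics_copy], ignore_index=True)

print(pd_metrics['metric'].tolist())
# ['Purity', 'False positives', 'Purity (unfiltered)', 'False positives (unfiltered)']
```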
@@ -250,13 +252,9 @@ def reformat_pandas(sample_id, label, braycurtis, shannon, binary_metrics, l1nor

# convert Binary metrics
pd_binary_metrics = pd.DataFrame([binary_metrics[rank].get_pretty_dict() for rank in binary_metrics.keys()]).set_index('rank').stack().reset_index().rename(columns={'level_1': 'metric', 0: 'value'})
if 'fpfiltered' in pd_binary_metrics['metric'].values:
oldnames = ['fp', 'fpfiltered', 'tp', 'tpfiltered', 'fn', 'jaccard', 'precision', 'precisionfiltered', 'recall', 'f1', 'f1filtered', 'otus']
newnames = [c.FP_UNFILTERED, c.FP, c.TP_UNFILTERED, c.TP, c.FN, c.JACCARD, c.PRECISION_UNFILTERED, c.PRECISION, c.RECALL, c.F1_SCORE_UNFILTERED, c.F1_SCORE, c.OTUS]
else:
oldnames = ['fp', 'tp', 'fn', 'jaccard', 'precision', 'recall', 'f1', 'otus']
newnames = [c.FP, c.TP, c.FN, c.JACCARD, c.PRECISION, c.RECALL, c.F1_SCORE, c.OTUS]
pd_binary_metrics['metric'].replace(oldnames, newnames, inplace=True)
pd_binary_metrics['metric'].replace(['fp', 'tp', 'fn', 'jaccard', 'precision', 'recall', 'f1', 'otus'],
[c.FP, c.TP, c.FN, c.JACCARD, c.PRECISION, c.RECALL, c.F1_SCORE, c.OTUS],
inplace=True)
pd_binary_metrics['sample'] = sample_id
pd_binary_metrics['tool'] = label

@@ -270,7 +268,7 @@ def reformat_pandas(sample_id, label, braycurtis, shannon, binary_metrics, l1nor

if rename_as_unfiltered:
metrics_list = pd_formatted['metric'].unique().tolist()
pd_formatted['metric'].replace(metrics_list, [metric + ' (unfiltered)' for metric in metrics_list], inplace=True)
pd_formatted['metric'].replace(metrics_list, [metric + c.UNFILTERED_SUF for metric in metrics_list], inplace=True)

return pd_formatted

6 changes: 3 additions & 3 deletions src/binary_metrics.py
@@ -170,7 +170,7 @@ def jaccard_index(tp, rank_query_taxids, rank_truth_taxids):

def f1_score(this_precision, this_recall):
""" Returns f1 score
>>> f1_score(rank_metrics.precision, rank_metrics.recall)
>>> f1_score(test_rank_metrics.precision, test_rank_metrics.recall)
1.0
"""
@@ -182,7 +182,7 @@ def f1_score(this_precision, this_recall):

def compute_rank_metrics(rank_query, rank_truth, rank):
""" Returns metrics for one rank
>>> compute_rank_metrics(test_query_rank, test_truth_rank, "species", None).get_ordered_dict()
>>> compute_rank_metrics(test_query_rank, test_truth_rank, "species").get_ordered_dict()
OrderedDict([('_RankMetrics__f1', 1.0), ('_RankMetrics__fn', 0), ('_RankMetrics__fp', 0), ('_RankMetrics__jaccard', 1.0), ('_RankMetrics__otus', 1), ('_RankMetrics__precision', 1.0), ('_RankMetrics__rank', 'species'), ('_RankMetrics__recall', 1.0), ('_RankMetrics__tp', 1)])
"""
@@ -207,7 +207,7 @@ def compute_rank_metrics(rank_query, rank_truth, rank):

def compute_tree_metrics(query, truth):
""" Return metrics for tree
>>> compute_tree_metrics(query_tree, truth_tree, None)["species"].get_ordered_dict()
>>> compute_tree_metrics(query_tree, truth_tree)["species"].get_ordered_dict()
OrderedDict([('_RankMetrics__f1', 0.5), ('_RankMetrics__fn', 1), ('_RankMetrics__fp', 3), ('_RankMetrics__jaccard', 0.3333333333333333), ('_RankMetrics__otus', 5), ('_RankMetrics__precision', 0.4), ('_RankMetrics__rank', 'species'), ('_RankMetrics__recall', 0.6666666666666666), ('_RankMetrics__tp', 2)])
"""

37 changes: 22 additions & 15 deletions src/html_opal.py
@@ -105,6 +105,10 @@ def get_rank_to_sample_pd(pd_metrics):
for index, row in pd_grouped_copy.iterrows():
pd_grouped.loc[index][c.UNIFRAC] = pd_grouped.loc[('rank independent', index[1], index[2])][c.UNIFRAC]
pd_grouped.loc[index][c.UNW_UNIFRAC] = pd_grouped.loc[('rank independent', index[1], index[2])][c.UNW_UNIFRAC]
if c.UNIFRAC + c.UNFILTERED_SUF in pd_grouped.columns:
for index, row in pd_grouped_copy.iterrows():
pd_grouped.loc[index][c.UNIFRAC + c.UNFILTERED_SUF] = pd_grouped.loc[('rank independent', index[1], index[2])][c.UNIFRAC + c.UNFILTERED_SUF]
pd_grouped.loc[index][c.UNW_UNIFRAC + c.UNFILTERED_SUF] = pd_grouped.loc[('rank independent', index[1], index[2])][c.UNW_UNIFRAC + c.UNFILTERED_SUF]

for (rank, sample), g in pd_grouped.groupby(['rank', 'sample']):
rank_to_sample_pd[rank][sample] = g.reset_index().rename(columns={'tool': 'Tool'}).drop(['rank', 'sample'], axis=1).set_index('Tool').T
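The added block above repeats the existing UniFrac carry-over for the unfiltered columns: the value stored under the 'rank independent' pseudo-rank is copied into every (rank, tool, sample) row. A small sketch of that MultiIndex lookup, written with .loc[row, column] and made-up values:

```python
import pandas as pd

col = 'Unweighted UniFrac error (unfiltered)'  # c.UNW_UNIFRAC + c.UNFILTERED_SUF
idx = pd.MultiIndex.from_tuples(
    [('rank independent', 'toolA', 'S1'),
     ('species', 'toolA', 'S1'),
     ('genus', 'toolA', 'S1')],
    names=['rank', 'tool', 'sample'])
pd_grouped = pd.DataFrame({col: [3.2, float('nan'), float('nan')]}, index=idx)

# Copy the rank-independent value into every row of the same tool and sample.
for index in pd_grouped.index:
    source = ('rank independent', index[1], index[2])
    pd_grouped.loc[index, col] = pd_grouped.loc[source, col]

print(pd_grouped)
```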
@@ -205,27 +209,27 @@ def get_colors_and_ranges(name, all_values, df_metrics):
hue2 = 240

metrics = [c.PRECISION, c.RECALL, c.F1_SCORE, c.JACCARD]
metrics = metrics + [metric + ' (unfiltered)' for metric in metrics]
metrics = metrics + [metric + c.UNFILTERED_SUF for metric in metrics]
if name in metrics:
return color1, color2, hue1, hue2, 0, 1

metrics = [c.FP, c.UNIFRAC, c.UNW_UNIFRAC]
metrics = metrics + [metric + ' (unfiltered)' for metric in metrics]
metrics = metrics + [metric + c.UNFILTERED_SUF for metric in metrics]
if name in metrics:
return color2, color1, hue2, hue1, 0, max(all_values)

if name == c.TP or name == c.TP + ' (unfiltered)':
if name == c.TP or name == c.TP + c.UNFILTERED_SUF:
return color1, color2, hue1, hue2, 0, max(all_values)

if name == c.FN or name == c.FN + ' (unfiltered)':
if name == c.FN or name == c.FN + c.UNFILTERED_SUF:
fn_values = df_metrics.loc[name, ].values
# convert "<mean> (<standard error>)" to float of <mean>
if len(fn_values) > 0 and isinstance(fn_values[0], str):
fn_values = [float(x.split(' ')[0]) for x in fn_values]
return color2, color1, hue2, hue1, 0, max(fn_values)
if name == c.L1NORM or name == c.L1NORM + ' (unfiltered)':
if name == c.L1NORM or name == c.L1NORM + c.UNFILTERED_SUF:
return color2, color1, hue2, hue1, 0, 2
if name == c.BRAY_CURTIS or name == c.BRAY_CURTIS + ' (unfiltered)':
if name == c.BRAY_CURTIS or name == c.BRAY_CURTIS + c.UNFILTERED_SUF:
return color2, color1, hue2, hue1, 0, 1
return color1, color2, hue1, hue2, max(all_values), min(all_values)

@@ -292,13 +296,21 @@ def create_metrics_table(pd_metrics, labels, sample_ids_list):
all_sample_ids = sample_ids_list[:]
all_sample_ids.insert(0, '(average over samples)')

if c.FP_UNFILTERED in pd_metrics['metric'].values:
presence_metrics = [c.RECALL, c.PRECISION, c.PRECISION_UNFILTERED, c.F1_SCORE, c.F1_SCORE_UNFILTERED, c.TP, c.TP_UNFILTERED, c.FP, c.FP_UNFILTERED, c.FN, c.JACCARD]
else:
presence_metrics = [c.RECALL, c.PRECISION, c.F1_SCORE, c.TP, c.FP, c.FN, c.JACCARD]
presence_metrics = [c.RECALL, c.PRECISION, c.F1_SCORE, c.TP, c.FP, c.FN, c.JACCARD]
estimates_metrics = [c.UNIFRAC, c.UNW_UNIFRAC, c.L1NORM, c.BRAY_CURTIS]
alpha_diversity_metics = [c.OTUS, c.SHANNON_DIVERSITY, c.SHANNON_EQUIT]
rank_independent_metrics = [c.UNIFRAC, c.UNW_UNIFRAC]

if c.FP + c.UNFILTERED_SUF in pd_metrics['metric'].values:
presence_metrics = [[metric, metric + c.UNFILTERED_SUF] for metric in presence_metrics]
presence_metrics = [metric for elem in presence_metrics for metric in elem]
estimates_metrics = [[metric, metric + c.UNFILTERED_SUF] for metric in estimates_metrics]
estimates_metrics = [metric for elem in estimates_metrics for metric in elem]
alpha_diversity_metics = [[metric, metric + c.UNFILTERED_SUF] for metric in alpha_diversity_metics]
alpha_diversity_metics = [metric for elem in alpha_diversity_metics for metric in elem]
rank_independent_metrics = [[metric, metric + c.UNFILTERED_SUF] for metric in rank_independent_metrics]
rank_independent_metrics = [metric for elem in rank_independent_metrics for metric in elem]

all_metrics = [presence_metrics, estimates_metrics, alpha_diversity_metics]

presence_metrics_label = 'Presence/absence of taxa'
Expand Down Expand Up @@ -333,11 +345,6 @@ def get_html_dict(metrics):
(c.OTUS, c.TOOLTIP_OTUS),
(c.SHANNON_DIVERSITY, c.TOOLTIP_SHANNON_DIVERSITY),
(c.SHANNON_EQUIT, c.TOOLTIP_SHANNON_EQUIT)]
if c.FP_UNFILTERED in pd_metrics['metric'].values:
metrics_tuples += [(c.FP_UNFILTERED, c.TOOLTIP_FP),
(c.TP_UNFILTERED, c.TOOLTIP_TP),
(c.PRECISION_UNFILTERED, c.TOOLTIP_PRECISION),
(c.F1_SCORE_UNFILTERED, c.TOOLTIP_F1_SCORE)]

d = get_html_dict(metrics_tuples)

5 changes: 1 addition & 4 deletions src/utils/constants.py
@@ -14,20 +14,17 @@
UNW_UNIFRAC = 'Unweighted UniFrac error'
L1NORM = 'L1 norm error'
PRECISION = 'Purity'
PRECISION_UNFILTERED = 'Purity (unfiltered)'
RECALL = 'Completeness'
F1_SCORE = 'F1 score'
F1_SCORE_UNFILTERED = 'F1 score (unfiltered)'
TP = 'True positives'
TP_UNFILTERED = 'True positives (unfiltered)'
FP = 'False positives'
FP_UNFILTERED = 'False positives (unfiltered)'
FN = "False negatives"
OTUS = "Taxon counts"
JACCARD = "Jaccard index"
SHANNON_DIVERSITY = 'Shannon diversity'
SHANNON_EQUIT = 'Shannon equitability'
BRAY_CURTIS = 'Bray-Curtis distance'
UNFILTERED_SUF = ' (unfiltered)'
ALL_METRICS = [UNIFRAC, UNW_UNIFRAC, L1NORM, PRECISION, RECALL, F1_SCORE, TP, FP, FN, JACCARD, SHANNON_DIVERSITY, SHANNON_EQUIT, BRAY_CURTIS]

TOOLTIP_UNIFRAC = 'tree-based measure of similarity between the true and predicted abundances at all taxonomic ranks ranging from 0 (high similarity) to 16 (low similarity).'
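With the per-metric *_UNFILTERED constants removed, unfiltered display names are composed on demand from the shared suffix, e.g.:

```python
UNFILTERED_SUF = ' (unfiltered)'
FP = 'False positives'

print(FP + UNFILTERED_SUF)  # 'False positives (unfiltered)'
```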
