Allow metrics to return tuple of scores #334

Merged
merged 14 commits on Nov 22, 2021
Fixes as per comment
khumairraj committed Nov 21, 2021

commit b08a552713983e602264e5609030bafe4a964445
20 changes: 9 additions & 11 deletions heareval/predictions/task_predictions.py
@@ -263,25 +263,23 @@ def log_scores(self, name: str, score_args):
validate_score_return_type(score_ret)
# If the returned score is a tuple, store each subscore as separate entry
if isinstance(score_ret, tuple):
# The first score in the returned tuple will be used
# as primary score for this metric
# The first score in the returned tuple will be used for
# any optimisation criterion; for instance, if this is the
# primary metric, the score corresponding to the first entry
# in the tuple will be used for early stopping
end_scores[f"{name}_{score}"] = score_ret[0][1]
Contributor
Not sure if this is the right PR to change this, but I hate this JSON schema I defined.

It looks like this:

    "test": {
        "test_score": 0.5070093274116516,
        "test_loss": 11.01280689239502,
        "test_event_onset_200ms_fms": 0.5070093274116516,
        "test_segment_1s_er": 0.5021833777427673,
        "validation_score": 0.4511459767818451,
        "epoch": 470,
        "time_in_min": 64.55239222049713
    },

When I want to crawl over this JSON and find ALL the scores, I resort to the hacky trick of seeing if the string starts with "test" and isn't "test_score" or "test_loss". Kinda gross.

I would prefer ["test"]["test_scores"] = {"onset_fms": ..., ...} etc in addition to ["test"]["test_score"] and ["test"]["test_loss"].

Maybe we should make an issue for this?
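For illustration only, a minimal sketch of one possible layout along the lines this comment suggests; the key names inside "test_scores" are assumptions taken from the JSON above, not part of this PR:

    # Hypothetical layout (not part of this PR): keep the aggregate fields and
    # nest all per-metric scores under a dedicated "test_scores" dict.
    proposed = {
        "test": {
            "test_score": 0.5070093274116516,
            "test_loss": 11.01280689239502,
            "test_scores": {
                "event_onset_200ms_fms": 0.5070093274116516,
                "segment_1s_er": 0.5021833777427673,
            },
            "validation_score": 0.4511459767818451,
            "epoch": 470,
            "time_in_min": 64.55239222049713,
        }
    }

    # Crawling every score then becomes a plain dict lookup instead of prefix matching:
    all_scores = proposed["test"]["test_scores"]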

# All other scores will also be logged
for (subscore, value) in score_ret:
end_scores[f"{name}_{score}_{subscore}"] = value
elif isinstance(score_ret, (float, int)):
elif isinstance(score_ret, float):
end_scores[f"{name}_{score}"] = score_ret
else:
raise ValueError(
f"Return type {type(score_ret)} is unexpected. Return type of "
"the score function should either be a "
"tuple(tuple(str, (float, int))) or float or int. "
"tuple(tuple) or float."
)

# Weird, scores can be nan if precision has zero guesses
for score_name, value in end_scores.items():
end_scores[score_name] = 0.0 if math.isnan(value) else value

self.log(
f"{name}_score", end_scores[f"{name}_{str(self.scores[0])}"], logger=True
)
@@ -463,16 +461,16 @@ def _score_epoch_end(self, name: str, outputs: List[Dict[str, List[Any]]]):
self.target_events[name],
)
# If the score returns a tuple of scores, the first score
# is used as the primary score
# is used
if isinstance(primary_score_ret, tuple):
primary_score = primary_score_ret[0][1]
elif isinstance(primary_score_ret, (float, int)):
elif isinstance(primary_score_ret, float):
primary_score = primary_score_ret
else:
raise ValueError(
f"Return type {type(primary_score_ret)} is unexpected. "
"Return type of the score function should either be a "
"tuple(tuple(str, (float, int))) or float or int. "
"tuple(tuple) or float. "
)
if np.isnan(primary_score):
primary_score = 0.0
47 changes: 34 additions & 13 deletions heareval/score.py
@@ -54,25 +54,37 @@ def label_to_binary_vector(label: List, num_labels: int) -> torch.Tensor:
return binary_labels


def validate_score_return_type(
ret: Union[float, int, Tuple[Tuple[str, Union[int, float]]]]
):
"""Validates if the return type of the score is valid"""
def validate_score_return_type(ret: Union[Tuple[Tuple], float]):
"""
Valid return types for the metric are
- tuple(tuple(string: name of the subtype, float: the value)): This is the
case with sed_eval metrics. They can return (("f_measure", value),
("precision", value), ...), depending on which scores
the metric is supposed to return. This is set as the `scores`
attribute on the metric.
- float: Standard metric behaviour

The downstream prediction pipeline is able to handle these two types.
In case of the tuple return type, the value of the first entry in the
tuple will be used as an optimisation criterion wherever required.
For instance, if the return is (("f_measure", value), ("precision", value)),
the value corresponding to f_measure will be used (for instance, in
early stopping if this metric is the primary score for the task).
"""
if isinstance(ret, tuple):
assert all(
type(s) == tuple and type(s[0]) == str and type(s[1]) in [float, int]
for s in ret
type(s) == tuple and type(s[0]) == str and type(s[1]) == float for s in ret
), (
"If the return type of the score is a tuple, all the elements "
"in the tuple should be tuple of type tuple(str, (float, int))"
"in the tuple should be tuple of type (string, float)"
)
elif isinstance(ret, (float, int)):
elif isinstance(ret, float):
pass
else:
raise ValueError(
f"Return type {type(ret)} is unexpected. Return type of "
"the score function should either be a "
"tuple(tuple(str, (float, int))) or float or int. "
"tuple(tuple) or float. "
)
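A minimal usage sketch of the validator after this change, assuming the import path matches the file shown above; the values are illustrative:

    from heareval.score import validate_score_return_type

    # Both accepted shapes pass silently
    validate_score_return_type((("f_measure", 0.51), ("precision", 0.48)))
    validate_score_return_type(0.73)

    # Anything else, including a bare int after this change, raises ValueError
    try:
        validate_score_return_type(1)
    except ValueError as err:
        print(err)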


@@ -99,12 +111,18 @@ def __init__(
self.name = name
self.maximize = maximize

def __call__(self, *args, **kwargs) -> float:
def __call__(self, *args, **kwargs) -> Union[Tuple[Tuple], float]:
"""
Calls the compute function of the metric, and after validating the output,
returns the metric score
"""
ret = self.compute(*args, **kwargs)
validate_score_return_type(ret)
return ret

def compute(self, predictions: Any, targets: Any, **kwargs) -> float:
def compute(
self, predictions: Any, targets: Any, **kwargs
) -> Union[Tuple[Tuple], float]:
"""
Compute the score based on the predictions and targets. Returns the score.
"""
@@ -188,7 +206,9 @@ def __init__(
self.params = params
assert self.score_class is not None

def compute(self, predictions: Dict, targets: Dict, **kwargs):
def compute(
self, predictions: Dict, targets: Dict, **kwargs
) -> Tuple[Tuple[str, float]]:
# Containers of events for sed_eval
reference_event_list = self.sed_eval_event_container(targets)
estimated_event_list = self.sed_eval_event_container(predictions)
@@ -215,7 +235,8 @@ def compute(self, predictions: Dict, targets: Dict, **kwargs):
overall_scores: Dict[str, Union[float, int]] = dict(
ChainMap(*nested_overall_scores.values())
)
# Return the required scores as tuples
# Return the required scores as tuples. The scores are returned in the
# order they are passed in the `scores` argument
return tuple([(score, overall_scores[score]) for score in self.scores])

@staticmethod
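For concreteness, a hedged sketch of what the compute method above returns, assuming the metric was configured with scores=("f_measure", "error_rate") and using illustrative values:

    # Hypothetical: with self.scores = ("f_measure", "error_rate") and
    # overall_scores = {"f_measure": 0.507, "error_rate": 0.502, "precision": 0.481},
    # the return statement in compute evaluates to
    #     (("f_measure", 0.507), ("error_rate", 0.502))
    # i.e. only the requested scores, as (name, value) pairs, in the order
    # they were passed in `scores`.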