Allow metrics to return tuple of scores #334

Merged
merged 14 commits on Nov 22, 2021
Fixes as per comment
khumairraj committed Nov 21, 2021

commit b08a552713983e602264e5609030bafe4a964445
20 changes: 9 additions & 11 deletions heareval/predictions/task_predictions.py
@@ -263,25 +263,23 @@ def log_scores(self, name: str, score_args):
validate_score_return_type(score_ret)
# If the returned score is a tuple, store each subscore as separate entry
if isinstance(score_ret, tuple):
# The first score in the returned tuple will be used
# as primary score for this metric
# The first score in the returned tuple will be used for
# any optimisation criterion; for instance, if this is the
# primary metric, the score corresponding to the first entry
# in the tuple will be used for early stopping
end_scores[f"{name}_{score}"] = score_ret[0][1]
Contributor
Not sure if this is the right PR to change this, but I hate this JSON schema I defined.

It looks like this:

    "test": {
        "test_score": 0.5070093274116516,
        "test_loss": 11.01280689239502,
        "test_event_onset_200ms_fms": 0.5070093274116516,
        "test_segment_1s_er": 0.5021833777427673,
        "validation_score": 0.4511459767818451,
        "epoch": 470,
        "time_in_min": 64.55239222049713
    },

When I want to crawl over this JSON and find ALL the scores, I resort to the hacky trick of seeing if the string starts with "test" and isn't "test_score" or "test_loss". Kinda gross.

I would prefer ["test"]["test_scores"] = {"onset_fms": ..., ...} etc in addition to ["test"]["test_score"] and ["test"]["test_loss"].

Maybe we should make an issue for this?
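For illustration only, a minimal sketch of one possible layout along the lines this comment suggests; the key names inside "test_scores" are assumptions taken from the JSON above, not part of this PR:

    # Hypothetical layout (not part of this PR): keep the aggregate fields and
    # nest all per-metric scores under a dedicated "test_scores" dict.
    proposed = {
        "test": {
            "test_score": 0.5070093274116516,
            "test_loss": 11.01280689239502,
            "test_scores": {
                "event_onset_200ms_fms": 0.5070093274116516,
                "segment_1s_er": 0.5021833777427673,
            },
            "validation_score": 0.4511459767818451,
            "epoch": 470,
            "time_in_min": 64.55239222049713,
        }
    }

    # Crawling every score then becomes a plain dict lookup instead of prefix matching:
    all_scores = proposed["test"]["test_scores"]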

# All other scores will also be logged
for (subscore, value) in score_ret:
end_scores[f"{name}_{score}_{subscore}"] = value
elif isinstance(score_ret, (float, int)):
elif isinstance(score_ret, float):
end_scores[f"{name}_{score}"] = score_ret
else:
raise ValueError(
f"Return type {type(score_ret)} is unexpected. Return type of "
"the score function should either be a "
"tuple(tuple(str, (float, int))) or float or int. "
"tuple(tuple) or float."
)

# Weird, scores can be nan if precision has zero guesses
for score_name, value in end_scores.items():
end_scores[score_name] = 0.0 if math.isnan(value) else value

self.log(
f"{name}_score", end_scores[f"{name}_{str(self.scores[0])}"], logger=True
)
@@ -463,16 +461,16 @@ def _score_epoch_end(self, name: str, outputs: List[Dict[str, List[Any]]]):
self.target_events[name],
)
# If the score returns a tuple of scores, the first score
# is used as the primary score
# is used
if isinstance(primary_score_ret, tuple):
primary_score = primary_score_ret[0][1]
elif isinstance(primary_score_ret, (float, int)):
elif isinstance(primary_score_ret, float):
primary_score = primary_score_ret
else:
raise ValueError(
f"Return type {type(primary_score_ret)} is unexpected. "
"Return type of the score function should either be a "
"tuple(tuple(str, (float, int))) or float or int. "
"tuple(tuple) or float. "
)
if np.isnan(primary_score):
primary_score = 0.0
47 changes: 34 additions & 13 deletions heareval/score.py
@@ -54,25 +54,37 @@ def label_to_binary_vector(label: List, num_labels: int) -> torch.Tensor:
return binary_labels


def validate_score_return_type(
ret: Union[float, int, Tuple[Tuple[str, Union[int, float]]]]
):
"""Validates if the return type of the score is valid"""
def validate_score_return_type(ret: Union[Tuple[Tuple], float]):
"""
Valid return types for the metric are
- tuple(tuple(string: name of the subtype, float: the value)): This is the
case with sed_eval metrics. They can return (("f_measure", value),
("precision", value), ...), depending on which scores
the metric is supposed to return. This is set as the `scores`
attribute on the metric.
- float: Standard metric behaviour

The downstream prediction pipeline is able to handle these two types.
In case of the tuple return type, the value of the first entry in the
tuple will be used as an optimisation criterion wherever required.
For instance, if the return is (("f_measure", value), ("precision", value)),
the value corresponding to f_measure will be used (for instance, in
early stopping if this metric is the primary score for the task).
"""
if isinstance(ret, tuple):
assert all(
type(s) == tuple and type(s[0]) == str and type(s[1]) in [float, int]
for s in ret
type(s) == tuple and type(s[0]) == str and type(s[1]) == float for s in ret
), (
"If the return type of the score is a tuple, all the elements "
"in the tuple should be tuple of type tuple(str, (float, int))"
"in the tuple should be tuple of type (string, float)"
)
elif isinstance(ret, (float, int)):
elif isinstance(ret, float):
pass
else:
raise ValueError(
f"Return type {type(ret)} is unexpected. Return type of "
"the score function should either be a "
"tuple(tuple(str, (float, int))) or float or int. "
"tuple(tuple) or float. "
)
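A minimal usage sketch of the validator after this change, assuming the import path matches the file shown above; the values are illustrative:

    from heareval.score import validate_score_return_type

    # Both accepted shapes pass silently
    validate_score_return_type((("f_measure", 0.51), ("precision", 0.48)))
    validate_score_return_type(0.73)

    # Anything else, including a bare int after this change, raises ValueError
    try:
        validate_score_return_type(1)
    except ValueError as err:
        print(err)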


@@ -99,12 +111,18 @@ def __init__(
self.name = name
self.maximize = maximize

def __call__(self, *args, **kwargs) -> float:
def __call__(self, *args, **kwargs) -> Union[Tuple[Tuple], float]:
"""
Calls the compute function of the metric, and after validating the output,
returns the metric score
"""
ret = self.compute(*args, **kwargs)
validate_score_return_type(ret)
return ret

def compute(self, predictions: Any, targets: Any, **kwargs) -> float:
def compute(
self, predictions: Any, targets: Any, **kwargs
) -> Union[Tuple[Tuple], float]:
"""
Compute the score based on the predictions and targets. Returns the score.
"""
@@ -188,7 +206,9 @@ def __init__(
self.params = params
assert self.score_class is not None

def compute(self, predictions: Dict, targets: Dict, **kwargs):
def compute(
self, predictions: Dict, targets: Dict, **kwargs
) -> Tuple[Tuple[str, float]]:
# Containers of events for sed_eval
reference_event_list = self.sed_eval_event_container(targets)
estimated_event_list = self.sed_eval_event_container(predictions)
@@ -215,7 +235,8 @@ def compute(self, predictions: Dict, targets: Dict, **kwargs):
overall_scores: Dict[str, Union[float, int]] = dict(
ChainMap(*nested_overall_scores.values())
)
# Return the required scores as tuples
# Return the required scores as tuples. The scores are returned in the
# order they are passed in the `scores` argument
return tuple([(score, overall_scores[score]) for score in self.scores])

@staticmethod
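For concreteness, a hedged sketch of what the compute method above returns, assuming the metric was configured with scores=("f_measure", "error_rate") and using illustrative values:

    # Hypothetical: with self.scores = ("f_measure", "error_rate") and
    # overall_scores = {"f_measure": 0.507, "error_rate": 0.502, "precision": 0.481},
    # the return statement in compute evaluates to
    #     (("f_measure", 0.507), ("error_rate", 0.502))
    # i.e. only the requested scores, as (name, value) pairs, in the order
    # they were passed in `scores`.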