From 28a754e2246e780674af24e79d6ada2bf1ed78d4 Mon Sep 17 00:00:00 2001 From: Humair Raj Khan Date: Sun, 21 Nov 2021 14:46:33 +0000 Subject: [PATCH 01/13] Modify score to return tuple for sedeval --- heareval/score.py | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/heareval/score.py b/heareval/score.py index 39010d9b..558ea980 100644 --- a/heareval/score.py +++ b/heareval/score.py @@ -2,7 +2,8 @@ Common utils for scoring. """ from functools import partial -from typing import Any, Callable, Dict, List, Optional +from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from collections import ChainMap import numpy as np import pandas as pd @@ -142,20 +143,21 @@ class SoundEventScore(ScoreFunction): def __init__( self, label_to_idx: Dict[str, int], - score: str, + scores: Tuple[str], params: Dict = None, name: Optional[str] = None, maximize: bool = True, ): """ - :param score: Score to use, from the list of overall SED eval scores. + :param scores: Scores to use, from the list of overall SED eval scores. + The first score in the tuple will be the primary score for this metric :param params: Parameters to pass to the scoring function, see inheriting children for details. """ if params is None: params = {} super().__init__(label_to_idx=label_to_idx, name=name, maximize=maximize) - self.score = score + self.scores = scores self.params = params assert self.score_class is not None @@ -176,10 +178,18 @@ def __call__(self, predictions: Dict, targets: Dict, **kwargs): estimated_event_list=estimated_event_list.filter(filename=filename), ) - # This (and segment_based_scores) return a pretty large selection of scores. - overall_scores = scores.results_overall_metrics() - # Keep the specific score we want - return overall_scores[self.score][self.score] + # results_overall_metrics return a pretty large nested selection of scores, + # with dicts of scores keyed on the type of scores, like f_measure, error_rate, + # accuracy + nested_overall_scores: Dict[ + str, Dict[str, float] + ] = scores.results_overall_metrics() + # Open up nested overall scores + overall_scores: Dict[str, Union[float, int]] = dict( + ChainMap(*nested_overall_scores.values()) + ) + # Return the required scores as tuples + return tuple([(score, overall_scores[score]) for score in self.scores]) @staticmethod def sed_eval_event_container( From 1bc9686488f69289560d48ba65ab22e281ae7069 Mon Sep 17 00:00:00 2001 From: Humair Raj Khan Date: Sun, 21 Nov 2021 14:47:22 +0000 Subject: [PATCH 02/13] Modify the actual score functions --- heareval/score.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/heareval/score.py b/heareval/score.py index 558ea980..50ef253a 100644 --- a/heareval/score.py +++ b/heareval/score.py @@ -319,19 +319,20 @@ def __call__(self, predictions: np.ndarray, targets: np.ndarray, **kwargs) -> fl "event_onset_200ms_fms": partial( EventBasedScore, name="event_onset_200ms_fms", - score="f_measure", + # If first score will be used as the primary score for this metric + scores=("f_measure", "precision", "recall"), params={"evaluate_onset": True, "evaluate_offset": False, "t_collar": 0.2}, ), "event_onset_50ms_fms": partial( EventBasedScore, name="event_onset_50ms_fms", - score="f_measure", + scores=("f_measure", "precision", "recall"), params={"evaluate_onset": True, "evaluate_offset": False, "t_collar": 0.05}, ), "event_onset_offset_50ms_20perc_fms": partial( EventBasedScore, name="event_onset_offset_50ms_20perc_fms", - score="f_measure", + 
scores=("f_measure", "precision", "recall"), params={ "evaluate_onset": True, "evaluate_offset": True, @@ -342,7 +343,7 @@ def __call__(self, predictions: np.ndarray, targets: np.ndarray, **kwargs) -> fl "segment_1s_er": partial( SegmentBasedScore, name="segment_1s_er", - score="error_rate", + scores=("error_rate"), params={"time_resolution": 1.0}, maximize=False, ), From 98e52dc251378b4b79e83933aa5dd78a4a570630 Mon Sep 17 00:00:00 2001 From: Humair Raj Khan Date: Sun, 21 Nov 2021 14:55:22 +0000 Subject: [PATCH 03/13] Change the predictions to handle tuple returns from metric --- heareval/predictions/task_predictions.py | 84 +++++++++++++++++------- 1 file changed, 62 insertions(+), 22 deletions(-) diff --git a/heareval/predictions/task_predictions.py b/heareval/predictions/task_predictions.py index fb37eae0..49cfd755 100755 --- a/heareval/predictions/task_predictions.py +++ b/heareval/predictions/task_predictions.py @@ -253,6 +253,48 @@ def validation_step(self, batch, batch_idx): def test_step(self, batch, batch_idx): return self._step(batch, batch_idx) + def log_scores(self, name: str, score_args: Tuple[Any]): + """Logs the metric score value for each score defined for the model""" + assert hasattr(self, "scores"), "Scores for the model should be defined" + end_scores: Dict[str, Union[int, float]] = {} + for score in self.scores: + score_ret = score(*score_args) + # If the returned score is a tuple, store each subscore as separate entry + if isinstance(score_ret, tuple): + assert all( + type(s) == tuple + and type(s[0]) == str + and type(s[1]) in [float, int] + for s in score_ret + ), ( + "If the return type of the score is a tuple, all the elements " + "in the tuple should be tuple of type tuple(str, (float, int))" + ) + # The first score in the returned tuple will be used + # as primary score for this metric + end_scores[f"{name}_{score}"] = score_ret[0][1] + # All other scores will also be logged + for (subscore, value) in score_ret: + end_scores[f"{name}_{score}_{subscore}"] = value + elif isinstance(score_ret, (float, int)): + end_scores[f"{name}_{score}"] = score_ret + else: + raise ValueError( + f"Return type {type(score_ret)} is unexpected. Return type of " + "the score function should either be a " + "tuple(tuple(str, (float, int))) or float or int. " + ) + + # Weird, scores can be nan if precision has zero guesses + for score_name in end_scores: + end_scores[score_name] = 0.0 if math.isnan(value) else value + + self.log( + f"{name}_score", end_scores[f"{name}_{str(self.scores[0])}"], logger=True + ) + for score_name in end_scores: + self.log(score_name, end_scores[score_name], prog_bar=True, logger=True) + # Implement this for each inheriting class # TODO: Can we combine the boilerplate for both of these? 
def _score_epoch_end(self, name: str, outputs: List[Dict[str, List[Any]]]): @@ -335,15 +377,13 @@ def _score_epoch_end(self, name: str, outputs: List[Dict[str, List[Any]]]): # Cache all predictions for later serialization self.test_predicted_labels = prediction - for score in self.scores: - end_scores[f"{name}_{score}"] = score( - prediction.detach().cpu().numpy(), target.detach().cpu().numpy() - ) - self.log( - f"{name}_score", end_scores[f"{name}_{str(self.scores[0])}"], logger=True + self.log_scores( + name, + score_args=( + prediction.detach().cpu().numpy(), + target.detach().cpu().numpy(), + ), ) - for score_name in end_scores: - self.log(score_name, end_scores[score_name], prog_bar=True, logger=True) class EventPredictionModel(AbstractPredictionModel): @@ -424,11 +464,23 @@ def _score_epoch_end(self, name: str, outputs: List[Dict[str, List[Any]]]): for postprocessing in tqdm(predicted_events_by_postprocessing): predicted_events = predicted_events_by_postprocessing[postprocessing] primary_score_fn = self.scores[0] - primary_score = primary_score_fn( + primary_score_ret = primary_score_fn( # predicted_events, self.target_events[name] predicted_events, self.target_events[name], ) + # If the score returns a tuple of scores, the first score + # is used as the primary score + if isinstance(primary_score_ret, tuple): + primary_score = primary_score_ret[0][1] + elif isinstance(primary_score_ret, (float, int)): + primary_score = primary_score_ret + else: + raise ValueError( + f"Return type {type(primary_score_ret)} is unexpected. " + "Return type of the score function should either be a " + "tuple(tuple(str, (float, int))) or float or int. " + ) if np.isnan(primary_score): primary_score = 0.0 score_and_postprocessing.append((primary_score, postprocessing)) @@ -454,19 +506,7 @@ def _score_epoch_end(self, name: str, outputs: List[Dict[str, List[Any]]]): self.test_predicted_labels = prediction self.test_predicted_events = predicted_events - for score in self.scores: - end_scores[f"{name}_{score}"] = score( - predicted_events, self.target_events[name] - ) - # Weird, this can happen if precision has zero guesses - if math.isnan(end_scores[f"{name}_{score}"]): - end_scores[f"{name}_{score}"] = 0.0 - self.log( - f"{name}_score", end_scores[f"{name}_{str(self.scores[0])}"], logger=True - ) - - for score_name in end_scores: - self.log(score_name, end_scores[score_name], prog_bar=True, logger=True) + self.log_scores(name, score_args=(predicted_events, self.target_events[name])) class SplitMemmapDataset(Dataset): From 0a05eb1ab800d5bd2c95e8467b0b79f5f3437ab1 Mon Sep 17 00:00:00 2001 From: Humair Raj Khan Date: Sun, 21 Nov 2021 15:18:49 +0000 Subject: [PATCH 04/13] Make validate function --- heareval/predictions/task_predictions.py | 11 ++----- heareval/score.py | 41 ++++++++++++++++++++---- 2 files changed, 36 insertions(+), 16 deletions(-) diff --git a/heareval/predictions/task_predictions.py b/heareval/predictions/task_predictions.py index 49cfd755..2e22dd8e 100755 --- a/heareval/predictions/task_predictions.py +++ b/heareval/predictions/task_predictions.py @@ -46,6 +46,7 @@ available_scores, label_to_binary_vector, label_vocab_as_dict, + validate_score_return_type, ) TASK_SPECIFIC_PARAM_GRID = { @@ -259,17 +260,9 @@ def log_scores(self, name: str, score_args: Tuple[Any]): end_scores: Dict[str, Union[int, float]] = {} for score in self.scores: score_ret = score(*score_args) + validate_score_return_type(score_ret) # If the returned score is a tuple, store each subscore as separate entry if 
isinstance(score_ret, tuple): - assert all( - type(s) == tuple - and type(s[0]) == str - and type(s[1]) in [float, int] - for s in score_ret - ), ( - "If the return type of the score is a tuple, all the elements " - "in the tuple should be tuple of type tuple(str, (float, int))" - ) # The first score in the returned tuple will be used # as primary score for this metric end_scores[f"{name}_{score}"] = score_ret[0][1] diff --git a/heareval/score.py b/heareval/score.py index 50ef253a..003e6d1b 100644 --- a/heareval/score.py +++ b/heareval/score.py @@ -54,6 +54,28 @@ def label_to_binary_vector(label: List, num_labels: int) -> torch.Tensor: return binary_labels +def validate_score_return_type( + ret: Union[float, int, Tuple[Tuple[str, Union[int, float]]]] +): + """Validates if the return type of the score is valid""" + if isinstance(ret, tuple): + assert all( + type(s) == tuple and type(s[0]) == str and type(s[1]) in [float, int] + for s in ret + ), ( + "If the return type of the score is a tuple, all the elements " + "in the tuple should be tuple of type tuple(str, (float, int))" + ) + elif isinstance(ret, (float, int)): + pass + else: + raise ValueError( + f"Return type {type(ret)} is unexpected. Return type of " + "the score function should either be a " + "tuple(tuple(str, (float, int))) or float or int. " + ) + + class ScoreFunction: """ A simple abstract base class for score functions @@ -77,7 +99,12 @@ def __init__( self.name = name self.maximize = maximize - def __call__(self, predictions: Any, targets: Any, **kwargs) -> float: + def __call__(self, *args, **kwargs) -> float: + ret = self.compute(*args, **kwargs) + validate_score_return_type(ret) + return ret + + def compute(self, predictions: Any, targets: Any, **kwargs) -> float: """ Compute the score based on the predictions and targets. Returns the score. 
""" @@ -90,7 +117,7 @@ def __str__(self): class Top1Accuracy(ScoreFunction): name = "top1_acc" - def __call__(self, predictions: np.ndarray, targets: np.ndarray, **kwargs) -> float: + def compute(self, predictions: np.ndarray, targets: np.ndarray, **kwargs) -> float: assert predictions.ndim == 2 assert targets.ndim == 2 # One hot # Compute the number of correct predictions @@ -115,7 +142,7 @@ class ChromaAccuracy(ScoreFunction): name = "chroma_acc" - def __call__(self, predictions: np.ndarray, targets: np.ndarray, **kwargs) -> float: + def compute(self, predictions: np.ndarray, targets: np.ndarray, **kwargs) -> float: # Compute the number of correct predictions correct = 0 for target, prediction in zip(targets, predictions): @@ -161,7 +188,7 @@ def __init__( self.params = params assert self.score_class is not None - def __call__(self, predictions: Dict, targets: Dict, **kwargs): + def compute(self, predictions: Dict, targets: Dict, **kwargs): # Containers of events for sed_eval reference_event_list = self.sed_eval_event_container(targets) estimated_event_list = self.sed_eval_event_container(predictions) @@ -243,7 +270,7 @@ class MeanAveragePrecision(ScoreFunction): name = "mAP" - def __call__(self, predictions: np.ndarray, targets: np.ndarray, **kwargs) -> float: + def compute(self, predictions: np.ndarray, targets: np.ndarray, **kwargs) -> float: assert predictions.ndim == 2 assert targets.ndim == 2 # One hot @@ -272,7 +299,7 @@ class DPrime(ScoreFunction): name = "d_prime" - def __call__(self, predictions: np.ndarray, targets: np.ndarray, **kwargs) -> float: + def compute(self, predictions: np.ndarray, targets: np.ndarray, **kwargs) -> float: assert predictions.ndim == 2 assert targets.ndim == 2 # One hot # ROC-AUC Requires more than one example for each class @@ -297,7 +324,7 @@ class AUCROC(ScoreFunction): name = "aucroc" - def __call__(self, predictions: np.ndarray, targets: np.ndarray, **kwargs) -> float: + def compute(self, predictions: np.ndarray, targets: np.ndarray, **kwargs) -> float: assert predictions.ndim == 2 assert targets.ndim == 2 # One hot # ROC-AUC Requires more than one example for each class From c34546f5f805bd15a207c2b9e4d18825608354cb Mon Sep 17 00:00:00 2001 From: Humair Raj Khan Date: Sun, 21 Nov 2021 15:23:55 +0000 Subject: [PATCH 05/13] Fix mypy --- heareval/predictions/task_predictions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/heareval/predictions/task_predictions.py b/heareval/predictions/task_predictions.py index 2e22dd8e..09c4687d 100755 --- a/heareval/predictions/task_predictions.py +++ b/heareval/predictions/task_predictions.py @@ -254,10 +254,10 @@ def validation_step(self, batch, batch_idx): def test_step(self, batch, batch_idx): return self._step(batch, batch_idx) - def log_scores(self, name: str, score_args: Tuple[Any]): + def log_scores(self, name: str, score_args): """Logs the metric score value for each score defined for the model""" assert hasattr(self, "scores"), "Scores for the model should be defined" - end_scores: Dict[str, Union[int, float]] = {} + end_scores = {} for score in self.scores: score_ret = score(*score_args) validate_score_return_type(score_ret) From d25bd50cedfe3c00f9f8b8c64e1ec8f1bd8e4b5c Mon Sep 17 00:00:00 2001 From: Humair Raj Khan Date: Sun, 21 Nov 2021 15:57:57 +0000 Subject: [PATCH 06/13] Minor fixes --- heareval/predictions/task_predictions.py | 2 +- heareval/score.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/heareval/predictions/task_predictions.py 
b/heareval/predictions/task_predictions.py index 09c4687d..45f6e3a2 100755 --- a/heareval/predictions/task_predictions.py +++ b/heareval/predictions/task_predictions.py @@ -279,7 +279,7 @@ def log_scores(self, name: str, score_args): ) # Weird, scores can be nan if precision has zero guesses - for score_name in end_scores: + for score_name, value in end_scores.items(): end_scores[score_name] = 0.0 if math.isnan(value) else value self.log( diff --git a/heareval/score.py b/heareval/score.py index 003e6d1b..eb98cbaf 100644 --- a/heareval/score.py +++ b/heareval/score.py @@ -370,7 +370,7 @@ def compute(self, predictions: np.ndarray, targets: np.ndarray, **kwargs) -> flo "segment_1s_er": partial( SegmentBasedScore, name="segment_1s_er", - scores=("error_rate"), + scores=("error_rate",), params={"time_resolution": 1.0}, maximize=False, ), From b08a552713983e602264e5609030bafe4a964445 Mon Sep 17 00:00:00 2001 From: Humair Raj Khan Date: Sun, 21 Nov 2021 18:17:22 +0000 Subject: [PATCH 07/13] Fixes as per comment --- heareval/predictions/task_predictions.py | 20 +++++----- heareval/score.py | 47 +++++++++++++++++------- 2 files changed, 43 insertions(+), 24 deletions(-) diff --git a/heareval/predictions/task_predictions.py b/heareval/predictions/task_predictions.py index 45f6e3a2..873c05ed 100755 --- a/heareval/predictions/task_predictions.py +++ b/heareval/predictions/task_predictions.py @@ -263,25 +263,23 @@ def log_scores(self, name: str, score_args): validate_score_return_type(score_ret) # If the returned score is a tuple, store each subscore as separate entry if isinstance(score_ret, tuple): - # The first score in the returned tuple will be used - # as primary score for this metric + # The first score in the returned tuple will be used for + # any optimisation criterion, for instance if the metric is + # primary metric, the score corresponding to the first tuple + # will be used for early stopping end_scores[f"{name}_{score}"] = score_ret[0][1] # All other scores will also be logged for (subscore, value) in score_ret: end_scores[f"{name}_{score}_{subscore}"] = value - elif isinstance(score_ret, (float, int)): + elif isinstance(score_ret, float): end_scores[f"{name}_{score}"] = score_ret else: raise ValueError( f"Return type {type(score_ret)} is unexpected. Return type of " "the score function should either be a " - "tuple(tuple(str, (float, int))) or float or int. " + "tuple(tuple) or float." ) - # Weird, scores can be nan if precision has zero guesses - for score_name, value in end_scores.items(): - end_scores[score_name] = 0.0 if math.isnan(value) else value - self.log( f"{name}_score", end_scores[f"{name}_{str(self.scores[0])}"], logger=True ) @@ -463,16 +461,16 @@ def _score_epoch_end(self, name: str, outputs: List[Dict[str, List[Any]]]): self.target_events[name], ) # If the score returns a tuple of scores, the first score - # is used as the primary score + # is used if isinstance(primary_score_ret, tuple): primary_score = primary_score_ret[0][1] - elif isinstance(primary_score_ret, (float, int)): + elif isinstance(primary_score_ret, float): primary_score = primary_score_ret else: raise ValueError( f"Return type {type(primary_score_ret)} is unexpected. " "Return type of the score function should either be a " - "tuple(tuple(str, (float, int))) or float or int. " + "tuple(tuple) or float. 
" ) if np.isnan(primary_score): primary_score = 0.0 diff --git a/heareval/score.py b/heareval/score.py index eb98cbaf..996a8105 100644 --- a/heareval/score.py +++ b/heareval/score.py @@ -54,25 +54,37 @@ def label_to_binary_vector(label: List, num_labels: int) -> torch.Tensor: return binary_labels -def validate_score_return_type( - ret: Union[float, int, Tuple[Tuple[str, Union[int, float]]]] -): - """Validates if the return type of the score is valid""" +def validate_score_return_type(ret: Union[Tuple[Tuple], float]): + """ + Valid return types for the metric are + - tuple(tuple(string: name of the subtype, float: the value)): This is the + case with sed eval metrics. They can return (("f_measure", value), + ("precision", value), ...), depending on the scores + the metric should is supposed to return. This is set as `scores` + attribute in the metric. + - float: Standard metric behaviour + + The downstream prediction pipeline is able to handle these two types. + In case of the tuple return type, the value of the first entry in the + tuple will be used as an optimisation criterion wherever required. + For instance, if the return is (("f_measure", value), ("precision", value)), + the value corresponding to the f_measure will be used ( for instance in + early stopping if this metric is the primary score for the task ) + """ if isinstance(ret, tuple): assert all( - type(s) == tuple and type(s[0]) == str and type(s[1]) in [float, int] - for s in ret + type(s) == tuple and type(s[0]) == str and type(s[1]) == float for s in ret ), ( "If the return type of the score is a tuple, all the elements " - "in the tuple should be tuple of type tuple(str, (float, int))" + "in the tuple should be tuple of type (string, float)" ) - elif isinstance(ret, (float, int)): + elif isinstance(ret, float): pass else: raise ValueError( f"Return type {type(ret)} is unexpected. Return type of " "the score function should either be a " - "tuple(tuple(str, (float, int))) or float or int. " + "tuple(tuple) or float. " ) @@ -99,12 +111,18 @@ def __init__( self.name = name self.maximize = maximize - def __call__(self, *args, **kwargs) -> float: + def __call__(self, *args, **kwargs) -> Union[Tuple[Tuple], float]: + """ + Calls the compute function of the metric, and after validating the output, + returns the metric score + """ ret = self.compute(*args, **kwargs) validate_score_return_type(ret) return ret - def compute(self, predictions: Any, targets: Any, **kwargs) -> float: + def compute( + self, predictions: Any, targets: Any, **kwargs + ) -> Union[Tuple[Tuple], float]: """ Compute the score based on the predictions and targets. Returns the score. """ @@ -188,7 +206,9 @@ def __init__( self.params = params assert self.score_class is not None - def compute(self, predictions: Dict, targets: Dict, **kwargs): + def compute( + self, predictions: Dict, targets: Dict, **kwargs + ) -> Tuple[Tuple[str, float]]: # Containers of events for sed_eval reference_event_list = self.sed_eval_event_container(targets) estimated_event_list = self.sed_eval_event_container(predictions) @@ -215,7 +235,8 @@ def compute(self, predictions: Dict, targets: Dict, **kwargs): overall_scores: Dict[str, Union[float, int]] = dict( ChainMap(*nested_overall_scores.values()) ) - # Return the required scores as tuples + # Return the required scores as tuples. 
The scores are returned in the + # order they are passed in the `scores` argument return tuple([(score, overall_scores[score]) for score in self.scores]) @staticmethod From 391ded8929762c5d6a3a6cfaa45fe28f6c305af0 Mon Sep 17 00:00:00 2001 From: Humair Raj Khan Date: Sun, 21 Nov 2021 18:20:50 +0000 Subject: [PATCH 08/13] flake --- heareval/predictions/task_predictions.py | 1 - 1 file changed, 1 deletion(-) diff --git a/heareval/predictions/task_predictions.py b/heareval/predictions/task_predictions.py index 873c05ed..743c457c 100755 --- a/heareval/predictions/task_predictions.py +++ b/heareval/predictions/task_predictions.py @@ -14,7 +14,6 @@ import copy import json -import math import multiprocessing import pickle import random From 613ee7f42c789ba9fb7076cd9a9b3b93d01a1655 Mon Sep 17 00:00:00 2001 From: Humair Raj Khan Date: Sun, 21 Nov 2021 18:26:48 +0000 Subject: [PATCH 09/13] convert compute to _compute --- heareval/score.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/heareval/score.py b/heareval/score.py index 996a8105..551ee7f4 100644 --- a/heareval/score.py +++ b/heareval/score.py @@ -116,15 +116,18 @@ def __call__(self, *args, **kwargs) -> Union[Tuple[Tuple], float]: Calls the compute function of the metric, and after validating the output, returns the metric score """ - ret = self.compute(*args, **kwargs) + ret = self._compute(*args, **kwargs) validate_score_return_type(ret) return ret - def compute( + def _compute( self, predictions: Any, targets: Any, **kwargs ) -> Union[Tuple[Tuple], float]: """ - Compute the score based on the predictions and targets. Returns the score. + Compute the score based on the predictions and targets. + This is a private function and the metric should be used as a functor + by calling the `__call__` method which calls this and also validates + the return type """ raise NotImplementedError("Inheriting classes must implement this function") @@ -135,7 +138,7 @@ def __str__(self): class Top1Accuracy(ScoreFunction): name = "top1_acc" - def compute(self, predictions: np.ndarray, targets: np.ndarray, **kwargs) -> float: + def _compute(self, predictions: np.ndarray, targets: np.ndarray, **kwargs) -> float: assert predictions.ndim == 2 assert targets.ndim == 2 # One hot # Compute the number of correct predictions @@ -160,7 +163,7 @@ class ChromaAccuracy(ScoreFunction): name = "chroma_acc" - def compute(self, predictions: np.ndarray, targets: np.ndarray, **kwargs) -> float: + def _compute(self, predictions: np.ndarray, targets: np.ndarray, **kwargs) -> float: # Compute the number of correct predictions correct = 0 for target, prediction in zip(targets, predictions): @@ -206,7 +209,7 @@ def __init__( self.params = params assert self.score_class is not None - def compute( + def _compute( self, predictions: Dict, targets: Dict, **kwargs ) -> Tuple[Tuple[str, float]]: # Containers of events for sed_eval @@ -291,7 +294,7 @@ class MeanAveragePrecision(ScoreFunction): name = "mAP" - def compute(self, predictions: np.ndarray, targets: np.ndarray, **kwargs) -> float: + def _compute(self, predictions: np.ndarray, targets: np.ndarray, **kwargs) -> float: assert predictions.ndim == 2 assert targets.ndim == 2 # One hot @@ -320,7 +323,7 @@ class DPrime(ScoreFunction): name = "d_prime" - def compute(self, predictions: np.ndarray, targets: np.ndarray, **kwargs) -> float: + def _compute(self, predictions: np.ndarray, targets: np.ndarray, **kwargs) -> float: assert predictions.ndim == 2 assert targets.ndim == 2 # One hot # ROC-AUC 
Requires more than one example for each class @@ -345,7 +348,7 @@ class AUCROC(ScoreFunction): name = "aucroc" - def compute(self, predictions: np.ndarray, targets: np.ndarray, **kwargs) -> float: + def _compute(self, predictions: np.ndarray, targets: np.ndarray, **kwargs) -> float: assert predictions.ndim == 2 assert targets.ndim == 2 # One hot # ROC-AUC Requires more than one example for each class From 15ba67d05b505427bfe58610b6ccbafbf3017811 Mon Sep 17 00:00:00 2001 From: Humair Raj Khan Date: Sun, 21 Nov 2021 18:33:25 +0000 Subject: [PATCH 10/13] Change return type --- heareval/score.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/heareval/score.py b/heareval/score.py index 551ee7f4..ba96931c 100644 --- a/heareval/score.py +++ b/heareval/score.py @@ -54,7 +54,7 @@ def label_to_binary_vector(label: List, num_labels: int) -> torch.Tensor: return binary_labels -def validate_score_return_type(ret: Union[Tuple[Tuple], float]): +def validate_score_return_type(ret: Union[Tuple[Tuple[str, float], ...], float]): """ Valid return types for the metric are - tuple(tuple(string: name of the subtype, float: the value)): This is the @@ -111,7 +111,7 @@ def __init__( self.name = name self.maximize = maximize - def __call__(self, *args, **kwargs) -> Union[Tuple[Tuple], float]: + def __call__(self, *args, **kwargs) -> Union[Tuple[Tuple[str, float], ...], float]: """ Calls the compute function of the metric, and after validating the output, returns the metric score @@ -122,7 +122,7 @@ def __call__(self, *args, **kwargs) -> Union[Tuple[Tuple], float]: def _compute( self, predictions: Any, targets: Any, **kwargs - ) -> Union[Tuple[Tuple], float]: + ) -> Union[Tuple[Tuple[str, float], ...], float]: """ Compute the score based on the predictions and targets. This is a private function and the metric should be used as a functor @@ -211,7 +211,7 @@ def __init__( def _compute( self, predictions: Dict, targets: Dict, **kwargs - ) -> Tuple[Tuple[str, float]]: + ) -> Tuple[Tuple[str, float], ...]: # Containers of events for sed_eval reference_event_list = self.sed_eval_event_container(targets) estimated_event_list = self.sed_eval_event_container(predictions) @@ -235,7 +235,7 @@ def _compute( str, Dict[str, float] ] = scores.results_overall_metrics() # Open up nested overall scores - overall_scores: Dict[str, Union[float, int]] = dict( + overall_scores: Dict[str, float] = dict( ChainMap(*nested_overall_scores.values()) ) # Return the required scores as tuples. 
The scores are returned in the From 67f05e8ee949608165915dcc9459d73180fe7027 Mon Sep 17 00:00:00 2001 From: Humair Raj Khan Date: Sun, 21 Nov 2021 19:43:02 +0000 Subject: [PATCH 11/13] Update comment --- heareval/predictions/task_predictions.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/heareval/predictions/task_predictions.py b/heareval/predictions/task_predictions.py index 743c457c..75fb2a63 100755 --- a/heareval/predictions/task_predictions.py +++ b/heareval/predictions/task_predictions.py @@ -262,10 +262,8 @@ def log_scores(self, name: str, score_args): validate_score_return_type(score_ret) # If the returned score is a tuple, store each subscore as separate entry if isinstance(score_ret, tuple): - # The first score in the returned tuple will be used for - # any optimisation criterion, for instance if the metric is - # primary metric, the score corresponding to the first tuple - # will be used for early stopping + # The first score in the returned tuple for the + # first `self.scores` is the optimization criterion end_scores[f"{name}_{score}"] = score_ret[0][1] # All other scores will also be logged for (subscore, value) in score_ret: From 45eb94bc976570954b34c46ddf15f52c8bbba67a Mon Sep 17 00:00:00 2001 From: Joseph Turian Date: Sun, 21 Nov 2021 20:32:17 +0000 Subject: [PATCH 12/13] Comment update --- heareval/predictions/task_predictions.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/heareval/predictions/task_predictions.py b/heareval/predictions/task_predictions.py index 75fb2a63..af2fd415 100755 --- a/heareval/predictions/task_predictions.py +++ b/heareval/predictions/task_predictions.py @@ -257,13 +257,12 @@ def log_scores(self, name: str, score_args): """Logs the metric score value for each score defined for the model""" assert hasattr(self, "scores"), "Scores for the model should be defined" end_scores = {} + # The first score in the first `self.scores` is the optimization criterion for score in self.scores: score_ret = score(*score_args) validate_score_return_type(score_ret) # If the returned score is a tuple, store each subscore as separate entry if isinstance(score_ret, tuple): - # The first score in the returned tuple for the - # first `self.scores` is the optimization criterion end_scores[f"{name}_{score}"] = score_ret[0][1] # All other scores will also be logged for (subscore, value) in score_ret: From c515a5fe6674637cb74d0416b4017796ea4bd6ec Mon Sep 17 00:00:00 2001 From: Joseph Turian Date: Mon, 22 Nov 2021 10:26:10 +0000 Subject: [PATCH 13/13] rm math --- heareval/predictions/task_predictions.py | 1 - 1 file changed, 1 deletion(-) diff --git a/heareval/predictions/task_predictions.py b/heareval/predictions/task_predictions.py index 3b6f9bb2..aba2749d 100755 --- a/heareval/predictions/task_predictions.py +++ b/heareval/predictions/task_predictions.py @@ -15,7 +15,6 @@ import copy import json import logging -import math import multiprocessing import pickle import random
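
The series above changes the ScoreFunction contract so that a metric may return either a plain float or a tuple of named sub-scores, with the first entry acting as the primary value. A minimal sketch of a metric written against the post-series API follows; the DummyEventScore class and its fixed numbers are illustrative only and do not exist in heareval, and the base-class constructor arguments are assumed from the diffs above.

from typing import Any, Tuple

from heareval.score import ScoreFunction, validate_score_return_type


class DummyEventScore(ScoreFunction):
    # Illustrative metric reporting several named sub-scores at once, in the
    # order they should be logged; the first one is treated downstream as the
    # primary (optimisation) value.
    name = "dummy_event_score"

    def _compute(
        self, predictions: Any, targets: Any, **kwargs
    ) -> Tuple[Tuple[str, float], ...]:
        # A real SED metric would pull these out of sed_eval's
        # results_overall_metrics(); constants keep the sketch short.
        return (("f_measure", 0.5), ("precision", 0.4), ("recall", 0.7))


# label_to_idx / name / maximize are the base-class kwargs seen in the diffs
# above; this construction is an assumption, not a documented heareval API.
score = DummyEventScore(label_to_idx={"dog": 0, "cat": 1}, name="dummy_event_score")
ret = score({}, {})               # __call__ runs _compute, then validates the return
validate_score_return_type(ret)   # tuple of (str, float) pairs: accepted
validate_score_return_type(0.25)  # a bare float is also accepted
print(ret[0])                     # ('f_measure', 0.5), the primary sub-score

Anything else is rejected by the validator, for example a list of pairs or a tuple whose values are ints, which is why patch 07 narrows the accepted leaf type from (float, int) to float.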
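
On the prediction side, log_scores (introduced in patch 03 and refined through the rest of the series) flattens whichever shape the metric returns into individual logged entries. The helper below mirrors that flattening logic outside of PyTorch Lightning so it can be read in isolation; flatten_scores and the sample values are illustrative names, not part of heareval.

from typing import Dict, Tuple, Union


def flatten_scores(
    split: str,
    metric_name: str,
    score_ret: Union[Tuple[Tuple[str, float], ...], float],
) -> Dict[str, float]:
    # Mirror of the branching in AbstractPredictionModel.log_scores: a tuple
    # return contributes one "<split>_<metric>" entry (the primary, first
    # sub-score) plus one "<split>_<metric>_<sub>" entry per sub-score; a
    # float return contributes a single entry.
    flat: Dict[str, float] = {}
    if isinstance(score_ret, tuple):
        flat[f"{split}_{metric_name}"] = score_ret[0][1]
        for subscore, value in score_ret:
            flat[f"{split}_{metric_name}_{subscore}"] = value
    elif isinstance(score_ret, float):
        flat[f"{split}_{metric_name}"] = score_ret
    else:
        raise ValueError(f"Unexpected return type {type(score_ret)}")
    return flat


print(flatten_scores("val", "event_onset_200ms_fms",
                     (("f_measure", 0.61), ("precision", 0.55), ("recall", 0.68))))
# {'val_event_onset_200ms_fms': 0.61, 'val_event_onset_200ms_fms_f_measure': 0.61,
#  'val_event_onset_200ms_fms_precision': 0.55, 'val_event_onset_200ms_fms_recall': 0.68}
print(flatten_scores("test", "top1_acc", 0.83))
# {'test_top1_acc': 0.83}

In the actual module these flat keys are passed to self.log, and the "<split>_score" entry logged alongside them is taken from the first metric in self.scores, which is why the primary sub-score has to come first in the returned tuple.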