Add prepare_data method to generate and save protocol data on disk #1500

Closed

5 changes: 4 additions & 1 deletion pyannote/audio/core/model.py
@@ -217,9 +217,12 @@ def __example_output(
             self.specifications, __example_output, example_output
         )
 
+    def prepare_data(self):
+        self.task.prepare_data()
+
     def setup(self, stage=None):
         if stage == "fit":
-            self.task.setup_metadata()
+            self.task.setup()
 
         # list of layers before adding task-dependent layers
         before = set((name, id(module)) for name, module in self.named_modules())
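
With this hook in place, Model follows the standard Lightning data-preparation flow: the trainer calls prepare_data() once (on a single process) so the task can generate and persist its protocol data, then setup() on every process to load it. A minimal sketch of a training run using the new hook — the protocol name and cache path below are placeholders, not values from this PR:

import pytorch_lightning as pl
from pyannote.database import get_protocol

from pyannote.audio.models.segmentation import PyanNet
from pyannote.audio.tasks import VoiceActivityDetection

# "MyDatabase.SpeakerDiarization.MyProtocol" is a placeholder; use any
# protocol registered with pyannote.database.
protocol = get_protocol("MyDatabase.SpeakerDiarization.MyProtocol")
task = VoiceActivityDetection(protocol, duration=2.0, cache_path="vad_cache")
model = PyanNet(task=task)

# trainer.fit() first calls model.prepare_data() once, which delegates to
# task.prepare_data() and writes protocol data to disk; model.setup("fit")
# then delegates to task.setup(), which can read it back from the cache.
trainer = pl.Trainer(max_epochs=1)
trainer.fit(model)
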
406 changes: 388 additions & 18 deletions pyannote/audio/core/task.py

Large diffs are not rendered by default.
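
GitHub does not render this diff, but this file carries the core of the PR: Task.prepare_data() gathers the protocol data and saves it to disk, and Task.setup() reads it back into self.prepared_data. A rough sketch of that pattern, inferred from the call sites in the files below — the pickle format, field names, and control flow are assumptions, not the actual implementation:

import pickle
from pathlib import Path
from typing import Optional

import numpy as np


class Task:
    def __init__(self, protocol, cache_path: Optional[str] = None):
        self.protocol = protocol
        self.cache_path = cache_path
        self.prepared_data = None

    def prepare_data(self):
        # reuse an existing cache rather than re-scanning the protocol
        if self.cache_path is not None and Path(self.cache_path).exists():
            return

        annotations, metadata = [], []
        for file_id, file in enumerate(self.protocol.train()):
            ...  # flatten annotations/metadata of each file into rows

        prepared_data = {
            "annotations": np.array(annotations),
            "metadata": np.array(metadata),
        }
        if self.cache_path is not None:
            with open(self.cache_path, "wb") as f:
                pickle.dump(prepared_data, f)
        self.prepared_data = prepared_data

    def setup(self):
        if self.prepared_data is None:
            # running in a fresh process: load what prepare_data() saved
            with open(self.cache_path, "rb") as f:
                self.prepared_data = pickle.load(f)
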

350 changes: 17 additions & 333 deletions pyannote/audio/tasks/segmentation/mixins.py

Large diffs are not rendered by default.
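
This diff is also unrendered: the SegmentationTaskMixin that used to live here is folded into a SegmentationTask base class, and per-attribute state (self.annotations, self.metadata, ...) moves into the single self.prepared_data dict. Judging from the call sites below, that dict looks roughly like this — dtypes and fields are illustrative guesses, not the actual schema:

import numpy as np

prepared_data = {
    # one row per annotated segment
    "annotations": np.array(
        [(0, 0.0, 3.2, 1)],
        dtype=[("file_id", "i4"), ("start", "f8"),
               ("end", "f8"), ("global_label_idx", "i4")],
    ),
    # one row per annotated region, stored as start + duration
    # (note the region_end change in speaker_diarization.py below)
    "annotated_regions": np.array(
        [(0, 0.0, 10.0)],
        dtype=[("file_id", "i4"), ("start", "f8"), ("duration", "f8")],
    ),
    # one row per file: subset, label scope, ...
    "metadata": np.array(
        [(0, 1)],
        dtype=[("subset", "i4"), ("scope", "i4")],
    ),
    "classes": ["speech", "music"],    # task-level class list
    "annotated_classes": {0: [0, 1]},  # file_id -> annotated class indices
}
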

20 changes: 12 additions & 8 deletions pyannote/audio/tasks/segmentation/multilabel.py
@@ -31,11 +31,11 @@
 from torch_audiomentations.core.transforms_interface import BaseWaveformTransform
 from torchmetrics import Metric
 
-from pyannote.audio.core.task import Problem, Resolution, Specifications, Task
-from pyannote.audio.tasks.segmentation.mixins import SegmentationTaskMixin
+from pyannote.audio.core.task import Problem, Resolution, Specifications
+from pyannote.audio.tasks.segmentation.mixins import SegmentationTask
 
 
-class MultiLabelSegmentation(SegmentationTaskMixin, Task):
+class MultiLabelSegmentation(SegmentationTask):
     """Generic multi-label segmentation
 
     Multi-label segmentation is the process of detecting temporal intervals
@@ -79,6 +79,8 @@ class MultiLabelSegmentation(SegmentationTaskMixin, Task):
     metric : optional
         Validation metric(s). Can be anything supported by torchmetrics.MetricCollection.
         Defaults to AUROC (area under the ROC curve).
+    cache_path : str, optional
+        path to the file where task caches are written to and loaded from
     """
 
     def __init__(
@@ -94,6 +96,7 @@ def __init__(
         pin_memory: bool = False,
         augmentation: BaseWaveformTransform = None,
         metric: Union[Metric, Sequence[Metric], Dict[str, Metric]] = None,
+        cache_path: Optional[str] = None,
     ):
         if not isinstance(protocol, SegmentationProtocol):
             raise ValueError(
@@ -109,6 +112,7 @@ def __init__(
             pin_memory=pin_memory,
             augmentation=augmentation,
             metric=metric,
+            cache_path=cache_path,
         )
 
         self.balance = balance
@@ -123,7 +127,7 @@ def setup(self):
         super().setup()
 
         self.specifications = Specifications(
-            classes=self.classes,
+            classes=self.prepared_data["classes"],
             problem=Problem.MULTI_LABEL_CLASSIFICATION,
             resolution=Resolution.FRAME,
             duration=self.duration,
@@ -169,7 +173,7 @@ def prepare_chunk(self, file_id: int, start_time: float, duration: float):
         sample = dict()
         sample["X"], _ = self.model.audio.crop(file, chunk, duration=duration)
         # gather all annotations of current file
-        annotations = self.annotations[self.annotations["file_id"] == file_id]
+        annotations = self.prepared_data["annotations"][self.prepared_data["annotations"]["file_id"] == file_id]
 
         # gather all annotations with non-empty intersection with current chunk
         chunk_annotations = annotations[
@@ -184,9 +188,9 @@ def prepare_chunk(self, file_id: int, start_time: float, duration: float):
 
         # frame-level targets (-1 for un-annotated classes)
         y = -np.ones(
-            (self.model.example_output.num_frames, len(self.classes)), dtype=np.int8
+            (self.model.example_output.num_frames, len(self.prepared_data["classes"])), dtype=np.int8
         )
-        y[:, self.annotated_classes[file_id]] = 0
+        y[:, self.prepared_data["annotated_classes"][file_id]] = 0
         for start, end, label in zip(
             start_idx, end_idx, chunk_annotations["global_label_idx"]
         ):
@@ -196,7 +200,7 @@ def prepare_chunk(self, file_id: int, start_time: float, duration: float):
             y, self.model.example_output.frames, labels=self.classes
         )
 
-        metadata = self.metadata[file_id]
+        metadata = self.prepared_data["metadata"][file_id]
         sample["meta"] = {key: metadata[key] for key in metadata.dtype.names}
         sample["meta"]["file"] = file_id
 
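
The constructor change is identical across all four task files in this PR: an optional cache_path that is simply forwarded to the base class. For example — the protocol name and cache path here are placeholders:

from pyannote.database import get_protocol

from pyannote.audio.tasks import MultiLabelSegmentation

protocol = get_protocol("MyDatabase.Segmentation.MyProtocol")

# First run: prepare_data() generates the protocol data and saves it to
# "mls_cache"; later runs load the cache instead of re-scanning the protocol.
task = MultiLabelSegmentation(
    protocol,
    duration=2.0,
    cache_path="mls_cache",
)
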
16 changes: 10 additions & 6 deletions pyannote/audio/tasks/segmentation/overlapped_speech_detection.py
@@ -21,19 +21,19 @@
 # SOFTWARE.
 
 
-from typing import Dict, Sequence, Text, Tuple, Union
+from typing import Dict, Optional, Sequence, Text, Tuple, Union
 
 import numpy as np
 from pyannote.core import Segment, SlidingWindowFeature
 from pyannote.database import Protocol
 from torch_audiomentations.core.transforms_interface import BaseWaveformTransform
 from torchmetrics import Metric
 
-from pyannote.audio.core.task import Problem, Resolution, Specifications, Task
-from pyannote.audio.tasks.segmentation.mixins import SegmentationTaskMixin
+from pyannote.audio.core.task import Problem, Resolution, Specifications
+from pyannote.audio.tasks.segmentation.mixins import SegmentationTask
 
 
-class OverlappedSpeechDetection(SegmentationTaskMixin, Task):
+class OverlappedSpeechDetection(SegmentationTask):
     """Overlapped speech detection
 
     Overlapped speech detection is the task of detecting regions where at least
@@ -88,6 +88,8 @@ class OverlappedSpeechDetection(SegmentationTaskMixin, Task):
     metric : optional
         Validation metric(s). Can be anything supported by torchmetrics.MetricCollection.
         Defaults to AUROC (area under the ROC curve).
+    cache_path : str, optional
+        path to the file where task caches are written to and loaded from
     """
 
     OVERLAP_DEFAULTS = {"probability": 0.5, "snr_min": 0.0, "snr_max": 10.0}
@@ -105,6 +107,7 @@ def __init__(
         pin_memory: bool = False,
         augmentation: BaseWaveformTransform = None,
         metric: Union[Metric, Sequence[Metric], Dict[str, Metric]] = None,
+        cache_path: Optional[str] = None,
     ):
         super().__init__(
             protocol,
@@ -115,6 +118,7 @@ def __init__(
             pin_memory=pin_memory,
             augmentation=augmentation,
             metric=metric,
+            cache_path=cache_path,
         )
 
         self.specifications = Specifications(
@@ -163,7 +167,7 @@ def prepare_chunk(self, file_id: int, start_time: float, duration: float):
         sample["X"], _ = self.model.audio.crop(file, chunk, duration=duration)
 
         # gather all annotations of current file
-        annotations = self.annotations[self.annotations["file_id"] == file_id]
+        annotations = self.prepared_data["annotations"][self.prepared_data["annotations"]["file_id"] == file_id]
 
         # gather all annotations with non-empty intersection with current chunk
         chunk_annotations = annotations[
@@ -186,7 +190,7 @@ def prepare_chunk(self, file_id: int, start_time: float, duration: float):
             y, self.model.example_output.frames, labels=["speech"]
         )
 
-        metadata = self.metadata[file_id]
+        metadata = self.prepared_data["metadata"][file_id]
         sample["meta"] = {key: metadata[key] for key in metadata.dtype.names}
         sample["meta"]["file"] = file_id
 
30 changes: 17 additions & 13 deletions pyannote/audio/tasks/segmentation/speaker_diarization.py
@@ -23,7 +23,7 @@
 import math
 import warnings
 from collections import Counter
-from typing import Dict, Literal, Sequence, Text, Tuple, Union
+from typing import Dict, Literal, Optional, Sequence, Text, Tuple, Union
 
 import numpy as np
 import torch
@@ -37,8 +37,8 @@
 from torch_audiomentations.core.transforms_interface import BaseWaveformTransform
 from torchmetrics import Metric
 
-from pyannote.audio.core.task import Problem, Resolution, Specifications, Task
-from pyannote.audio.tasks.segmentation.mixins import SegmentationTaskMixin
+from pyannote.audio.core.task import Problem, Resolution, Specifications
+from pyannote.audio.tasks.segmentation.mixins import SegmentationTask
 from pyannote.audio.torchmetrics import (
     DiarizationErrorRate,
     FalseAlarmRate,
@@ -58,7 +58,7 @@
 Scopes = list(Scope.__args__)
 
 
-class SpeakerDiarization(SegmentationTaskMixin, Task):
+class SpeakerDiarization(SegmentationTask):
     """Speaker diarization
 
     Parameters
@@ -110,6 +110,8 @@ class SpeakerDiarization(SegmentationTaskMixin, Task):
     metric : optional
         Validation metric(s). Can be anything supported by torchmetrics.MetricCollection.
         Defaults to AUROC (area under the ROC curve).
+    cache_path : str, optional
+        path to the file where task caches are written to and loaded from
 
     References
     ----------
@@ -140,6 +142,7 @@ def __init__(
         augmentation: BaseWaveformTransform = None,
         vad_loss: Literal["bce", "mse"] = None,
         metric: Union[Metric, Sequence[Metric], Dict[str, Metric]] = None,
+        cache_path: Optional[str] = None,
         max_num_speakers: int = None,  # deprecated in favor of `max_speakers_per_chunk`
         loss: Literal["bce", "mse"] = None,  # deprecated
     ):
@@ -152,6 +155,7 @@ def __init__(
             pin_memory=pin_memory,
             augmentation=augmentation,
             metric=metric,
+            cache_path=cache_path,
         )
 
         if not isinstance(protocol, SpeakerDiarizationProtocol):
@@ -191,23 +195,23 @@ def setup(self):
 
         # estimate maximum number of speakers per chunk when not provided
         if self.max_speakers_per_chunk is None:
-            training = self.metadata["subset"] == Subsets.index("train")
+            training = self.prepared_data["metadata"]["subset"] == Subsets.index("train")
 
             num_unique_speakers = []
             progress_description = f"Estimating maximum number of speakers per {self.duration:g}s chunk in the training set"
             for file_id in track(
                 np.where(training)[0], description=progress_description
             ):
-                annotations = self.annotations[
-                    np.where(self.annotations["file_id"] == file_id)[0]
+                annotations = self.prepared_data["annotations"][
+                    np.where(self.prepared_data["annotations"]["file_id"] == file_id)[0]
                 ]
-                annotated_regions = self.annotated_regions[
-                    np.where(self.annotated_regions["file_id"] == file_id)[0]
+                annotated_regions = self.prepared_data["annotated_regions"][
+                    np.where(self.prepared_data["annotated_regions"]["file_id"] == file_id)[0]
                 ]
                 for region in annotated_regions:
                     # find annotations within current region
                     region_start = region["start"]
-                    region_end = region["end"]
+                    region_end = region["start"] + region["duration"]
                     region_annotations = annotations[
                         np.where(
                             (annotations["start"] >= region_start)
@@ -318,7 +322,7 @@ def prepare_chunk(self, file_id: int, start_time: float, duration: float):
         file = self.get_file(file_id)
 
         # get label scope
-        label_scope = Scopes[self.metadata[file_id]["scope"]]
+        label_scope = Scopes[self.prepared_data["metadata"][file_id]["scope"]]
         label_scope_key = f"{label_scope}_label_idx"
 
         #
@@ -328,7 +332,7 @@ def prepare_chunk(self, file_id: int, start_time: float, duration: float):
         sample["X"], _ = self.model.audio.crop(file, chunk, duration=duration)
 
         # gather all annotations of current file
-        annotations = self.annotations[self.annotations["file_id"] == file_id]
+        annotations = self.prepared_data["annotations"][self.prepared_data["annotations"]["file_id"] == file_id]
 
         # gather all annotations with non-empty intersection with current chunk
         chunk_annotations = annotations[
@@ -364,7 +368,7 @@ def prepare_chunk(self, file_id: int, start_time: float, duration: float):
             y, self.model.example_output.frames, labels=labels
         )
 
-        metadata = self.metadata[file_id]
+        metadata = self.prepared_data["metadata"][file_id]
         sample["meta"] = {key: metadata[key] for key in metadata.dtype.names}
         sample["meta"]["file"] = file_id
 
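
Beyond the mechanical self.* to self.prepared_data[...] renames, note the one behavioral detail above: annotated regions are now stored with a duration field rather than an end field, so region ends are recomputed as start + duration. The recurring numpy filtering idiom, shown with an illustrative stand-in array (dtype assumed, not taken from the diff):

import numpy as np

# illustrative stand-in for prepared_data["annotated_regions"]
annotated_regions = np.array(
    [(0, 0.0, 10.0), (1, 2.5, 4.0)],
    dtype=[("file_id", "i4"), ("start", "f8"), ("duration", "f8")],
)

file_id = 0
# select the rows belonging to one file, as the task code does
regions = annotated_regions[np.where(annotated_regions["file_id"] == file_id)[0]]
for region in regions:
    region_start = region["start"]
    region_end = region["start"] + region["duration"]  # no "end" field anymore
    print(region_start, region_end)
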
16 changes: 10 additions & 6 deletions pyannote/audio/tasks/segmentation/voice_activity_detection.py
@@ -20,19 +20,19 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-from typing import Dict, Sequence, Text, Tuple, Union
+from typing import Dict, Optional, Sequence, Text, Tuple, Union
 
 import numpy as np
 from pyannote.core import Segment, SlidingWindowFeature
 from pyannote.database import Protocol
 from torch_audiomentations.core.transforms_interface import BaseWaveformTransform
 from torchmetrics import Metric
 
-from pyannote.audio.core.task import Problem, Resolution, Specifications, Task
-from pyannote.audio.tasks.segmentation.mixins import SegmentationTaskMixin
+from pyannote.audio.core.task import Problem, Resolution, Specifications
+from pyannote.audio.tasks.segmentation.mixins import SegmentationTask
 
 
-class VoiceActivityDetection(SegmentationTaskMixin, Task):
+class VoiceActivityDetection(SegmentationTask):
     """Voice activity detection
 
     Voice activity detection (or VAD) is the task of detecting speech regions
@@ -74,6 +74,8 @@ class VoiceActivityDetection(SegmentationTaskMixin, Task):
     metric : optional
         Validation metric(s). Can be anything supported by torchmetrics.MetricCollection.
         Defaults to AUROC (area under the ROC curve).
+    cache_path : str, optional
+        path to the file where task caches are written to and loaded from
     """
 
     def __init__(
@@ -88,6 +90,7 @@ def __init__(
         pin_memory: bool = False,
         augmentation: BaseWaveformTransform = None,
         metric: Union[Metric, Sequence[Metric], Dict[str, Metric]] = None,
+        cache_path: Optional[str] = None,
     ):
         super().__init__(
             protocol,
@@ -98,6 +101,7 @@ def __init__(
             pin_memory=pin_memory,
             augmentation=augmentation,
             metric=metric,
+            cache_path=cache_path,
         )
 
         self.balance = balance
@@ -145,7 +149,7 @@ def prepare_chunk(self, file_id: int, start_time: float, duration: float):
         sample["X"], _ = self.model.audio.crop(file, chunk, duration=duration)
 
         # gather all annotations of current file
-        annotations = self.annotations[self.annotations["file_id"] == file_id]
+        annotations = self.prepared_data["annotations"][self.prepared_data["annotations"]["file_id"] == file_id]
 
         # gather all annotations with non-empty intersection with current chunk
         chunk_annotations = annotations[
@@ -167,7 +171,7 @@ def prepare_chunk(self, file_id: int, start_time: float, duration: float):
             y, self.model.example_output.frames, labels=["speech"]
        )
 
-        metadata = self.metadata[file_id]
+        metadata = self.prepared_data["metadata"][file_id]
         sample["meta"] = {key: metadata[key] for key in metadata.dtype.names}
         sample["meta"]["file"] = file_id
 