diff --git a/pyannote/audio/pipelines/overlapped_speech_detection.py b/pyannote/audio/pipelines/overlapped_speech_detection.py
index 064cae1be..1c9790feb 100644
--- a/pyannote/audio/pipelines/overlapped_speech_detection.py
+++ b/pyannote/audio/pipelines/overlapped_speech_detection.py
@@ -134,9 +134,13 @@ def __init__(
         )[:, :, -2, np.newaxis]
         self._segmentation = Inference(model, **inference_kwargs)
 
-        #  hyper-parameters used for hysteresis thresholding
-        self.onset = Uniform(0.0, 1.0)
-        self.offset = Uniform(0.0, 1.0)
+        if model.specifications.powerset:
+            self.onset = self.offset = 0.5
+
+        else:
+            #  hyper-parameters used for hysteresis thresholding
+            self.onset = Uniform(0.0, 1.0)
+            self.offset = Uniform(0.0, 1.0)
 
         # hyper-parameters used for post-processing i.e. removing short overlapped regions
         # or filling short gaps between overlapped regions
@@ -152,14 +156,21 @@ def __init__(
         self.recall = recall
 
     def default_parameters(self):
-        # parameters optimized on DIHARD 3 development set
         if self.segmentation == "pyannote/segmentation":
+            # parameters optimized on DIHARD 3 development set
             return {
                 "onset": 0.430,
                 "offset": 0.320,
                 "min_duration_on": 0.091,
                 "min_duration_off": 0.144,
             }
+
+        elif self.segmentation == "pyannote/segmentation-3.0.0":
+            return {
+                "min_duration_on": 0.0,
+                "min_duration_off": 0.0,
+            }
+
         raise NotImplementedError()
 
     def classes(self):
diff --git a/pyannote/audio/pipelines/voice_activity_detection.py b/pyannote/audio/pipelines/voice_activity_detection.py
index 0edbea42f..f67489b64 100644
--- a/pyannote/audio/pipelines/voice_activity_detection.py
+++ b/pyannote/audio/pipelines/voice_activity_detection.py
@@ -121,14 +121,18 @@ def __init__(
         # load model and send it to GPU (when available and not already on GPU)
         model = get_model(segmentation, use_auth_token=use_auth_token)
 
+
         inference_kwargs["pre_aggregation_hook"] = lambda scores: np.max(
             scores, axis=-1, keepdims=True
         )
         self._segmentation = Inference(model, **inference_kwargs)
 
-        #  hyper-parameters used for hysteresis thresholding
-        self.onset = Uniform(0.0, 1.0)
-        self.offset = Uniform(0.0, 1.0)
+        if model.specifications.powerset:
+            self.onset = self.offset = 0.5
+        else:
+            #  hyper-parameters used for hysteresis thresholding
+            self.onset = Uniform(0.0, 1.0)
+            self.offset = Uniform(0.0, 1.0)
 
         # hyper-parameters used for post-processing i.e. removing short speech regions
         # or filling short gaps between speech regions
@@ -136,14 +140,21 @@ def __init__(
         self.min_duration_off = Uniform(0.0, 1.0)
 
     def default_parameters(self):
-        # parameters optimized on DIHARD 3 development set
         if self.segmentation == "pyannote/segmentation":
+            # parameters optimized for DIHARD 3 development set
             return {
                 "onset": 0.767,
                 "offset": 0.377,
                 "min_duration_on": 0.136,
                 "min_duration_off": 0.067,
             }
+
+        elif self.segmentation == "pyannote/segmentation-3.0.0":
+            return {
+                "min_duration_on": 0.0,
+                "min_duration_off": 0.0,
+            }
+
         raise NotImplementedError()
 
     def classes(self):
@@ -289,7 +300,6 @@ def __init__(
         self.learning_rate = LogUniform(1e-6, 1)
 
     def apply(self, file: AudioFile) -> Annotation:
-
         # create a copy of file
         file = dict(file)
 