From 629fdd207874f563426e119dc882d057e1341ef3 Mon Sep 17 00:00:00 2001 From: Dmitrii Mukhutdinov Date: Thu, 27 Apr 2023 07:27:29 +0000 Subject: [PATCH 1/8] Fix instantaneous speaker numbers overcounting in the diarization pipeline --- pyannote/audio/pipelines/clustering.py | 1 - .../audio/pipelines/speaker_diarization.py | 39 ++++++++++++------- pyannote/audio/pipelines/utils/diarization.py | 22 ++++------- 3 files changed, 33 insertions(+), 29 deletions(-) diff --git a/pyannote/audio/pipelines/clustering.py b/pyannote/audio/pipelines/clustering.py index c51cdcc50..b63ab214f 100644 --- a/pyannote/audio/pipelines/clustering.py +++ b/pyannote/audio/pipelines/clustering.py @@ -253,7 +253,6 @@ def __call__( hard_clusters = np.zeros((num_chunks, num_speakers), dtype=np.int8) soft_clusters = np.ones((num_chunks, num_speakers, 1)) centroids = np.mean(train_embeddings, axis=0, keepdims=True) - return hard_clusters, soft_clusters, centroids train_clusters = self.cluster( diff --git a/pyannote/audio/pipelines/speaker_diarization.py b/pyannote/audio/pipelines/speaker_diarization.py index d5cf04e05..12533294b 100644 --- a/pyannote/audio/pipelines/speaker_diarization.py +++ b/pyannote/audio/pipelines/speaker_diarization.py @@ -478,12 +478,20 @@ def apply( hook("segmentation", segmentations) # shape: (num_chunks, num_frames, local_num_speakers) + # binarize segmentation + if self._segmentation.model.specifications.powerset: + binarized_segmentations = segmentations + else: + binarized_segmentations: SlidingWindowFeature = binarize( + segmentations, + onset=self.segmentation.threshold, + initial_state=False, + ) + # estimate frame-level number of instantaneous speakers count = self.speaker_count( - segmentations, - onset=0.5 - if self._segmentation.model.specifications.powerset - else self.segmentation.threshold, + binarized_segmentations, + max_speakers, frames=self._frames, warm_up=(0.0, 0.0), ) @@ -499,16 +507,6 @@ def apply( return diarization - # binarize segmentation - if 
self._segmentation.model.specifications.powerset: - binarized_segmentations = segmentations - else: - binarized_segmentations: SlidingWindowFeature = binarize( - segmentations, - onset=self.segmentation.threshold, - initial_state=False, - ) - if self.klustering == "OracleClustering" and not return_embeddings: embeddings = None else: @@ -533,6 +531,19 @@ def apply( # hard_clusters: (num_chunks, num_speakers) # centroids: (num_speakers, dimension) + # number of detected clusters is the number of different speakers + num_different_speakers = centroids.shape[0] + # quick sanity check + assert ( + num_different_speakers >= min_speakers + and num_different_speakers <= max_speakers + ) + + # during counting, we could possibly overcount the number of instantaneous + # speakers due to segmentation errors, so we cap the maximum instantaneous number + # of speakers by the number of detected clusters + count.data = np.minimum(count.data, num_different_speakers) + # reconstruct discrete diarization from raw hard clusters # keep track of inactive speakers diff --git a/pyannote/audio/pipelines/utils/diarization.py b/pyannote/audio/pipelines/utils/diarization.py index 91413350b..f3b89a560 100644 --- a/pyannote/audio/pipelines/utils/diarization.py +++ b/pyannote/audio/pipelines/utils/diarization.py @@ -117,13 +117,11 @@ def optimal_mapping( else: return mapped_hypothesis - # TODO: get rid of onset/offset (binarization should be applied before calling speaker_count) # TODO: get rid of warm-up parameter (trimming should be applied before calling speaker_count) @staticmethod def speaker_count( - segmentations: SlidingWindowFeature, - onset: float = 0.5, - offset: float = None, + binarized_segmentations: SlidingWindowFeature, + max_speakers: Union[int, float], warm_up: Tuple[float, float] = (0.1, 0.1), frames: SlidingWindow = None, ) -> SlidingWindowFeature: @@ -131,12 +129,10 @@ def speaker_count( Parameters ---------- - segmentations : SlidingWindowFeature - (num_chunks, num_frames, 
num_classes)-shaped scores. - onset : float, optional - Onset threshold. Defaults to 0.5 - offset : float, optional - Offset threshold. Defaults to `onset`. + binarized_segmentations : SlidingWindowFeature + (num_chunks, num_frames, num_classes)-shaped binarized scores. + max_speakers : int or np.inf + Maximum number of speakers allowed. Counts will not exceed this number warm_up : (float, float) tuple, optional Left/right warm up ratio of chunk duration. Defaults to (0.1, 0.1), i.e. 10% on both sides. @@ -151,10 +147,7 @@ def speaker_count( (num_frames, 1)-shaped instantaneous speaker count """ - binarized: SlidingWindowFeature = binarize( - segmentations, onset=onset, offset=offset, initial_state=False - ) - trimmed = Inference.trim(binarized, warm_up=warm_up) + trimmed = Inference.trim(binarized_segmentations, warm_up=warm_up) count = Inference.aggregate( np.sum(trimmed, axis=-1, keepdims=True), frames=frames, @@ -162,6 +155,7 @@ def speaker_count( missing=0.0, skip_average=False, ) + count.data[count.data > max_speakers] = max_speakers count.data = np.rint(count.data).astype(np.uint8) return count From 3c503ece9b0ecbea6b717c8507f5124981f6b5d6 Mon Sep 17 00:00:00 2001 From: Dmitrii Mukhutdinov Date: Wed, 5 Jul 2023 07:35:48 +0000 Subject: [PATCH 2/8] Fix resegmentation pipeline --- pyannote/audio/pipelines/resegmentation.py | 11 +++++++++-- pyannote/audio/pipelines/utils/diarization.py | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/pyannote/audio/pipelines/resegmentation.py b/pyannote/audio/pipelines/resegmentation.py index bb71abf22..d01e5d65f 100644 --- a/pyannote/audio/pipelines/resegmentation.py +++ b/pyannote/audio/pipelines/resegmentation.py @@ -39,6 +39,7 @@ get_model, ) from pyannote.audio.utils.permutation import mae_cost_func, permutate +from pyannote.audio.utils.signal import binarize class Resegmentation(SpeakerDiarizationMixin, Pipeline): @@ -181,11 +182,17 @@ def apply( hook("segmentation", segmentations) - # estimate 
frame-level number of instantaneous speakers - count = self.speaker_count( + # binarize segmentations before speaker counting + binarized_segmentations: SlidingWindowFeature = binarize( segmentations, onset=self.onset, offset=self.offset, + initial_state=False, + ) + + # estimate frame-level number of instantaneous speakers + count = self.speaker_count( + binarized_segmentations, warm_up=(self.warm_up, self.warm_up), frames=self._frames, ) diff --git a/pyannote/audio/pipelines/utils/diarization.py b/pyannote/audio/pipelines/utils/diarization.py index f3b89a560..2f28812cb 100644 --- a/pyannote/audio/pipelines/utils/diarization.py +++ b/pyannote/audio/pipelines/utils/diarization.py @@ -121,7 +121,7 @@ def optimal_mapping( @staticmethod def speaker_count( binarized_segmentations: SlidingWindowFeature, - max_speakers: Union[int, float], + max_speakers: Union[int, float] = np.inf, warm_up: Tuple[float, float] = (0.1, 0.1), frames: SlidingWindow = None, ) -> SlidingWindowFeature: From e561a3b19062bad2ff00bed073bef952862b519c Mon Sep 17 00:00:00 2001 From: Dmitrii Mukhutdinov Date: Fri, 14 Jul 2023 06:33:42 +0000 Subject: [PATCH 3/8] Issue warning instead of an assert --- .../audio/pipelines/speaker_diarization.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/pyannote/audio/pipelines/speaker_diarization.py b/pyannote/audio/pipelines/speaker_diarization.py index 12533294b..1579420f1 100644 --- a/pyannote/audio/pipelines/speaker_diarization.py +++ b/pyannote/audio/pipelines/speaker_diarization.py @@ -25,6 +25,8 @@ import functools import itertools import math +import textwrap +import warnings from typing import Callable, Optional, Text, Union import numpy as np @@ -533,11 +535,18 @@ def apply( # number of detected clusters is the number of different speakers num_different_speakers = centroids.shape[0] - # quick sanity check - assert ( - num_different_speakers >= min_speakers - and num_different_speakers <= max_speakers - ) + + # 
detected number of speakers can still be out of bounds + # (specifically, lower than `min_speakers`), since there could be too few embeddings + # to make enough clusters with a given minimum cluster size. + if num_different_speakers < min_speakers or num_different_speakers > max_speakers: + warnings.warn(textwrap.dedent( + f"""The detected number of speakers ({num_different_speakers}) is outside + the given bounds [{min_speakers}, {max_speakers}]. This can happen if the + given audio file is too short to contain {min_speakers} or more speakers. + Try to lower the desired minimal number of speakers. + """ + )) # during counting, we could possibly overcount the number of instantaneous # speakers due to segmentation errors, so we cap the maximum instantaneous number From 0cfa92ed0a79bec5d9a3d33eb7a80e8dee7428b0 Mon Sep 17 00:00:00 2001 From: Dmitrii Mukhutdinov Date: Tue, 8 Aug 2023 09:46:41 +0000 Subject: [PATCH 4/8] Constrain the speaker counting only with `max_speakers` --- pyannote/audio/pipelines/speaker_diarization.py | 5 ++--- pyannote/audio/pipelines/utils/diarization.py | 4 ---- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/pyannote/audio/pipelines/speaker_diarization.py b/pyannote/audio/pipelines/speaker_diarization.py index 1579420f1..ff586adbc 100644 --- a/pyannote/audio/pipelines/speaker_diarization.py +++ b/pyannote/audio/pipelines/speaker_diarization.py @@ -493,7 +493,6 @@ def apply( # estimate frame-level number of instantaneous speakers count = self.speaker_count( binarized_segmentations, - max_speakers, frames=self._frames, warm_up=(0.0, 0.0), ) @@ -550,8 +549,8 @@ def apply( # during counting, we could possibly overcount the number of instantaneous # speakers due to segmentation errors, so we cap the maximum instantaneous number - # of speakers by the `max_speakers` value - count.data = np.minimum(count.data, 
max_speakers) # reconstruct discrete diarization from raw hard clusters diff --git a/pyannote/audio/pipelines/utils/diarization.py b/pyannote/audio/pipelines/utils/diarization.py index 2f28812cb..4a35f7049 100644 --- a/pyannote/audio/pipelines/utils/diarization.py +++ b/pyannote/audio/pipelines/utils/diarization.py @@ -121,7 +121,6 @@ def optimal_mapping( @staticmethod def speaker_count( binarized_segmentations: SlidingWindowFeature, - max_speakers: Union[int, float] = np.inf, warm_up: Tuple[float, float] = (0.1, 0.1), frames: SlidingWindow = None, ) -> SlidingWindowFeature: @@ -131,8 +130,6 @@ def speaker_count( ---------- binarized_segmentations : SlidingWindowFeature (num_chunks, num_frames, num_classes)-shaped binarized scores. - max_speakers : int or np.inf - Maximum number of speakers allowed. Counts will not exceed this number warm_up : (float, float) tuple, optional Left/right warm up ratio of chunk duration. Defaults to (0.1, 0.1), i.e. 10% on both sides. @@ -155,7 +152,6 @@ def speaker_count( missing=0.0, skip_average=False, ) - count.data[count.data > max_speakers] = max_speakers count.data = np.rint(count.data).astype(np.uint8) return count From 47a3ec7bab4a54fb5fd233dda0bdb3a9b68dc578 Mon Sep 17 00:00:00 2001 From: Dmitrii Mukhutdinov Date: Thu, 10 Aug 2023 09:53:00 +0000 Subject: [PATCH 5/8] Fix typing of count.data and possible error due to number of centroids mismatch --- pyannote/audio/pipelines/speaker_diarization.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/pyannote/audio/pipelines/speaker_diarization.py b/pyannote/audio/pipelines/speaker_diarization.py index ff586adbc..550f2adb1 100644 --- a/pyannote/audio/pipelines/speaker_diarization.py +++ b/pyannote/audio/pipelines/speaker_diarization.py @@ -550,7 +550,7 @@ def apply( # during counting, we could possibly overcount the number of instantaneous # speakers due to segmentation errors, so we cap the maximum instantaneous number # of speakers by the 
`max_speakers` value - count.data = np.minimum(count.data, max_speakers) + count.data = np.minimum(count.data, max_speakers).astype(np.int8) # reconstruct discrete diarization from raw hard clusters @@ -607,6 +607,14 @@ def apply( if not return_embeddings: return diarization + # The number of centroids may be smaller than the number of speakers + # in the annotation. This can happen if the number of active speakers + # obtained from `speaker_count` for some frames is larger than the number + # of clusters obtained from `clustering`. In this case, we append zero embeddings + # for extra speakers + if len(diarization.labels()) > centroids.shape[0]: + centroids = np.pad(centroids, (0, len(diarization.labels()) - centroids.shape[0])) + # re-order centroids so that they match # the order given by diarization.labels() inverse_mapping = {label: index for index, label in mapping.items()} @@ -614,11 +622,6 @@ def apply( [inverse_mapping[label] for label in diarization.labels()] ] - # FIXME: the number of centroids may be smaller than the number of speakers - # in the annotation. This can happen if the number of active speakers - # obtained from `speaker_count` for some frames is larger than the number - # of clusters obtained from `clustering`. 
Will be fixed in the future - return diarization, centroids def get_metric(self) -> GreedyDiarizationErrorRate: From 0d72a4eb5b66fdc68bcaf395b1519d1100d8f9ce Mon Sep 17 00:00:00 2001 From: Dmitrii Mukhutdinov Date: Fri, 18 Aug 2023 12:05:46 +0000 Subject: [PATCH 6/8] Fix a couple of elusive bugs --- pyannote/audio/pipelines/speaker_diarization.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pyannote/audio/pipelines/speaker_diarization.py b/pyannote/audio/pipelines/speaker_diarization.py index 550f2adb1..695d423c7 100644 --- a/pyannote/audio/pipelines/speaker_diarization.py +++ b/pyannote/audio/pipelines/speaker_diarization.py @@ -533,7 +533,7 @@ def apply( # centroids: (num_speakers, dimension) # number of detected clusters is the number of different speakers - num_different_speakers = centroids.shape[0] + num_different_speakers = int(np.max(hard_clusters) + 1) # detected number of speakers can still be out of bounds # (specifically, lower than `min_speakers`), since there could be too few embeddings @@ -607,13 +607,17 @@ def apply( if not return_embeddings: return diarization + # this can happen when we use OracleClustering + if centroids is None: + return diarization, None + # The number of centroids may be smaller than the number of speakers # in the annotation. This can happen if the number of active speakers # obtained from `speaker_count` for some frames is larger than the number # of clusters obtained from `clustering`. 
In this case, we append zero embeddings # for extra speakers if len(diarization.labels()) > centroids.shape[0]: - centroids = np.pad(centroids, (0, len(diarization.labels()) - centroids.shape[0])) + centroids = np.pad(centroids, ((0, len(diarization.labels()) - centroids.shape[0]), (0, 0))) # re-order centroids so that they match # the order given by diarization.labels() From 05a333c77bb143991d73e8a4eb849a518f9ce1d1 Mon Sep 17 00:00:00 2001 From: Dmitrii Mukhutdinov Date: Tue, 14 Nov 2023 07:38:16 +0000 Subject: [PATCH 7/8] Small suggestions --- pyannote/audio/pipelines/speaker_diarization.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyannote/audio/pipelines/speaker_diarization.py b/pyannote/audio/pipelines/speaker_diarization.py index 695d423c7..354f6be7e 100644 --- a/pyannote/audio/pipelines/speaker_diarization.py +++ b/pyannote/audio/pipelines/speaker_diarization.py @@ -533,14 +533,15 @@ def apply( # centroids: (num_speakers, dimension) # number of detected clusters is the number of different speakers - num_different_speakers = int(np.max(hard_clusters) + 1) + num_different_speakers = np.max(hard_clusters) + 1 # detected number of speakers can still be out of bounds # (specifically, lower than `min_speakers`), since there could be too few embeddings # to make enough clusters with a given minimum cluster size. if num_different_speakers < min_speakers or num_different_speakers > max_speakers: warnings.warn(textwrap.dedent( - f"""The detected number of speakers ({num_different_speakers}) is outside + f""" + The detected number of speakers ({num_different_speakers}) is outside the given bounds [{min_speakers}, {max_speakers}]. This can happen if the given audio file is too short to contain {min_speakers} or more speakers. Try to lower the desired minimal number of speakers. 
From c1254c47b03c32b7587600d96010dae96ea08d5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Thu, 16 Nov 2023 11:07:11 +0100 Subject: [PATCH 8/8] doc: update changelog --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6feac98ae..19e25f36e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ - fix(pipeline): add missing "embedding" hook call in `SpeakerDiarization` - fix(pipeline): fix `AgglomerativeClustering` to honor `num_clusters` when provided +- fix(pipeline): fix frame-wise speaker count exceeding `max_speakers` or detected `num_speakers` in `SpeakerDiarization` pipeline ### Improvements @@ -26,6 +27,8 @@ - BREAKING(setup): remove `onnxruntime` dependency. You can still use ONNX `hbredin/wespeaker-voxceleb-resnet34-LM` but you will have to install `onnxruntime` yourself. - BREAKING(pipeline): remove `logging_hook` (use `ArtifactHook` instead) +- BREAKING(pipeline): remove `onset` and `offset` parameters in `SpeakerDiarizationMixin.speaker_count` + You should now binarize segmentations before passing them to `speaker_count` ## Version 3.0.1 (2023-09-28)