From 629fdd207874f563426e119dc882d057e1341ef3 Mon Sep 17 00:00:00 2001
From: Dmitrii Mukhutdinov <flyingleafe@gmail.com>
Date: Thu, 27 Apr 2023 07:27:29 +0000
Subject: [PATCH 1/8] Fix instantaneous speaker numbers overcounting in the
 diarization pipeline

---
 pyannote/audio/pipelines/clustering.py        |  1 -
 .../audio/pipelines/speaker_diarization.py    | 39 ++++++++++++-------
 pyannote/audio/pipelines/utils/diarization.py | 22 ++++-------
 3 files changed, 33 insertions(+), 29 deletions(-)

diff --git a/pyannote/audio/pipelines/clustering.py b/pyannote/audio/pipelines/clustering.py
index c51cdcc50..b63ab214f 100644
--- a/pyannote/audio/pipelines/clustering.py
+++ b/pyannote/audio/pipelines/clustering.py
@@ -253,7 +253,6 @@ def __call__(
             hard_clusters = np.zeros((num_chunks, num_speakers), dtype=np.int8)
             soft_clusters = np.ones((num_chunks, num_speakers, 1))
             centroids = np.mean(train_embeddings, axis=0, keepdims=True)
-
             return hard_clusters, soft_clusters, centroids
 
         train_clusters = self.cluster(
diff --git a/pyannote/audio/pipelines/speaker_diarization.py b/pyannote/audio/pipelines/speaker_diarization.py
index d5cf04e05..12533294b 100644
--- a/pyannote/audio/pipelines/speaker_diarization.py
+++ b/pyannote/audio/pipelines/speaker_diarization.py
@@ -478,12 +478,20 @@ def apply(
         hook("segmentation", segmentations)
         #   shape: (num_chunks, num_frames, local_num_speakers)
 
+        # binarize segmentation
+        if self._segmentation.model.specifications.powerset:
+            binarized_segmentations = segmentations
+        else:
+            binarized_segmentations: SlidingWindowFeature = binarize(
+                segmentations,
+                onset=self.segmentation.threshold,
+                initial_state=False,
+            )
+
         # estimate frame-level number of instantaneous speakers
         count = self.speaker_count(
-            segmentations,
-            onset=0.5
-            if self._segmentation.model.specifications.powerset
-            else self.segmentation.threshold,
+            binarized_segmentations,
+            max_speakers,
             frames=self._frames,
             warm_up=(0.0, 0.0),
         )
@@ -499,16 +507,6 @@ def apply(
 
             return diarization
 
-        # binarize segmentation
-        if self._segmentation.model.specifications.powerset:
-            binarized_segmentations = segmentations
-        else:
-            binarized_segmentations: SlidingWindowFeature = binarize(
-                segmentations,
-                onset=self.segmentation.threshold,
-                initial_state=False,
-            )
-
         if self.klustering == "OracleClustering" and not return_embeddings:
             embeddings = None
         else:
@@ -533,6 +531,19 @@ def apply(
         # hard_clusters: (num_chunks, num_speakers)
         # centroids: (num_speakers, dimension)
 
+        # number of detected clusters is the number of different speakers
+        num_different_speakers = centroids.shape[0]
+        # quick sanity check
+        assert (
+            num_different_speakers >= min_speakers
+            and num_different_speakers <= max_speakers
+        )
+
+        # during counting, we could possibly overcount the number of instantaneous
+        # speakers due to segmentation errors, so we cap the maximum instantaneous number
+        # of speakers by the number of detected clusters
+        count.data = np.minimum(count.data, num_different_speakers)
+
         # reconstruct discrete diarization from raw hard clusters
 
         # keep track of inactive speakers
diff --git a/pyannote/audio/pipelines/utils/diarization.py b/pyannote/audio/pipelines/utils/diarization.py
index 91413350b..f3b89a560 100644
--- a/pyannote/audio/pipelines/utils/diarization.py
+++ b/pyannote/audio/pipelines/utils/diarization.py
@@ -117,13 +117,11 @@ def optimal_mapping(
         else:
             return mapped_hypothesis
 
-    # TODO: get rid of onset/offset (binarization should be applied before calling speaker_count)
     # TODO: get rid of warm-up parameter (trimming should be applied before calling speaker_count)
     @staticmethod
     def speaker_count(
-        segmentations: SlidingWindowFeature,
-        onset: float = 0.5,
-        offset: float = None,
+        binarized_segmentations: SlidingWindowFeature,
+        max_speakers: Union[int, float],
         warm_up: Tuple[float, float] = (0.1, 0.1),
         frames: SlidingWindow = None,
     ) -> SlidingWindowFeature:
@@ -131,12 +129,10 @@ def speaker_count(
 
         Parameters
         ----------
-        segmentations : SlidingWindowFeature
-            (num_chunks, num_frames, num_classes)-shaped scores.
-        onset : float, optional
-           Onset threshold. Defaults to 0.5
-        offset : float, optional
-           Offset threshold. Defaults to `onset`.
+        binarized_segmentations : SlidingWindowFeature
+            (num_chunks, num_frames, num_classes)-shaped binarized scores.
+        max_speakers : int or np.inf
+            Maximum number of speakers allowed. Counts will not exceed this number
         warm_up : (float, float) tuple, optional
             Left/right warm up ratio of chunk duration.
             Defaults to (0.1, 0.1), i.e. 10% on both sides.
@@ -151,10 +147,7 @@ def speaker_count(
             (num_frames, 1)-shaped instantaneous speaker count
         """
 
-        binarized: SlidingWindowFeature = binarize(
-            segmentations, onset=onset, offset=offset, initial_state=False
-        )
-        trimmed = Inference.trim(binarized, warm_up=warm_up)
+        trimmed = Inference.trim(binarized_segmentations, warm_up=warm_up)
         count = Inference.aggregate(
             np.sum(trimmed, axis=-1, keepdims=True),
             frames=frames,
@@ -162,6 +155,7 @@ def speaker_count(
             missing=0.0,
             skip_average=False,
         )
+        count.data[count.data > max_speakers] = max_speakers
         count.data = np.rint(count.data).astype(np.uint8)
 
         return count

From 3c503ece9b0ecbea6b717c8507f5124981f6b5d6 Mon Sep 17 00:00:00 2001
From: Dmitrii Mukhutdinov <flyingleafe@gmail.com>
Date: Wed, 5 Jul 2023 07:35:48 +0000
Subject: [PATCH 2/8] Fix resegmentation pipeline

---
 pyannote/audio/pipelines/resegmentation.py    | 11 +++++++++--
 pyannote/audio/pipelines/utils/diarization.py |  2 +-
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/pyannote/audio/pipelines/resegmentation.py b/pyannote/audio/pipelines/resegmentation.py
index bb71abf22..d01e5d65f 100644
--- a/pyannote/audio/pipelines/resegmentation.py
+++ b/pyannote/audio/pipelines/resegmentation.py
@@ -39,6 +39,7 @@
     get_model,
 )
 from pyannote.audio.utils.permutation import mae_cost_func, permutate
+from pyannote.audio.utils.signal import binarize
 
 
 class Resegmentation(SpeakerDiarizationMixin, Pipeline):
@@ -181,11 +182,17 @@ def apply(
 
         hook("segmentation", segmentations)
 
-        # estimate frame-level number of instantaneous speakers
-        count = self.speaker_count(
+        # binarize segmentations before speaker counting
+        binarized_segmentations: SlidingWindowFeature = binarize(
             segmentations,
             onset=self.onset,
             offset=self.offset,
+            initial_state=False,
+        )
+
+        # estimate frame-level number of instantaneous speakers
+        count = self.speaker_count(
+            binarized_segmentations,
             warm_up=(self.warm_up, self.warm_up),
             frames=self._frames,
         )
diff --git a/pyannote/audio/pipelines/utils/diarization.py b/pyannote/audio/pipelines/utils/diarization.py
index f3b89a560..2f28812cb 100644
--- a/pyannote/audio/pipelines/utils/diarization.py
+++ b/pyannote/audio/pipelines/utils/diarization.py
@@ -121,7 +121,7 @@ def optimal_mapping(
     @staticmethod
     def speaker_count(
         binarized_segmentations: SlidingWindowFeature,
-        max_speakers: Union[int, float],
+        max_speakers: Union[int, float] = np.inf,
         warm_up: Tuple[float, float] = (0.1, 0.1),
         frames: SlidingWindow = None,
     ) -> SlidingWindowFeature:

From e561a3b19062bad2ff00bed073bef952862b519c Mon Sep 17 00:00:00 2001
From: Dmitrii Mukhutdinov <flyingleafe@gmail.com>
Date: Fri, 14 Jul 2023 06:33:42 +0000
Subject: [PATCH 3/8] Issue warning instead of an assert

---
 .../audio/pipelines/speaker_diarization.py    | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/pyannote/audio/pipelines/speaker_diarization.py b/pyannote/audio/pipelines/speaker_diarization.py
index 12533294b..1579420f1 100644
--- a/pyannote/audio/pipelines/speaker_diarization.py
+++ b/pyannote/audio/pipelines/speaker_diarization.py
@@ -25,6 +25,8 @@
 import functools
 import itertools
 import math
+import textwrap
+import warnings
 from typing import Callable, Optional, Text, Union
 
 import numpy as np
@@ -533,11 +535,18 @@ def apply(
 
         # number of detected clusters is the number of different speakers
         num_different_speakers = centroids.shape[0]
-        # quick sanity check
-        assert (
-            num_different_speakers >= min_speakers
-            and num_different_speakers <= max_speakers
-        )
+
+        # detected number of speakers can still be out of bounds
+        # (specifically, lower than `min_speakers`), since there could be too few embeddings
+        # to make enough clusters with a given minimum cluster size.
+        if num_different_speakers < min_speakers or num_different_speakers > max_speakers:
+            warnings.warn(textwrap.dedent(
+                f"""The detected number of speakers ({num_different_speakers}) is outside
+                the given bounds [{min_speakers}, {max_speakers}]. This can happen if the
+                given audio file is too short to contain {min_speakers} or more speakers.
+                Try to lower the desired minimal number of speakers.
+                """
+            ))
 
         # during counting, we could possibly overcount the number of instantaneous
         # speakers due to segmentation errors, so we cap the maximum instantaneous number

From 0cfa92ed0a79bec5d9a3d33eb7a80e8dee7428b0 Mon Sep 17 00:00:00 2001
From: Dmitrii Mukhutdinov <flyingleafe@gmail.com>
Date: Tue, 8 Aug 2023 09:46:41 +0000
Subject: [PATCH 4/8] Constrain the speaker counting only with `max_speakers`

---
 pyannote/audio/pipelines/speaker_diarization.py | 5 ++---
 pyannote/audio/pipelines/utils/diarization.py   | 4 ----
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/pyannote/audio/pipelines/speaker_diarization.py b/pyannote/audio/pipelines/speaker_diarization.py
index 1579420f1..ff586adbc 100644
--- a/pyannote/audio/pipelines/speaker_diarization.py
+++ b/pyannote/audio/pipelines/speaker_diarization.py
@@ -493,7 +493,6 @@ def apply(
         # estimate frame-level number of instantaneous speakers
         count = self.speaker_count(
             binarized_segmentations,
-            max_speakers,
             frames=self._frames,
             warm_up=(0.0, 0.0),
         )
@@ -550,8 +549,8 @@ def apply(
 
         # during counting, we could possibly overcount the number of instantaneous
         # speakers due to segmentation errors, so we cap the maximum instantaneous number
-        # of speakers by the number of detected clusters
-        count.data = np.minimum(count.data, num_different_speakers)
+        # of speakers by the `max_speakers` value
+        count.data = np.minimum(count.data, max_speakers)
 
         # reconstruct discrete diarization from raw hard clusters
 
diff --git a/pyannote/audio/pipelines/utils/diarization.py b/pyannote/audio/pipelines/utils/diarization.py
index 2f28812cb..4a35f7049 100644
--- a/pyannote/audio/pipelines/utils/diarization.py
+++ b/pyannote/audio/pipelines/utils/diarization.py
@@ -121,7 +121,6 @@ def optimal_mapping(
     @staticmethod
     def speaker_count(
         binarized_segmentations: SlidingWindowFeature,
-        max_speakers: Union[int, float] = np.inf,
         warm_up: Tuple[float, float] = (0.1, 0.1),
         frames: SlidingWindow = None,
     ) -> SlidingWindowFeature:
@@ -131,8 +130,6 @@ def speaker_count(
         ----------
         binarized_segmentations : SlidingWindowFeature
             (num_chunks, num_frames, num_classes)-shaped binarized scores.
-        max_speakers : int or np.inf
-            Maximum number of speakers allowed. Counts will not exceed this number
         warm_up : (float, float) tuple, optional
             Left/right warm up ratio of chunk duration.
             Defaults to (0.1, 0.1), i.e. 10% on both sides.
@@ -155,7 +152,6 @@ def speaker_count(
             missing=0.0,
             skip_average=False,
         )
-        count.data[count.data > max_speakers] = max_speakers
         count.data = np.rint(count.data).astype(np.uint8)
 
         return count

From 47a3ec7bab4a54fb5fd233dda0bdb3a9b68dc578 Mon Sep 17 00:00:00 2001
From: Dmitrii Mukhutdinov <flyingleafe@gmail.com>
Date: Thu, 10 Aug 2023 09:53:00 +0000
Subject: [PATCH 5/8] Fix typing of count.data and possible error due to number
 of centroids mismatch

---
 pyannote/audio/pipelines/speaker_diarization.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/pyannote/audio/pipelines/speaker_diarization.py b/pyannote/audio/pipelines/speaker_diarization.py
index ff586adbc..550f2adb1 100644
--- a/pyannote/audio/pipelines/speaker_diarization.py
+++ b/pyannote/audio/pipelines/speaker_diarization.py
@@ -550,7 +550,7 @@ def apply(
         # during counting, we could possibly overcount the number of instantaneous
         # speakers due to segmentation errors, so we cap the maximum instantaneous number
         # of speakers by the `max_speakers` value
-        count.data = np.minimum(count.data, max_speakers)
+        count.data = np.minimum(count.data, max_speakers).astype(np.int8)
 
         # reconstruct discrete diarization from raw hard clusters
 
@@ -607,6 +607,14 @@ def apply(
         if not return_embeddings:
             return diarization
 
+        # The number of centroids may be smaller than the number of speakers
+        # in the annotation. This can happen if the number of active speakers
+        # obtained from `speaker_count` for some frames is larger than the number
+        # of clusters obtained from `clustering`. In this case, we append zero embeddings
+        # for extra speakers
+        if len(diarization.labels()) > centroids.shape[0]:
+            centroids = np.pad(centroids, (0, len(diarization.labels()) - centroids.shape[0]))
+
         # re-order centroids so that they match
         # the order given by diarization.labels()
         inverse_mapping = {label: index for index, label in mapping.items()}
@@ -614,11 +622,6 @@ def apply(
             [inverse_mapping[label] for label in diarization.labels()]
         ]
 
-        # FIXME: the number of centroids may be smaller than the number of speakers
-        # in the annotation. This can happen if the number of active speakers
-        # obtained from `speaker_count` for some frames is larger than the number
-        # of clusters obtained from `clustering`. Will be fixed in the future
-
         return diarization, centroids
 
     def get_metric(self) -> GreedyDiarizationErrorRate:

From 0d72a4eb5b66fdc68bcaf395b1519d1100d8f9ce Mon Sep 17 00:00:00 2001
From: Dmitrii Mukhutdinov <flyingleafe@gmail.com>
Date: Fri, 18 Aug 2023 12:05:46 +0000
Subject: [PATCH 6/8] Fix a couple of elusive bugs

---
 pyannote/audio/pipelines/speaker_diarization.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/pyannote/audio/pipelines/speaker_diarization.py b/pyannote/audio/pipelines/speaker_diarization.py
index 550f2adb1..695d423c7 100644
--- a/pyannote/audio/pipelines/speaker_diarization.py
+++ b/pyannote/audio/pipelines/speaker_diarization.py
@@ -533,7 +533,7 @@ def apply(
         # centroids: (num_speakers, dimension)
 
         # number of detected clusters is the number of different speakers
-        num_different_speakers = centroids.shape[0]
+        num_different_speakers = int(np.max(hard_clusters) + 1)
 
         # detected number of speakers can still be out of bounds
         # (specifically, lower than `min_speakers`), since there could be too few embeddings
@@ -607,13 +607,17 @@ def apply(
         if not return_embeddings:
             return diarization
 
+        # this can happen when we use OracleClustering
+        if centroids is None:
+            return diarization, None
+
         # The number of centroids may be smaller than the number of speakers
         # in the annotation. This can happen if the number of active speakers
         # obtained from `speaker_count` for some frames is larger than the number
         # of clusters obtained from `clustering`. In this case, we append zero embeddings
         # for extra speakers
         if len(diarization.labels()) > centroids.shape[0]:
-            centroids = np.pad(centroids, (0, len(diarization.labels()) - centroids.shape[0]))
+            centroids = np.pad(centroids, ((0, len(diarization.labels()) - centroids.shape[0]), (0, 0)))
 
         # re-order centroids so that they match
         # the order given by diarization.labels()

From 05a333c77bb143991d73e8a4eb849a518f9ce1d1 Mon Sep 17 00:00:00 2001
From: Dmitrii Mukhutdinov <flyingleafe@gmail.com>
Date: Tue, 14 Nov 2023 07:38:16 +0000
Subject: [PATCH 7/8] Small suggestions

---
 pyannote/audio/pipelines/speaker_diarization.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pyannote/audio/pipelines/speaker_diarization.py b/pyannote/audio/pipelines/speaker_diarization.py
index 695d423c7..354f6be7e 100644
--- a/pyannote/audio/pipelines/speaker_diarization.py
+++ b/pyannote/audio/pipelines/speaker_diarization.py
@@ -533,14 +533,15 @@ def apply(
         # centroids: (num_speakers, dimension)
 
         # number of detected clusters is the number of different speakers
-        num_different_speakers = int(np.max(hard_clusters) + 1)
+        num_different_speakers = np.max(hard_clusters) + 1
 
         # detected number of speakers can still be out of bounds
         # (specifically, lower than `min_speakers`), since there could be too few embeddings
         # to make enough clusters with a given minimum cluster size.
         if num_different_speakers < min_speakers or num_different_speakers > max_speakers:
             warnings.warn(textwrap.dedent(
-                f"""The detected number of speakers ({num_different_speakers}) is outside
+                f"""
+                The detected number of speakers ({num_different_speakers}) is outside
                 the given bounds [{min_speakers}, {max_speakers}]. This can happen if the
                 given audio file is too short to contain {min_speakers} or more speakers.
                 Try to lower the desired minimal number of speakers.

From c1254c47b03c32b7587600d96010dae96ea08d5f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= <hbredin@users.noreply.github.com>
Date: Thu, 16 Nov 2023 11:07:11 +0100
Subject: [PATCH 8/8] doc: update changelog

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6feac98ae..19e25f36e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@
 
 - fix(pipeline): add missing "embedding" hook call in `SpeakerDiarization`
 - fix(pipeline): fix `AgglomerativeClustering` to honor `num_clusters` when provided
+- fix(pipeline): fix frame-wise speaker count exceeding `max_speakers` or detected `num_speakers` in `SpeakerDiarization` pipeline
 
 ### Improvements
 
@@ -26,6 +27,8 @@
 - BREAKING(setup): remove `onnxruntime` dependency.
   You can still use ONNX `hbredin/wespeaker-voxceleb-resnet34-LM` but you will have to install `onnxruntime` yourself.
 - BREAKING(pipeline): remove `logging_hook` (use `ArtifactHook` instead)
+- BREAKING(pipeline): remove `onset` and `offset` parameters in `SpeakerDiarizationMixin.speaker_count`  
+  You should now binarize segmentations before passing them to `speaker_count`
 
 ## Version 3.0.1 (2023-09-28)