Skip to content

Commit

Permalink
feat: add support for powerset in VAD and OSD pipelines (#1467)
Browse files Browse the repository at this point in the history
  • Loading branch information
hbredin authored Sep 22, 2023
1 parent 9297c0c commit b796089
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 9 deletions.
19 changes: 15 additions & 4 deletions pyannote/audio/pipelines/overlapped_speech_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,13 @@ def __init__(
)[:, :, -2, np.newaxis]
self._segmentation = Inference(model, **inference_kwargs)

#  hyper-parameters used for hysteresis thresholding
self.onset = Uniform(0.0, 1.0)
self.offset = Uniform(0.0, 1.0)
if model.specifications.powerset:
self.onset = self.offset = 0.5

else:
#  hyper-parameters used for hysteresis thresholding
self.onset = Uniform(0.0, 1.0)
self.offset = Uniform(0.0, 1.0)

# hyper-parameters used for post-processing i.e. removing short overlapped regions
# or filling short gaps between overlapped regions
Expand All @@ -152,14 +156,21 @@ def __init__(
self.recall = recall

def default_parameters(self):
    """Return default hyper-parameters for the configured segmentation model.

    Returns
    -------
    parameters : dict
        Hyper-parameter names mapped to their default values.

    Raises
    ------
    NotImplementedError
        If no default values are known for `self.segmentation`.
    """
    if self.segmentation == "pyannote/segmentation":
        # parameters optimized on DIHARD 3 development set
        return {
            "onset": 0.430,
            "offset": 0.320,
            "min_duration_on": 0.091,
            "min_duration_off": 0.144,
        }

    if self.segmentation == "pyannote/segmentation-3.0.0":
        # No "onset"/"offset" entries here -- presumably because this is a
        # powerset model, for which __init__ pins both thresholds to 0.5
        # (TODO confirm against the model specifications).
        return {
            "min_duration_on": 0.0,
            "min_duration_off": 0.0,
        }

    # Name the offending model so the caller knows why no defaults exist.
    raise NotImplementedError(
        f"No default parameters available for segmentation model "
        f"{self.segmentation!r}."
    )

def classes(self):
Expand Down
20 changes: 15 additions & 5 deletions pyannote/audio/pipelines/voice_activity_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,29 +121,40 @@ def __init__(

# load model and send it to GPU (when available and not already on GPU)
model = get_model(segmentation, use_auth_token=use_auth_token)

inference_kwargs["pre_aggregation_hook"] = lambda scores: np.max(
scores, axis=-1, keepdims=True
)
self._segmentation = Inference(model, **inference_kwargs)

#  hyper-parameters used for hysteresis thresholding
self.onset = Uniform(0.0, 1.0)
self.offset = Uniform(0.0, 1.0)
if model.specifications.powerset:
self.onset = self.offset = 0.5
else:
#  hyper-parameters used for hysteresis thresholding
self.onset = Uniform(0.0, 1.0)
self.offset = Uniform(0.0, 1.0)

# hyper-parameters used for post-processing i.e. removing short speech regions
# or filling short gaps between speech regions
self.min_duration_on = Uniform(0.0, 1.0)
self.min_duration_off = Uniform(0.0, 1.0)

def default_parameters(self):
    """Return default hyper-parameters for the configured segmentation model.

    Returns
    -------
    parameters : dict
        Hyper-parameter names mapped to their default values.

    Raises
    ------
    NotImplementedError
        If no default values are known for `self.segmentation`.
    """
    if self.segmentation == "pyannote/segmentation":
        # parameters optimized on DIHARD 3 development set
        return {
            "onset": 0.767,
            "offset": 0.377,
            "min_duration_on": 0.136,
            "min_duration_off": 0.067,
        }

    if self.segmentation == "pyannote/segmentation-3.0.0":
        # No "onset"/"offset" entries here -- presumably because this is a
        # powerset model, for which __init__ pins both thresholds to 0.5
        # (TODO confirm against the model specifications).
        return {
            "min_duration_on": 0.0,
            "min_duration_off": 0.0,
        }

    # Name the offending model so the caller knows why no defaults exist.
    raise NotImplementedError(
        f"No default parameters available for segmentation model "
        f"{self.segmentation!r}."
    )

def classes(self):
Expand Down Expand Up @@ -289,7 +300,6 @@ def __init__(
self.learning_rate = LogUniform(1e-6, 1)

def apply(self, file: AudioFile) -> Annotation:

# create a copy of file
file = dict(file)

Expand Down

0 comments on commit b796089

Please sign in to comment.