Merge branch 'main' into test_mon
fabiocat93 authored Aug 9, 2024
2 parents edaf888 + f1853bb commit 715f9e4
Showing 11 changed files with 427 additions and 4 deletions.
12 changes: 12 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,15 @@
# 0.7.8 (Fri Aug 09 2024)

#### 🐛 Bug Fix

- Adding documentation, tutorial, and logger to voice activity detection and speaker diarization [#136](https://github.com/sensein/senselab/pull/136) ([@fabiocat93](https://github.com/fabiocat93))

#### Authors: 1

- Fabio Catania ([@fabiocat93](https://github.com/fabiocat93))

---

# 0.7.7 (Fri Aug 09 2024)

#### 🐛 Bug Fix
4 changes: 3 additions & 1 deletion src/senselab/audio/tasks/speaker_diarization/__init__.py
@@ -1 +1,3 @@
"""This module contains the speaker diarization API for senselab."""
""".. include:: ./doc.md""" # noqa: D415

from .api import diarize_audios # noqa: F401
5 changes: 3 additions & 2 deletions src/senselab/audio/tasks/speaker_diarization/api.py
@@ -16,7 +16,7 @@

def diarize_audios(
    audios: List[Audio],
    model: SenselabModel,
    model: SenselabModel = PyannoteAudioModel(path_or_uri="pyannote/speaker-diarization-3.1", revision="main"),
    num_speakers: Optional[int] = None,
    min_speakers: Optional[int] = None,
    max_speakers: Optional[int] = None,
@@ -26,7 +26,8 @@ def diarize_audios(
    Args:
        audios (List[Audio]): The list of audio objects to be diarized.
        model (SenselabModel): The model used for diarization.
        model (SenselabModel): The model used for diarization
            (default is "pyannote/speaker-diarization-3.1").
        device (Optional[DeviceType]): The device to run the model on (default is None).
        num_speakers (Optional[int]): The number of speakers (default is None).
        min_speakers (Optional[int]): The minimum number of speakers (default is None).
30 changes: 30 additions & 0 deletions src/senselab/audio/tasks/speaker_diarization/doc.md
@@ -0,0 +1,30 @@
# Speaker diarization

[![Tutorial](https://img.shields.io/badge/Tutorial-Click%20Here-blue?style=for-the-badge)](https://github.com/sensein/senselab/blob/main/tutorials/speaker_diarization.ipynb)

## Task Overview
Speaker diarization is the process of segmenting audio recordings by speaker labels, aiming to answer the question: **"Who spoke when?"**

## Models

In `senselab`, we integrate [pyannote.audio](https://github.com/pyannote/pyannote-audio) models for speaker diarization. These models can be explored on the [Hugging Face Hub](https://huggingface.co/pyannote). We may integrate additional approaches for speaker diarization into the package in the future.
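
Below is a minimal usage sketch (the audio path is a placeholder; per the tutorial, the model expects 16 kHz input):

```python
from senselab.audio.data_structures.audio import Audio
from senselab.audio.tasks.preprocessing.preprocessing import resample_audios
from senselab.audio.tasks.speaker_diarization import diarize_audios
from senselab.utils.data_structures.model import PyannoteAudioModel

# Load an audio file (placeholder path) and resample it to the 16 kHz expected by the model.
audio = Audio.from_filepath("path/to/audio.wav")
audio = resample_audios([audio], 16000)[0]

# Diarize with the pyannote pipeline; one list of ScriptLine segments is returned per input audio.
model = PyannoteAudioModel(path_or_uri="pyannote/speaker-diarization-3.1")
results = diarize_audios(audios=[audio], model=model)
```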

## Evaluation

### Metrics

The **Diarization Error Rate (DER)** is the standard metric for evaluating and comparing speaker diarization systems. It is defined as:
```text
DER = (false alarm + missed detection + confusion) / total
```
where
- `false alarm` is the duration of non-speech incorrectly classified as speech,
- `missed detection` is the duration of speech incorrectly classified as non-speech,
- `confusion` is the duration of speaker confusion, and
- `total` is the sum over all speakers of their reference speech duration.

**Note:** DER takes overlapping speech into account. This can lead to increased missed detection rates if the speaker diarization system does not include an overlapping speech detection module.
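
As a worked example of this formula, here is a quick computation with hypothetical durations (all values are made up for illustration):

```python
# Hypothetical durations (in seconds) for a fictitious evaluation.
false_alarm = 2.0       # non-speech scored as speech
missed_detection = 3.0  # speech scored as non-speech
confusion = 5.0         # speech attributed to the wrong speaker
total = 100.0           # total reference speech duration, summed over speakers

der = (false_alarm + missed_detection + confusion) / total
print(f"DER = {der:.2%}")  # DER = 10.00%
```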

### Benchmark

You can find a benchmark of the latest pyannote.audio model's performance on various time-stamped speech datasets [here](https://github.com/pyannote/pyannote-audio?tab=readme-ov-file#benchmark).
13 changes: 13 additions & 0 deletions src/senselab/audio/tasks/speaker_diarization/pyannote.py
@@ -1,5 +1,6 @@
"""This module implements the Pyannote Diarization task."""

import time
from typing import Dict, List, Optional, Union

import torch
@@ -8,6 +9,7 @@

from senselab.audio.data_structures.audio import Audio
from senselab.utils.data_structures.device import DeviceType, _select_device_and_dtype
from senselab.utils.data_structures.logging import logger
from senselab.utils.data_structures.model import PyannoteAudioModel
from senselab.utils.data_structures.script_line import ScriptLine

@@ -95,7 +97,15 @@ def _annotation_to_script_lines(annotation: Annotation) -> List[ScriptLine]:
            + str(expected_sample_rate)
        )

    # Record the start time of the model initialization
    start_time_model = time.time()
    pipeline = PyannoteDiarization._get_pyannote_diarization_pipeline(model=model, device=device)
    end_time_model = time.time()
    elapsed_time_model = end_time_model - start_time_model
    logger.info(f"Time taken to initialize the pyannote model: {elapsed_time_model:.2f} seconds")

    # Perform diarization
    start_time_diarization = time.time()
    results: List[List[ScriptLine]] = []
    for audio in audios:
        diarization = pipeline(
@@ -105,5 +115,8 @@ def _annotation_to_script_lines(annotation: Annotation) -> List[ScriptLine]:
            max_speakers=max_speakers,
        )
        results.append(_annotation_to_script_lines(diarization))
    end_time_diarization = time.time()
    elapsed_time_diarization = end_time_diarization - start_time_diarization
    logger.info(f"Time taken to perform diarization: {elapsed_time_diarization:.2f} seconds")

    return results
2 changes: 1 addition & 1 deletion src/senselab/audio/tasks/speech_enhancement/speechbrain.py
@@ -66,7 +66,7 @@ def enhance_audios_with_speechbrain(
"""
# Take the start time of the model initialization
start_time_model = time.time()
enhancer, device, dtype = cls._get_speechbrain_model(model=model, device=device)
enhancer, device, _ = cls._get_speechbrain_model(model=model, device=device)
end_time_model = time.time()
elapsed_time_model = end_time_model - start_time_model
logger.info(f"Time taken to initialize the speechbrain model: {elapsed_time_model:.2f} seconds")
3 changes: 3 additions & 0 deletions src/senselab/audio/tasks/voice_activity_detection/__init__.py
@@ -0,0 +1,3 @@
""".. include:: ./doc.md""" # noqa: D415

from .api import detect_human_voice_activity_in_audios # noqa: F401
46 changes: 46 additions & 0 deletions src/senselab/audio/tasks/voice_activity_detection/doc.md
@@ -0,0 +1,46 @@
# Voice Activity Detection (VAD)

[![Tutorial](https://img.shields.io/badge/Tutorial-Click%20Here-blue?style=for-the-badge)](https://github.com/sensein/senselab/blob/main/tutorials/voice_activity_detection.ipynb)

## Task Overview

Voice Activity Detection (VAD) is a binary classification task that identifies the presence of human voice in audio. The primary challenge in VAD lies in differentiating between noise and human voice, particularly in environments with significant background noise (e.g., fans, car engines). While VAD performs well in quiet environments where distinguishing between silence and speech is straightforward, the task becomes more difficult when background noise or non-standard speech patterns are present.

## Models

In `senselab`, we integrate [pyannote.audio](https://github.com/pyannote/pyannote-audio) models for VAD. These models can be explored on the [Hugging Face Hub](https://huggingface.co/pyannote). Additional approaches for VAD may be integrated into the package in the future.
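
Below is a minimal usage sketch, assuming the function mirrors the diarization API (a list of `Audio` objects in, segments out); the audio path and model choice are illustrative:

```python
from senselab.audio.data_structures.audio import Audio
from senselab.audio.tasks.voice_activity_detection import detect_human_voice_activity_in_audios
from senselab.utils.data_structures.model import PyannoteAudioModel

# Assumption: the call signature mirrors diarize_audios (audios, model, device, ...).
audio = Audio.from_filepath("path/to/audio.wav")  # placeholder path
model = PyannoteAudioModel(path_or_uri="pyannote/speaker-diarization-3.1")  # illustrative model choice
results = detect_human_voice_activity_in_audios(audios=[audio], model=model)
```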

## Evaluation

### Metrics

The primary metrics used to evaluate VAD modules are Detection Error Rate (DER) and Detection Cost Function (DCF); a small worked example follows the definitions below.

- **Detection Error Rate (DER):**

```text
DER = (false alarm + missed detection) / total duration of speech in reference
```

- **False alarm:** Duration of non-speech incorrectly classified as speech.
- **Missed detection:** Duration of speech incorrectly classified as non-speech.
- **Total:** Total duration of speech in the reference.

- **Detection Cost Function (DCF):**

```text
DCF = 0.25 * false alarm rate + 0.75 * miss rate
```

- **False alarm rate:** Proportion of non-speech incorrectly classified as speech.
- **Miss rate:** Proportion of speech incorrectly classified as non-speech.
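
As a worked example of both formulas, with made-up durations:

```python
# Hypothetical durations (in seconds) for a fictitious evaluation.
false_alarm = 1.5       # non-speech scored as speech
missed_detection = 2.5  # speech scored as non-speech
total_speech = 80.0     # total speech duration in the reference
total_nonspeech = 40.0  # total non-speech duration in the reference

der = (false_alarm + missed_detection) / total_speech
dcf = 0.25 * (false_alarm / total_nonspeech) + 0.75 * (missed_detection / total_speech)
print(f"DER = {der:.2%}")  # DER = 5.00%
print(f"DCF = {dcf:.2%}")  # DCF = 3.28%
```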

### Additional Metrics

VAD systems may also be evaluated using the following metrics (a minimal computation sketch follows below):

- **Accuracy:** Proportion of the input signal correctly classified.
- **Precision:** Proportion of detected speech that is actually speech.
- **Recall:** Proportion of speech that is correctly detected.

For more detailed information on these metrics, refer to the [pyannote.metrics documentation](https://pyannote.github.io/pyannote-metrics/reference.html).
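
A minimal sketch of computing these metrics at the frame level with scikit-learn (the per-frame labels are made up, and scikit-learn is used here for illustration rather than being a `senselab` requirement):

```python
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Hypothetical per-frame binary labels: 1 = speech, 0 = non-speech.
reference = np.array([0, 1, 1, 1, 0, 0, 1, 1])
predicted = np.array([0, 1, 1, 0, 0, 1, 1, 1])

print("Accuracy: ", accuracy_score(reference, predicted))   # fraction of frames classified correctly
print("Precision:", precision_score(reference, predicted))  # fraction of detected speech that is speech
print("Recall:   ", recall_score(reference, predicted))     # fraction of reference speech that is detected
```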
56 changes: 56 additions & 0 deletions src/senselab/utils/tasks/plotting.py
@@ -1,5 +1,8 @@
"""This module implements plotting methods for utilities."""

from typing import List

import matplotlib.cm as cm
import matplotlib.pyplot as plt

from senselab.utils.data_structures.script_line import ScriptLine
@@ -52,3 +55,56 @@ def plot_transcript(transcript: ScriptLine) -> None:

    # Show the plot
    plt.show()


def plot_segment(segments: List[ScriptLine]) -> None:
    """Plots the segments of a transcript over time.

    Args:
        segments (List[ScriptLine]): The segments to plot, each with start and end times and a speaker label.

    Returns:
        None

    Todo:
        - Add option to save the plot
        - Add option to choose the size of the figure
        - Add check if transcript contains segments with time information
    """
    start_times = []
    end_times = []
    labels = []

    for segment in segments:
        # Ensure that segments have start and end times and a label
        if segment.start is None or segment.end is None or segment.speaker is None:
            raise ValueError("Each segment must have start and end times and a label.")
        else:
            start_times.append(segment.start)
            end_times.append(segment.end)
            labels.append(segment.speaker)

    # Create a figure and axis
    _, ax = plt.subplots(figsize=(12, 6))

    # Create a color map based on unique labels
    unique_labels = list(set(labels))
    color_map = cm.get_cmap("tab10", len(unique_labels))  # 'tab10' provides 10 distinct colors
    label_to_color = {label: color_map(i) for i, label in enumerate(unique_labels)}
    label_to_y_value = {label: i for i, label in enumerate(unique_labels)}  # Assign y-value based on label index

    # Plot each segment and add text label with color
    for i, label in enumerate(labels):
        color = label_to_color[label]
        y_value = label_to_y_value[label]  # Get y-value based on label
        ax.plot([start_times[i], end_times[i]], [y_value, y_value], marker="o", color=color, linewidth=2)
        ax.text((start_times[i] + end_times[i]) / 2, y_value, label, ha="center", va="bottom", color=color)

    # Setting labels and title
    ax.set_yticks(range(len(unique_labels)))
    ax.set_yticklabels(unique_labels)
    ax.set_xlabel("Time (seconds)")
    ax.set_title("Segment Visualization Over Time")

    # Show the plot
    plt.show()
115 changes: 115 additions & 0 deletions tutorials/speaker_diarization.ipynb
@@ -0,0 +1,115 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Speaker diarization\n",
"\n",
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sensein/senselab/blob/main/tutorials/speaker_diarization.ipynb)\n",
"\n",
"This tutorial demonstrates how to use the `diarize_audios` function to perform speaker diarization on some audio files, which means to segment the audio into multiple speakers."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Import necessary modules\n",
"from senselab.audio.data_structures.audio import Audio\n",
"from senselab.audio.tasks.speaker_diarization import diarize_audios\n",
"from senselab.utils.data_structures.model import PyannoteAudioModel\n",
"from senselab.utils.data_structures.device import DeviceType\n",
"from senselab.audio.tasks.plotting.plotting import play_audio\n",
"from senselab.audio.tasks.preprocessing.preprocessing import resample_audios\n",
"from senselab.utils.tasks.plotting import plot_segment"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Initialize a PyannoteAudioModel for speaker diarization, providing the model's path or URI.\n",
"model = PyannoteAudioModel(path_or_uri=\"pyannote/speaker-diarization-3.1\")\n",
"\n",
"# Specify the device type to be used for processing (CPU in this case).\n",
"device = DeviceType.CPU"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load an audio file from the specified file path into an Audio object.\n",
"audio = Audio.from_filepath(\"../src/tests/data_for_testing/audio_48khz_mono_16bits.wav\")\n",
"\n",
"# Resample the audio to 16kHz, as this is the expected input format for the model.\n",
"# The resample_audios function returns a list, so we take the first (and only) element.\n",
"audio = resample_audios([audio], 16000)[0]\n",
"\n",
"# Play the resampled audio to verify the preprocessing step was successful.\n",
"play_audio(audio)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Perform speaker diarization on the audio using the specified model and device.\n",
"# The function returns a list of results, where each element corresponds to an audio segment.\n",
"results = diarize_audios(audios=[audio], model=model, device=device)\n",
"\n",
"# Print the results of speaker diarization to the console.\n",
"print(results)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Plot the detected speakers for visualization.\n",
"plot_segment(results[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Ehm wait**. In the audio, we can hear four speakers, but the speaker diarization results indicate only two speakers. Why is this happening?\n",
"\n",
"Unfortunately, the model is not perfect and can make mistakes. We can try adjusting the parameters by setting `num_speakers=4`, `min_speakers=4`, and `max_speakers=4` to force the model to recognize four speakers. However, this approach doesn't always work as expected."
]
}
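,
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# As suggested above, hint the expected number of speakers to the pipeline.\n",
"# (Illustrative sketch: the output may still not match the true speaker count.)\n",
"results = diarize_audios(\n",
"    audios=[audio], model=model, device=device, num_speakers=4, min_speakers=4, max_speakers=4\n",
")\n",
"plot_segment(results[0])"
]
}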
],
"metadata": {
"kernelspec": {
"display_name": "senselab-lOUhtavG-py3.10",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}