Merge branch 'main' into test_mon
fabiocat93 authored Aug 9, 2024
2 parents edaf888 + f1853bb commit 715f9e4
Showing 11 changed files with 427 additions and 4 deletions.
12 changes: 12 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,15 @@
# 0.7.8 (Fri Aug 09 2024)

#### 🐛 Bug Fix

- Adding documentation, tutorial, and logger to voice activity detection and speaker diarization [#136](https://github.com/sensein/senselab/pull/136) ([@fabiocat93](https://github.com/fabiocat93))

#### Authors: 1

- Fabio Catania ([@fabiocat93](https://github.com/fabiocat93))

---

# 0.7.7 (Fri Aug 09 2024)

#### 🐛 Bug Fix
4 changes: 3 additions & 1 deletion src/senselab/audio/tasks/speaker_diarization/__init__.py
@@ -1 +1,3 @@
"""This module contains the speaker diarization API for senselab."""
""".. include:: ./doc.md""" # noqa: D415

from .api import diarize_audios # noqa: F401
5 changes: 3 additions & 2 deletions src/senselab/audio/tasks/speaker_diarization/api.py
@@ -16,7 +16,7 @@

def diarize_audios(
    audios: List[Audio],
    model: SenselabModel,
    model: SenselabModel = PyannoteAudioModel(path_or_uri="pyannote/speaker-diarization-3.1", revision="main"),
    num_speakers: Optional[int] = None,
    min_speakers: Optional[int] = None,
    max_speakers: Optional[int] = None,
@@ -26,7 +26,8 @@ def diarize_audios(
    Args:
        audios (List[Audio]): The list of audio objects to be diarized.
        model (SenselabModel): The model used for diarization.
        model (SenselabModel): The model used for diarization
            (default is "pyannote/speaker-diarization-3.1").
        device (Optional[DeviceType]): The device to run the model on (default is None).
        num_speakers (Optional[int]): The number of speakers (default is None).
        min_speakers (Optional[int]): The minimum number of speakers (default is None).
30 changes: 30 additions & 0 deletions src/senselab/audio/tasks/speaker_diarization/doc.md
@@ -0,0 +1,30 @@
# Speaker diarization

[![Tutorial](https://img.shields.io/badge/Tutorial-Click%20Here-blue?style=for-the-badge)](https://github.com/sensein/senselab/blob/main/tutorials/speaker_diarization.ipynb)

## Task Overview
Speaker diarization is the process of segmenting audio recordings by speaker labels, aiming to answer the question: **"Who spoke when?"**

## Models

In `senselab`, we integrate [pyannote.audio](https://github.com/pyannote/pyannote-audio) models for speaker diarization. These models can be explored on the [Hugging Face Hub](https://huggingface.co/pyannote). We may integrate additional approaches for speaker diarization into the package in the future.
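
Below is a minimal usage sketch (the audio path is a placeholder; per the tutorial, the model expects 16 kHz input):

```python
from senselab.audio.data_structures.audio import Audio
from senselab.audio.tasks.preprocessing.preprocessing import resample_audios
from senselab.audio.tasks.speaker_diarization import diarize_audios
from senselab.utils.data_structures.model import PyannoteAudioModel

# Load an audio file (placeholder path) and resample it to the 16 kHz expected by the model.
audio = Audio.from_filepath("path/to/audio.wav")
audio = resample_audios([audio], 16000)[0]

# Diarize with the pyannote pipeline; one list of ScriptLine segments is returned per input audio.
model = PyannoteAudioModel(path_or_uri="pyannote/speaker-diarization-3.1")
results = diarize_audios(audios=[audio], model=model)
```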

## Evaluation

### Metrics

The **Diarization Error Rate (DER)** is the standard metric for evaluating and comparing speaker diarization systems. It is defined as:
```text
DER = (false alarm + missed detection + confusion) / total
```
where
- `false alarm` is the duration of non-speech incorrectly classified as speech,
- `missed detection` is the duration of speech incorrectly classified as non-speech,
- `confusion` is the duration of speaker confusion, and
- `total` is the sum over all speakers of their reference speech duration.

**Note:** DER takes overlapping speech into account. This can lead to increased missed detection rates if the speaker diarization system does not include an overlapping speech detection module.
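
As a worked example of this formula, here is a quick computation with hypothetical durations (all values are made up for illustration):

```python
# Hypothetical durations (in seconds) for a fictitious evaluation.
false_alarm = 2.0       # non-speech scored as speech
missed_detection = 3.0  # speech scored as non-speech
confusion = 5.0         # speech attributed to the wrong speaker
total = 100.0           # total reference speech duration, summed over speakers

der = (false_alarm + missed_detection + confusion) / total
print(f"DER = {der:.2%}")  # DER = 10.00%
```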

### Benchmark

You can find a benchmark of the latest pyannote.audio model's performance on various time-stamped speech datasets [here](https://github.com/pyannote/pyannote-audio?tab=readme-ov-file#benchmark).
13 changes: 13 additions & 0 deletions src/senselab/audio/tasks/speaker_diarization/pyannote.py
@@ -1,5 +1,6 @@
"""This module implements the Pyannote Diarization task."""

import time
from typing import Dict, List, Optional, Union

import torch
@@ -8,6 +9,7 @@

from senselab.audio.data_structures.audio import Audio
from senselab.utils.data_structures.device import DeviceType, _select_device_and_dtype
from senselab.utils.data_structures.logging import logger
from senselab.utils.data_structures.model import PyannoteAudioModel
from senselab.utils.data_structures.script_line import ScriptLine

@@ -95,7 +97,15 @@ def _annotation_to_script_lines(annotation: Annotation) -> List[ScriptLine]:
            + str(expected_sample_rate)
        )

    # Record the start time of the model initialization
    start_time_model = time.time()
    pipeline = PyannoteDiarization._get_pyannote_diarization_pipeline(model=model, device=device)
    end_time_model = time.time()
    elapsed_time_model = end_time_model - start_time_model
    logger.info(f"Time taken to initialize the pyannote model: {elapsed_time_model:.2f} seconds")

    # Perform diarization
    start_time_diarization = time.time()
    results: List[List[ScriptLine]] = []
    for audio in audios:
        diarization = pipeline(
@@ -105,5 +115,8 @@ def _annotation_to_script_lines(annotation: Annotation) -> List[ScriptLine]:
            max_speakers=max_speakers,
        )
        results.append(_annotation_to_script_lines(diarization))
    end_time_diarization = time.time()
    elapsed_time_diarization = end_time_diarization - start_time_diarization
    logger.info(f"Time taken to perform diarization: {elapsed_time_diarization:.2f} seconds")

    return results
2 changes: 1 addition & 1 deletion src/senselab/audio/tasks/speech_enhancement/speechbrain.py
@@ -66,7 +66,7 @@ def enhance_audios_with_speechbrain(
"""
# Take the start time of the model initialization
start_time_model = time.time()
enhancer, device, dtype = cls._get_speechbrain_model(model=model, device=device)
enhancer, device, _ = cls._get_speechbrain_model(model=model, device=device)
end_time_model = time.time()
elapsed_time_model = end_time_model - start_time_model
logger.info(f"Time taken to initialize the speechbrain model: {elapsed_time_model:.2f} seconds")
3 changes: 3 additions & 0 deletions src/senselab/audio/tasks/voice_activity_detection/__init__.py
@@ -0,0 +1,3 @@
""".. include:: ./doc.md""" # noqa: D415

from .api import detect_human_voice_activity_in_audios # noqa: F401
46 changes: 46 additions & 0 deletions src/senselab/audio/tasks/voice_activity_detection/doc.md
@@ -0,0 +1,46 @@
# Voice Activity Detection (VAD)

[![Tutorial](https://img.shields.io/badge/Tutorial-Click%20Here-blue?style=for-the-badge)](https://github.com/sensein/senselab/blob/main/tutorials/voice_activity_detection.ipynb)

## Task Overview

Voice Activity Detection (VAD) is a binary classification task that identifies the presence of human voice in audio. The primary challenge in VAD lies in differentiating between noise and human voice, particularly in environments with significant background noise (e.g., fans, car engines). While VAD performs well in quiet environments where distinguishing between silence and speech is straightforward, the task becomes more difficult when background noise or non-standard speech patterns are present.

## Models

In `senselab`, we integrate [pyannote.audio](https://github.com/pyannote/pyannote-audio) models for VAD. These models can be explored on the [Hugging Face Hub](https://huggingface.co/pyannote). Additional approaches for VAD may be integrated into the package in the future.
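
Below is a minimal usage sketch, assuming the function mirrors the diarization API (a list of `Audio` objects in, segments out); the audio path and model choice are illustrative:

```python
from senselab.audio.data_structures.audio import Audio
from senselab.audio.tasks.voice_activity_detection import detect_human_voice_activity_in_audios
from senselab.utils.data_structures.model import PyannoteAudioModel

# Assumption: the call signature mirrors diarize_audios (audios, model, device, ...).
audio = Audio.from_filepath("path/to/audio.wav")  # placeholder path
model = PyannoteAudioModel(path_or_uri="pyannote/speaker-diarization-3.1")  # illustrative model choice
results = detect_human_voice_activity_in_audios(audios=[audio], model=model)
```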

## Evaluation

### Metrics

The primary metrics used to evaluate VAD modules are Detection Error Rate (DER) and Detection Cost Function (DCF); a small worked example follows the definitions below.

- **Detection Error Rate (DER):**

```text
DER = (false alarm + missed detection) / total duration of speech in reference
```

- **False alarm:** Duration of non-speech incorrectly classified as speech.
- **Missed detection:** Duration of speech incorrectly classified as non-speech.
- **Total:** Total duration of speech in the reference.

- **Detection Cost Function (DCF):**

```text
DCF = 0.25 * false alarm rate + 0.75 * miss rate
```

- **False alarm rate:** Proportion of non-speech incorrectly classified as speech.
- **Miss rate:** Proportion of speech incorrectly classified as non-speech.
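
As a worked example of both formulas, with made-up durations:

```python
# Hypothetical durations (in seconds) for a fictitious evaluation.
false_alarm = 1.5       # non-speech scored as speech
missed_detection = 2.5  # speech scored as non-speech
total_speech = 80.0     # total speech duration in the reference
total_nonspeech = 40.0  # total non-speech duration in the reference

der = (false_alarm + missed_detection) / total_speech
dcf = 0.25 * (false_alarm / total_nonspeech) + 0.75 * (missed_detection / total_speech)
print(f"DER = {der:.2%}")  # DER = 5.00%
print(f"DCF = {dcf:.2%}")  # DCF = 3.28%
```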

### Additional Metrics

VAD systems may also be evaluated using the following metrics (a minimal computation sketch follows below):

- **Accuracy:** Proportion of the input signal correctly classified.
- **Precision:** Proportion of detected speech that is actually speech.
- **Recall:** Proportion of speech that is correctly detected.

For more detailed information on these metrics, refer to the [pyannote.metrics documentation](https://pyannote.github.io/pyannote-metrics/reference.html).
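
A minimal sketch of computing these metrics at the frame level with scikit-learn (the per-frame labels are made up, and scikit-learn is used here for illustration rather than being a `senselab` requirement):

```python
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Hypothetical per-frame binary labels: 1 = speech, 0 = non-speech.
reference = np.array([0, 1, 1, 1, 0, 0, 1, 1])
predicted = np.array([0, 1, 1, 0, 0, 1, 1, 1])

print("Accuracy: ", accuracy_score(reference, predicted))   # fraction of frames classified correctly
print("Precision:", precision_score(reference, predicted))  # fraction of detected speech that is speech
print("Recall:   ", recall_score(reference, predicted))     # fraction of reference speech that is detected
```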
56 changes: 56 additions & 0 deletions src/senselab/utils/tasks/plotting.py
@@ -1,5 +1,8 @@
"""This module implements plotting methods for utilities."""

from typing import List

import matplotlib.cm as cm
import matplotlib.pyplot as plt

from senselab.utils.data_structures.script_line import ScriptLine
@@ -52,3 +55,56 @@ def plot_transcript(transcript: ScriptLine) -> None:

    # Show the plot
    plt.show()


def plot_segment(segments: List[ScriptLine]) -> None:
    """Plots the segments of a transcript over time.

    Args:
        segments (List[ScriptLine]): The segments to plot, each with start and end times and a speaker label.

    Returns:
        None

    Todo:
        - Add option to save the plot
        - Add option to choose the size of the figure
        - Add check if transcript contains segments with time information
    """
    start_times = []
    end_times = []
    labels = []

    for segment in segments:
        # Ensure that segments have start and end times and a label
        if segment.start is None or segment.end is None or segment.speaker is None:
            raise ValueError("Each segment must have start and end times and a label.")
        else:
            start_times.append(segment.start)
            end_times.append(segment.end)
            labels.append(segment.speaker)

    # Create a figure and axis
    _, ax = plt.subplots(figsize=(12, 6))

    # Create a color map based on unique labels
    unique_labels = list(set(labels))
    color_map = cm.get_cmap("tab10", len(unique_labels))  # 'tab10' provides 10 distinct colors
    label_to_color = {label: color_map(i) for i, label in enumerate(unique_labels)}
    label_to_y_value = {label: i for i, label in enumerate(unique_labels)}  # Assign y-value based on label index

    # Plot each segment and add text label with color
    for i, label in enumerate(labels):
        color = label_to_color[label]
        y_value = label_to_y_value[label]  # Get y-value based on label
        ax.plot([start_times[i], end_times[i]], [y_value, y_value], marker="o", color=color, linewidth=2)
        ax.text((start_times[i] + end_times[i]) / 2, y_value, label, ha="center", va="bottom", color=color)

    # Setting labels and title
    ax.set_yticks(range(len(unique_labels)))
    ax.set_yticklabels(unique_labels)
    ax.set_xlabel("Time (seconds)")
    ax.set_title("Segment Visualization Over Time")

    # Show the plot
    plt.show()
115 changes: 115 additions & 0 deletions tutorials/speaker_diarization.ipynb
@@ -0,0 +1,115 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Speaker diarization\n",
"\n",
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/sensein/senselab/blob/main/tutorials/speaker_diarization.ipynb)\n",
"\n",
"This tutorial demonstrates how to use the `diarize_audios` function to perform speaker diarization on some audio files, which means to segment the audio into multiple speakers."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Import necessary modules\n",
"from senselab.audio.data_structures.audio import Audio\n",
"from senselab.audio.tasks.speaker_diarization import diarize_audios\n",
"from senselab.utils.data_structures.model import PyannoteAudioModel\n",
"from senselab.utils.data_structures.device import DeviceType\n",
"from senselab.audio.tasks.plotting.plotting import play_audio\n",
"from senselab.audio.tasks.preprocessing.preprocessing import resample_audios\n",
"from senselab.utils.tasks.plotting import plot_segment"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Initialize a PyannoteAudioModel for speaker diarization, providing the model's path or URI.\n",
"model = PyannoteAudioModel(path_or_uri=\"pyannote/speaker-diarization-3.1\")\n",
"\n",
"# Specify the device type to be used for processing (CPU in this case).\n",
"device = DeviceType.CPU"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load an audio file from the specified file path into an Audio object.\n",
"audio = Audio.from_filepath(\"../src/tests/data_for_testing/audio_48khz_mono_16bits.wav\")\n",
"\n",
"# Resample the audio to 16kHz, as this is the expected input format for the model.\n",
"# The resample_audios function returns a list, so we take the first (and only) element.\n",
"audio = resample_audios([audio], 16000)[0]\n",
"\n",
"# Play the resampled audio to verify the preprocessing step was successful.\n",
"play_audio(audio)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Perform speaker diarization on the audio using the specified model and device.\n",
"# The function returns a list of results, where each element corresponds to an audio segment.\n",
"results = diarize_audios(audios=[audio], model=model, device=device)\n",
"\n",
"# Print the results of speaker diarization to the console.\n",
"print(results)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Plot the detected speakers for visualization.\n",
"plot_segment(results[0])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Ehm wait**. In the audio, we can hear four speakers, but the speaker diarization results indicate only two speakers. Why is this happening?\n",
"\n",
"Unfortunately, the model is not perfect and can make mistakes. We can try adjusting the parameters by setting `num_speakers=4`, `min_speakers=4`, and `max_speakers=4` to force the model to recognize four speakers. However, this approach doesn't always work as expected."
]
}
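,
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# As suggested above, hint the expected number of speakers to the pipeline.\n",
"# (Illustrative sketch: the output may still not match the true speaker count.)\n",
"results = diarize_audios(\n",
"    audios=[audio], model=model, device=device, num_speakers=4, min_speakers=4, max_speakers=4\n",
")\n",
"plot_segment(results[0])"
]
}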
],
"metadata": {
"kernelspec": {
"display_name": "senselab-lOUhtavG-py3.10",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}