Release 0.1.2 #47
Changes from 34 commits
pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "senselab"
-version = "0.1.2.dev23+c952ded"
+version = "0.0.1"
 description = "Senselab is a Python package that simplifies building pipelines for speech and voice analysis."
 authors = [
     "Fabio Catania <[email protected]>",
@@ -47,6 +47,8 @@ opensmile = "^2.5.0"
 audiomentations = "^0.35.0"
 torch-audiomentations = "^0.11.1"
 sentence-transformers = "^2.7.0"
+jiwer = "^3.0.4"
+speechbrain = "^1.0.0"

 [tool.poetry.group.dev]
 optional = true
@@ -79,6 +81,9 @@ testpaths = [

 [tool.mypy]
 ignore_missing_imports = true
+plugins = [
+    "pydantic.mypy"
+]

 [tool.ruff]
 exclude = [
@@ -104,7 +109,7 @@ exclude = [
     "node_modules",
     "venv"
 ]
-line-length = 80
+line-length = 120
 indent-width = 4
 src = ["src"]
 target-version = "py310"
@@ -140,10 +145,10 @@ pattern = "default-unprefixed"

 [tool.codespell]
 skip = [
-    "./poetry.lock",
-    "./docs_style/pdoc-theme/syntax-highlighting.css"
+    "poetry.lock",
+    "docs_style/pdoc-theme/syntax-highlighting.css"
 ]
-ignore-words-list = ["senselab"]
+ignore-words-list = ["senselab", "nd", "astroid", "wil"]

 [build-system]
 requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.0.0,<2.0.0"]
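Note on the version change: since the build system requires poetry-dynamic-versioning (see the [build-system] block and the "default-unprefixed" pattern above), the version field in pyproject.toml is presumably just a placeholder that gets overwritten from the git tag at build time, which would explain resetting the accidentally committed "0.1.2.dev23+c952ded" value back to "0.0.1".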
(audio data augmentation module)

@@ -1,44 +1,92 @@
 """This module implements some utilities for audio data augmentation."""

-from typing import Any, Dict
+from typing import Any, Dict, List, Union

 import torch
 from datasets import Dataset
 from torch_audiomentations import Compose

+from senselab.utils.data_structures.audio import (
+    Audio,
+    batch_audios,
+    unbatch_audios,
+)
+from senselab.utils.device import DeviceType, _select_device_and_dtype
 from senselab.utils.tasks.input_output import (
     _from_dict_to_hf_dataset,
     _from_hf_dataset_to_dict,
 )


-def augment_hf_dataset(
-    dataset: Dict[str, Any], augmentation: Compose, audio_column: str = "audio"
-) -> Dict[str, Any]:
+def augment_audio_dataset(
+    audios: List[Audio], augmentation: Compose, device_options: Union[DeviceType, List[DeviceType]] = [DeviceType.CPU]
+) -> List[Audio]:
+    """Augments all provided audios with a given augmentation, either individually or all batched together.
+
+    Augment all audios with a user-defined augmentation that can be a composition of multiple augmentations. This
+    augmentation is either performed on each audio individually or all of the audios provided are batched together
+    and run at once. NOTE: if batching, all audios must have the same sampling rate.
+
+    Args:
+        audios: List of Audios whose data will be augmented with the given augmentations
+        augmentation: A Composition of augmentations to run on each audio (uses torch-audiomentations); should
+            have its output_type set to "dict"
+        device_options: The device, or a List of possible devices, to use for augmenting. If the chosen device
+            is MPS or CUDA then the audios are all batched together, so for optimal performance, batching should
+            be done by passing a batch_size worth of audios at a time
+
+    Returns:
+        List of audios produced by passing all of the input audios through the provided augmentation. This does
+        not necessarily mean that the augmentation has been run on every audio. For more information,
+        see the torch-audiomentations documentation.
+    """
+    augmentation.output_type = "dict"
+    new_audios = []
+    device_type, dtype = _select_device_and_dtype(
+        device_options if isinstance(device_options, List) else [device_options]
+    )
+    if device_type == DeviceType.CPU:
+        for audio in audios:
+            audio_to_augment = audio.waveform.unsqueeze(0)
+            augmented_audio = augmentation(audio_to_augment, sample_rate=audio.sampling_rate).samples
+            new_audios.append(
+                Audio(
+                    waveform=torch.squeeze(augmented_audio),
+                    sampling_rate=audio.sampling_rate,
+                    metadata=audio.metadata.copy(),
+                    orig_path_or_id=audio.orig_path_or_id,
+                )
+            )
+    else:
+        batched_audios, sampling_rates, metadatas = batch_audios(audios)
+
+        batched_audios = batched_audios.to(device=torch.device(str(device_type)), dtype=dtype)
+        sampling_rate = sampling_rates[0] if isinstance(sampling_rates, List) else sampling_rates
+        augmented_audio = augmentation(batched_audios, sample_rate=sampling_rate).samples
+
+        augmented_audio = augmented_audio.detach().cpu()
+        return unbatch_audios(augmented_audio, sampling_rates, metadatas)
+
+    return new_audios

Review comment (on the _select_device_and_dtype call): How do you manage the scenario when the developer wants to use a device which is not supported? This is the question you asked me.

Reply: I don't.
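The thread above asks about fallback behavior when a requested device is unavailable. For illustration only, here is a minimal sketch of how a device-selection helper could degrade gracefully. The real _select_device_and_dtype is not part of this diff; the CUDA/MPS enum members are inferred from the docstring above, and the half-precision choice on CUDA is purely an assumption.

    from typing import List, Tuple

    import torch

    from senselab.utils.device import DeviceType  # enum referenced in the diff above


    def select_device_and_dtype_sketch(options: List[DeviceType]) -> Tuple[DeviceType, torch.dtype]:
        """Hypothetical fallback logic: return the first requested device that is usable."""
        for option in options:
            if option == DeviceType.CUDA and torch.cuda.is_available():
                return DeviceType.CUDA, torch.float16  # assumption: half precision on GPU
            if option == DeviceType.MPS and torch.backends.mps.is_available():
                return DeviceType.MPS, torch.float32
            if option == DeviceType.CPU:
                return DeviceType.CPU, torch.float32
        # none of the requested devices is supported on this machine: fall back to CPU
        return DeviceType.CPU, torch.float32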
(diff continues)

+
+
+def augment_hf_dataset(dataset: Dict[str, Any], augmentation: Compose, audio_column: str = "audio") -> Dict[str, Any]:
     """Resamples a Hugging Face `Dataset` object."""
     hf_dataset = _from_dict_to_hf_dataset(dataset)

-    def _augment_hf_row(
-        row: Dataset, augmentation: Compose, audio_column: str
-    ) -> Dict[str, Any]:
+    def _augment_hf_row(row: Dataset, augmentation: Compose, audio_column: str) -> Dict[str, Any]:
         waveform = row[audio_column]["array"]
         sampling_rate = row[audio_column]["sampling_rate"]

         # Ensure waveform is a PyTorch tensor
         if not isinstance(waveform, torch.Tensor):
             waveform = torch.tensor(waveform)
         if waveform.dim() == 1:
-            waveform = waveform.unsqueeze(0).unsqueeze(
-                0
-            )  # [num_samples] -> [1, 1, num_samples]
+            waveform = waveform.unsqueeze(0).unsqueeze(0)  # [num_samples] -> [1, 1, num_samples]
         elif waveform.dim() == 2:
-            waveform = waveform.unsqueeze(
-                1
-            )  # [batch_size, num_samples] -> [batch_size, 1, num_samples]
+            waveform = waveform.unsqueeze(1)  # [batch_size, num_samples] -> [batch_size, 1, num_samples]

-        augmented_hf_row = augmentation(
-            waveform, sample_rate=sampling_rate
-        ).squeeze()
+        augmented_hf_row = augmentation(waveform, sample_rate=sampling_rate).squeeze()

         return {
             "augmented_audio": {
@@ -47,8 +95,6 @@ def _augment_hf_row(
             }
         }

-    augmented_hf_dataset = hf_dataset.map(
-        lambda x: _augment_hf_row(x, augmentation, audio_column)
-    )
+    augmented_hf_dataset = hf_dataset.map(lambda x: _augment_hf_row(x, augmentation, audio_column))
     augmented_hf_dataset = augmented_hf_dataset.remove_columns([audio_column])
     return _from_hf_dataset_to_dict(augmented_hf_dataset)
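For readers skimming the diff, a minimal usage sketch of the new augment_audio_dataset. The Audio constructor arguments mirror the fields used in the diff above, but the module's import path is not visible in this PR, so treat the setup as illustrative.

    import torch
    from torch_audiomentations import Compose, Gain, PolarityInversion

    from senselab.utils.data_structures.audio import Audio
    from senselab.utils.device import DeviceType

    # One second of random single-channel audio at 16 kHz
    audio = Audio(
        waveform=torch.rand(1, 16000),  # [num_channels, num_samples]
        sampling_rate=16000,
        metadata={},
        orig_path_or_id="example.wav",
    )

    # Compose comes from torch-audiomentations; output_type="dict" is what the
    # function expects (it also sets this attribute itself, defensively).
    augmentation = Compose(
        transforms=[Gain(p=0.5), PolarityInversion(p=0.5)],
        output_type="dict",
    )

    # import augment_audio_dataset from the module shown in the diff above
    augmented = augment_audio_dataset([audio], augmentation, device_options=[DeviceType.CPU])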
Review comment: In general, if we create some workflows, it makes sense to have a _pydra version of the scripts. Otherwise, we can probably remove marking functions as pydra tasks, or we can do it in the same .py file. Will leave this here for our discussion later today @wilke0818
(pydra preprocessing API module)

@@ -1,6 +1,17 @@
 """This module defines a pydra API for the preprocessing task."""

 import pydra

-from senselab.audio.tasks.preprocessing import resample_hf_dataset
+from senselab.audio.tasks.preprocessing import (
+    chunk_audios,
+    downmix_audios_to_mono,
+    resample_audios,
+    resample_hf_dataset,
+    select_channel_from_audios,
+)

+resample_audios_pt = pydra.mark.task(resample_audios)
+downmix_audios_to_mono_pt = pydra.mark.task(downmix_audios_to_mono)
+chunk_audios_pt = pydra.mark.task(chunk_audios)
 resample_hf_dataset_pt = pydra.mark.task(resample_hf_dataset)
+select_channel_from_audios_pt = pydra.mark.task(select_channel_from_audios)
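For context, pydra.mark.task (used above to build the *_pt wrappers) turns a plain function into a lazily executed task. A generic, self-contained illustration, not senselab code:

    import pydra


    @pydra.mark.task
    def add(a: int, b: int) -> int:
        return a + b


    task = add(a=1, b=2)  # building the task does not run it yet
    task()  # execute
    print(task.result().output.out)  # -> 3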
Review comment: Can we say "audios" instead of "audio_dataset" since it's a list of audios for how it is rn?

Review comment: Also, not urgent: for the very good documentation that we will write, it's a good idea to include some suggestions for parameters to use for the different types of augmentations.
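To make that last suggestion concrete, such documentation could pair each augmentation with a starting parameter range. The values below are plausible starting points for speech, assumed for illustration rather than taken from this PR:

    from torch_audiomentations import AddColoredNoise, Compose, Gain, PitchShift

    augmentation = Compose(
        transforms=[
            # mild loudness jitter, roughly +/- 6 dB
            Gain(min_gain_in_db=-6.0, max_gain_in_db=6.0, p=0.5),
            # small pitch shifts keep speech intelligible
            PitchShift(min_transpose_semitones=-2.0, max_transpose_semitones=2.0, p=0.3, sample_rate=16000),
            # additive noise at moderate signal-to-noise ratios
            AddColoredNoise(min_snr_in_db=10.0, max_snr_in_db=30.0, p=0.3),
        ],
        output_type="dict",
    )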