pyannote · hbredin · Sep 18, 2023 · May 5, 2023 · May 9, 2023 · May 15, 2023
diff --git a/pyannote/audio/models/blocks/selfsup.py b/pyannote/audio/models/blocks/selfsup.py
@@ -0,0 +1,70 @@
+# MIT License
+#
+# Copyright (c) 2020 CNRS
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import AutoModel, Wav2Vec2FeatureExtractor, AutoConfig
+
+class SelfSupModel(nn.Module):
+    """Frozen self-supervised feature extractor loaded from HuggingFace.
+
+    Loads a pretrained model through ``transformers.AutoModel`` with
+    ``output_hidden_states`` enabled, puts it in eval mode, and returns the
+    hidden states of one selected layer. The wrapped model is always called
+    under ``torch.no_grad()``, so no gradient flows through it.
+
+    Parameters
+    ----------
+    model : str
+        HuggingFace model identifier (e.g. "microsoft/wavlm-base").
+    layer : int, optional
+        Index of the layer whose hidden states are returned.
+        Defaults to 0 when None.
+    cache : str, optional
+        Directory passed as ``cache_dir`` when loading the configuration,
+        the model and the feature extractor. Defaults to None.
+
+    Attributes
+    ----------
+    feat_size : int
+        ``hidden_size`` of the loaded configuration.
+    """
+
+    def __init__(self, model, layer=None, cache=None):
+        super().__init__()
+        self.model = model
+        print("\nThe selected Self-Supervised Model from HuggingFace is "+ model+".\n")
+
+        # Overwrite the class name with that of the selected model.
+        # NOTE: this renames the class itself, not just this instance.
+        # `[-1]` (rather than `[1]`) keeps identifiers without "/" working.
+        SelfSupModel.__name__ = model.rsplit("/", 1)[-1]
+
+        if cache is not None:
+            print("Model and configuration file location is : "+str(cache))
+            config = AutoConfig.from_pretrained(model, cache_dir=cache)
+            config.cache_dir = cache
+        else:
+            config = AutoConfig.from_pretrained(model)
+
+        # Required so that forward() can pick an intermediate layer.
+        config.output_hidden_states = True
+
+        # Load the pretrained model and switch it to inference mode.
+        # NOTE(review): a later `.train()` call on a parent module would
+        # presumably put this submodule back in training mode -- confirm
+        # whether that is intended for a frozen extractor.
+        self.ssl_model = AutoModel.from_pretrained(
+            model, config=config, cache_dir=cache
+        )
+        self.ssl_model.eval()
+
+        # Encoder feature size, used by callers to size downstream layers.
+        self.feat_size = config.hidden_size
+
+        # Only `do_normalize` is read from the feature extractor (see forward).
+        # Use the same cache directory as the configuration and the model.
+        self.processor = Wav2Vec2FeatureExtractor.from_pretrained(
+            model, return_tensors="pt", cache_dir=cache
+        )
+
+        if layer is None:
+            print("\nLayer number not specified. Default to the first one (layer 0).\n")
+            self.layer = 0
+        else:
+            self.layer = layer
+            print("\nSelected frozen layer is "+ str(layer) +". \n")
+
+    def forward(self, waveforms: torch.Tensor) -> torch.Tensor:
+        """Extract the hidden states of the selected layer.
+
+        Parameters
+        ----------
+        waveforms : (batch, channel, sample)
+            Dimension 1 is squeezed out, which only has an effect when it
+            has size 1 (mono audio).
+
+        Returns
+        -------
+        outputs : torch.Tensor
+            ``hidden_states[self.layer + 1]`` of the wrapped model;
+            presumably (batch, frame, feature) -- confirm against the
+            HuggingFace model output documentation.
+        """
+        # (batch, channel, sample) -> (batch, sample)
+        waveforms = torch.squeeze(waveforms, 1)
+
+        if self.processor.do_normalize:
+            # Zero-mean / unit-variance normalization of each waveform over
+            # its own time axis, so that samples of a batch do not influence
+            # one another.
+            waveforms = F.layer_norm(waveforms, waveforms.shape[-1:])
+
+        # The pretrained model is frozen: no gradient is tracked.
+        with torch.no_grad():
+            features = self.ssl_model(waveforms)
+
+        # The + 1 offset skips the first entry of `hidden_states`.
+        return features.hidden_states[self.layer + 1]
diff --git a/pyannote/audio/models/segmentation/PyanHugg.py b/pyannote/audio/models/segmentation/PyanHugg.py
@@ -0,0 +1,211 @@
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from pyannote.core.utils.generators import pairwise
+
+from pyannote.audio.core.model import Model
+from pyannote.audio.core.task import Task
+from pyannote.audio.models.blocks.sincnet import SincNet
+from pyannote.audio.models.blocks.selfsup import SelfSupModel
+from pyannote.audio.utils.params import merge_dict
+
+
+class PyanHugg(Model):
+    """PyanHugg segmentation model
+
+    Self-Supervised Model (or SincNet if specified) > LSTM > Feed forward > Classifier
+
+    All HuggingFace Self-Sup. models can be found at https://huggingface.co/models
+    Tested (and currently working) models are:
+     - "microsoft/wavlm-base"
+     - "microsoft/wavlm-large"
+     - "facebook/hubert-base-ls960"
+     - "facebook/wav2vec2-base-960h"
+
+    Parameters
+    ----------
+    selfsupervised : dict, optional
+        Keyword arguments passed to the self-supervised block.
+        Defaults to {"model": "microsoft/wavlm-base", "layer": 4, "cache": None}.
+        If "model" is set to "sincnet", the SincNet block is used instead.
+    sincnet : dict, optional
+        Keyword arguments passed to the SincNet block.
+        Defaults to {"stride": 10}. "sample_rate" is always set from the
+        `sample_rate` parameter.
+    lstm : dict, optional
+        Keyword arguments passed to the LSTM layer.
+        Defaults to {"hidden_size": 128, "num_layers": 2, "bidirectional": True,
+        "monolithic": True, "dropout": 0.0}, i.e. two bidirectional layers with
+        128 units each. "batch_first" is always set to True.
+        Set "monolithic" to False to split monolithic multi-layer LSTM into
+        multiple mono-layer LSTMs. This may prove useful for probing LSTM
+        internals.
+    linear : dict, optional
+        Keyword arguments used to initialize linear layers.
+        Defaults to {"hidden_size": 128, "num_layers": 2},
+        i.e. two linear layers with 128 units each.
+    sample_rate : int, optional
+        Audio sample rate. Defaults to 16kHz (16000).
+    num_channels : int, optional
+        Number of channels. Defaults to mono (1).
+    task : Task, optional
+        Task forwarded to the parent `Model` constructor.
+    """
+
+    SINCNET_DEFAULTS = {"stride": 10}
+    SSL_DEFAULTS = {
+        "model": "microsoft/wavlm-base",
+        "layer": 4,
+        "cache": None,
+    }
+    LSTM_DEFAULTS = {
+        "hidden_size": 128,
+        "num_layers": 2,
+        "bidirectional": True,
+        "monolithic": True,
+        "dropout": 0.0,
+    }
+    LINEAR_DEFAULTS = {"hidden_size": 128, "num_layers": 2}
+
+    def __init__(
+        self,
+        selfsupervised: Optional[dict] = None,
+        sincnet: Optional[dict] = None,
+        lstm: Optional[dict] = None,
+        linear: Optional[dict] = None,
+        sample_rate: int = 16000,
+        num_channels: int = 1,
+        task: Optional[Task] = None,
+    ):
+
+        super().__init__(sample_rate=sample_rate, num_channels=num_channels, task=task)
+
+        # Complete user-provided dictionaries with the class-level defaults.
+        selfsupervised = merge_dict(self.SSL_DEFAULTS, selfsupervised)
+        sincnet = merge_dict(self.SINCNET_DEFAULTS, sincnet)
+        sincnet["sample_rate"] = sample_rate
+        lstm = merge_dict(self.LSTM_DEFAULTS, lstm)
+        lstm["batch_first"] = True
+        linear = merge_dict(self.LINEAR_DEFAULTS, linear)
+
+        # Only the hyper-parameters of the front-end actually in use are saved.
+        # Strings are compared by value (==), never by identity (is).
+        if selfsupervised["model"] == "sincnet":
+            self.save_hyperparameters("sincnet", "lstm", "linear")
+        else:
+            self.save_hyperparameters("selfsupervised", "lstm", "linear")
+
+        # Name of the front-end; forward() uses it to choose between blocks.
+        self.model = selfsupervised["model"]
+
+        # All HuggingFace Self-Sup. models can be found at https://huggingface.co/models
+        print("\n##################################################################")
+        if self.model != "sincnet":
+            print("### A self-supervised model is used for the feature extraction ###")
+            print("##################################################################")
+            self.selfsupervised = SelfSupModel(**self.hparams.selfsupervised)
+            feat_size = self.selfsupervised.feat_size
+        else:
+            self.sincnet = SincNet(**self.hparams.sincnet)
+            print("###   The SincNet module is used for the feature extraction    ### ")
+            # presumably the feature dimension produced by SincNet -- TODO confirm
+            feat_size = 60
+        print("##################################################################\n")
+
+        # Number of features produced by each LSTM layer.
+        lstm_out_features: int = lstm["hidden_size"] * (
+            2 if lstm["bidirectional"] else 1
+        )
+
+        if lstm["monolithic"]:
+            # "monolithic" is not an nn.LSTM argument: drop it before forwarding.
+            multi_layer_lstm = dict(lstm)
+            del multi_layer_lstm["monolithic"]
+            self.lstm = nn.LSTM(feat_size, **multi_layer_lstm)
+
+        else:
+            num_layers = lstm["num_layers"]
+            if num_layers > 1:
+                # Applied manually between the mono-layer LSTMs in forward().
+                self.dropout = nn.Dropout(p=lstm["dropout"])
+
+            one_layer_lstm = dict(lstm)
+            one_layer_lstm["num_layers"] = 1
+            one_layer_lstm["dropout"] = 0.0
+            del one_layer_lstm["monolithic"]
+
+            # The first layer consumes front-end features; the following ones
+            # consume the output of the previous LSTM layer.
+            self.lstm = nn.ModuleList(
+                [
+                    nn.LSTM(
+                        feat_size if i == 0 else lstm_out_features,
+                        **one_layer_lstm
+                    )
+                    for i in range(num_layers)
+                ]
+            )
+
+        # No feed-forward layers requested: `self.linear` is not created.
+        if linear["num_layers"] < 1:
+            return
+
+        layer_sizes = [lstm_out_features] + [linear["hidden_size"]] * linear[
+            "num_layers"
+        ]
+        self.linear = nn.ModuleList(
+            [
+                nn.Linear(in_features, out_features)
+                for in_features, out_features in pairwise(layer_sizes)
+            ]
+        )
+
+    def build(self):
+        """Create the final classifier and activation.
+
+        The classifier input size is the size of the last linear layer when
+        there is one, and the LSTM output size otherwise. Its output size is
+        read from `self.specifications`.
+        """
+
+        if self.hparams.linear["num_layers"] > 0:
+            in_features = self.hparams.linear["hidden_size"]
+        else:
+            in_features = self.hparams.lstm["hidden_size"] * (
+                2 if self.hparams.lstm["bidirectional"] else 1
+            )
+
+        if self.specifications.powerset:
+            out_features = self.specifications.num_powerset_classes
+        else:
+            out_features = len(self.specifications.classes)
+
+        self.classifier = nn.Linear(in_features, out_features)
+        self.activation = self.default_activation()
+
+    def forward(self, waveforms: torch.Tensor) -> torch.Tensor:
+        """Pass forward
+
+        Parameters
+        ----------
+        waveforms : (batch, channel, sample)
+
+        Returns
+        -------
+        scores : (batch, frame, classes)
+        """
+        if self.model == "sincnet":
+            # SincNet output is rearranged to the batch-first layout used by
+            # the LSTM; the self-supervised output is fed to the LSTM as is.
+            outputs = rearrange(
+                self.sincnet(waveforms), "batch feature frame -> batch frame feature"
+            )
+        else:
+            outputs = self.selfsupervised(waveforms)
+
+        if self.hparams.lstm["monolithic"]:
+            outputs, _ = self.lstm(outputs)
+        else:
+            num_layers = self.hparams.lstm["num_layers"]
+            for i, lstm in enumerate(self.lstm):
+                outputs, _ = lstm(outputs)
+                # Dropout between layers only, not after the last one.
+                if i + 1 < num_layers:
+                    outputs = self.dropout(outputs)
+
+        if self.hparams.linear["num_layers"] > 0:
+            for linear in self.linear:
+                outputs = F.leaky_relu(linear(outputs))
+
+        return self.activation(self.classifier(outputs))
diff --git a/pyannote/audio/models/segmentation/__init__.py b/pyannote/audio/models/segmentation/__init__.py
@@ -21,5 +21,6 @@
 # SOFTWARE.
 
 from .PyanNet import PyanNet
+from .PyanHugg import PyanHugg
 
-__all__ = ["PyanNet"]
+__all__ = ["PyanNet", "PyanHugg"]