From 6d3af2e209b4543a886935807bffaa0175f36759 Mon Sep 17 00:00:00 2001 From: SevKod Date: Fri, 5 May 2023 15:00:57 +0200 Subject: [PATCH 01/17] add WaVLM-Base model to PyanNet.py in replacement of SincNet Added WavLM-Base model which replaces the SincNet feature extraction model within the PyanNet architecture (loaded outside of the class from HuggingFace.co). --- pyannote/audio/models/segmentation/PyanNet.py | 51 ++++++++++++++----- 1 file changed, 39 insertions(+), 12 deletions(-) diff --git a/pyannote/audio/models/segmentation/PyanNet.py b/pyannote/audio/models/segmentation/PyanNet.py index 1b68a32a9..10991a1f6 100644 --- a/pyannote/audio/models/segmentation/PyanNet.py +++ b/pyannote/audio/models/segmentation/PyanNet.py @@ -34,6 +34,20 @@ from pyannote.audio.models.blocks.sincnet import SincNet from pyannote.audio.utils.params import merge_dict +##WAVLM_BASE +#Requires to pass the PyanNet model to cuda during training script + +#Model is loaded outside of the PyanNet class + +from transformers import AutoModel + +#Loading the model from HuggingFace (requires git lfs to load the .bin checkpoint) +#model = AutoModel.from_pretrained('/content/drive/MyDrive/PyanNet/wavlm-base') + +model = AutoModel.from_pretrained('microsoft/wavlm-base') + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +model.to(device) #Pass the model to the gpu (supposing that accelerator = gpu in the TorchLightning Trainer) class PyanNet(Model): """PyanNet segmentation model @@ -62,6 +76,7 @@ class PyanNet(Model): """ SINCNET_DEFAULTS = {"stride": 10} + LSTM_DEFAULTS = { "hidden_size": 128, "num_layers": 2, @@ -91,13 +106,13 @@ def __init__( self.save_hyperparameters("sincnet", "lstm", "linear") self.sincnet = SincNet(**self.hparams.sincnet) - + + monolithic = lstm["monolithic"] if monolithic: multi_layer_lstm = dict(lstm) del multi_layer_lstm["monolithic"] - self.lstm = nn.LSTM(60, **multi_layer_lstm) - + self.lstm = nn.LSTM(512, **multi_layer_lstm) else: num_layers = lstm["num_layers"] if num_layers > 1: @@ -111,7 +126,7 @@ def __init__( self.lstm = nn.ModuleList( [ nn.LSTM( - 60 + 512 if i == 0 else lstm["hidden_size"] * (2 if lstm["bidirectional"] else 1), **one_layer_lstm @@ -167,22 +182,34 @@ def forward(self, waveforms: torch.Tensor) -> torch.Tensor: ------- scores : (batch, frame, classes) """ - - outputs = self.sincnet(waveforms) - + #outputs = self.sincnet(waveforms) + + #WavLM feature extraction + + waveforms = torch.squeeze(waveforms,1) #waveforms : (batch, channel, sample) -> (batch,sample) + with torch.no_grad(): + feat = model(waveforms) #Compute the features and extract last hidden layer weights + + outputs = feat.extract_features #Get the features : outputs : (batch, frame, feature) + if self.hparams.lstm["monolithic"]: - outputs, _ = self.lstm( - rearrange(outputs, "batch feature frame -> batch frame feature") - ) + #No need to rearrange the output, as the features are already structured in (batch frame feature) + + #outputs, _ = self.lstm( + # rearrange(outputs, "batch feature frame -> batch frame feature")) + outputs, _ = self.lstm(outputs) + else: - outputs = rearrange(outputs, "batch feature frame -> batch frame feature") + #outputs = rearrange(outputs, "batch feature frame -> batch frame feature").cuda() for i, lstm in enumerate(self.lstm): outputs, _ = lstm(outputs) if i + 1 < self.hparams.lstm["num_layers"]: outputs = self.dropout(outputs) - + + if self.hparams.linear["num_layers"] > 0: for linear in self.linear: outputs = F.leaky_relu(linear(outputs)) return 
self.activation(self.classifier(outputs)) + From d03906bbc1372c93c04b9cae5f5e98d713f89926 Mon Sep 17 00:00:00 2001 From: SevKod Date: Tue, 9 May 2023 09:16:19 +0200 Subject: [PATCH 02/17] implement wavlm inside PyanNet class and add wavlm block --- pyannote/audio/models/blocks/wavlm.py | 44 +++++++++ pyannote/audio/models/segmentation/PyanNet.py | 90 ++++++------------- 2 files changed, 71 insertions(+), 63 deletions(-) create mode 100644 pyannote/audio/models/blocks/wavlm.py diff --git a/pyannote/audio/models/blocks/wavlm.py b/pyannote/audio/models/blocks/wavlm.py new file mode 100644 index 000000000..5c79cb286 --- /dev/null +++ b/pyannote/audio/models/blocks/wavlm.py @@ -0,0 +1,44 @@ +# MIT License +# +# Copyright (c) 2020 CNRS +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from transformers import AutoModel + +class WavLM(nn.Module): + + def __init__(self): + super().__init__() + + self.wvlm = AutoModel.from_pretrained('microsoft/wavlm-base') #Load the model + + def forward(self, waveforms: torch.Tensor) -> torch.Tensor: + + waveforms = torch.squeeze(waveforms,1) #waveforms : (batch, channel, sample) -> (batch,sample) + with torch.no_grad(): + outputs = self.wvlm(waveforms).extract_features #Compute the features and extract last hidden layer weights + + return (outputs) diff --git a/pyannote/audio/models/segmentation/PyanNet.py b/pyannote/audio/models/segmentation/PyanNet.py index 10991a1f6..9ebaddbe0 100644 --- a/pyannote/audio/models/segmentation/PyanNet.py +++ b/pyannote/audio/models/segmentation/PyanNet.py @@ -1,26 +1,3 @@ -# MIT License -# -# Copyright (c) 2020 CNRS -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. - - from typing import Optional import torch @@ -32,22 +9,9 @@ from pyannote.audio.core.model import Model from pyannote.audio.core.task import Task from pyannote.audio.models.blocks.sincnet import SincNet +from pyannote.audio.models.blocks.wavlm import WavLM from pyannote.audio.utils.params import merge_dict -##WAVLM_BASE -#Requires to pass the PyanNet model to cuda during training script - -#Model is loaded outside of the PyanNet class - -from transformers import AutoModel - -#Loading the model from HuggingFace (requires git lfs to load the .bin checkpoint) -#model = AutoModel.from_pretrained('/content/drive/MyDrive/PyanNet/wavlm-base') - -model = AutoModel.from_pretrained('microsoft/wavlm-base') - -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -model.to(device) #Pass the model to the gpu (supposing that accelerator = gpu in the TorchLightning Trainer) class PyanNet(Model): """PyanNet segmentation model @@ -76,7 +40,6 @@ class PyanNet(Model): """ SINCNET_DEFAULTS = {"stride": 10} - LSTM_DEFAULTS = { "hidden_size": 128, "num_layers": 2, @@ -88,6 +51,7 @@ class PyanNet(Model): def __init__( self, + model: str = None, sincnet: dict = None, lstm: dict = None, linear: dict = None, @@ -104,15 +68,21 @@ def __init__( lstm["batch_first"] = True linear = merge_dict(self.LINEAR_DEFAULTS, linear) self.save_hyperparameters("sincnet", "lstm", "linear") + self.model = model + + if model == "wavlm": + self.wavlm = WavLM() + feat_size = 512 + else : + self.sincnet = SincNet(**self.hparams.sincnet) + feat_size = 60 - self.sincnet = SincNet(**self.hparams.sincnet) - - monolithic = lstm["monolithic"] if monolithic: multi_layer_lstm = dict(lstm) del multi_layer_lstm["monolithic"] - self.lstm = nn.LSTM(512, **multi_layer_lstm) + self.lstm = nn.LSTM(feat_size, **multi_layer_lstm) + else: num_layers = lstm["num_layers"] if num_layers > 1: @@ -126,7 +96,7 @@ def __init__( self.lstm = nn.ModuleList( [ nn.LSTM( - 512 + feat_size if i == 0 else lstm["hidden_size"] * (2 if lstm["bidirectional"] else 1), **one_layer_lstm @@ -182,34 +152,28 @@ def forward(self, waveforms: torch.Tensor) -> torch.Tensor: ------- scores : (batch, frame, classes) """ - #outputs = self.sincnet(waveforms) - - #WavLM feature extraction - - waveforms = torch.squeeze(waveforms,1) #waveforms : (batch, channel, sample) -> (batch,sample) - with torch.no_grad(): - feat = model(waveforms) #Compute the features and extract last hidden layer weights - - outputs = feat.extract_features #Get the features : outputs : (batch, frame, feature) - + if self.model == "wavlm" : + outputs = self.wavlm(waveforms) + else : + outputs = self.sincnet(waveforms) + if self.hparams.lstm["monolithic"]: - #No need to rearrange the output, as the features are already structured in (batch frame feature) - - #outputs, _ = self.lstm( - # rearrange(outputs, "batch feature frame -> batch frame feature")) - outputs, _ = self.lstm(outputs) - + if self.model == "wavlm": + outputs, _ = self.lstm(outputs) + else: + outputs, _ = self.lstm( + rearrange(outputs, "batch feature frame -> batch frame feature") + ) else: - #outputs = rearrange(outputs, "batch feature frame -> batch frame feature").cuda() + if self.model != "wavlm": + outputs = rearrange(outputs, "batch feature frame -> batch frame 
feature") for i, lstm in enumerate(self.lstm): outputs, _ = lstm(outputs) if i + 1 < self.hparams.lstm["num_layers"]: outputs = self.dropout(outputs) - - + if self.hparams.linear["num_layers"] > 0: for linear in self.linear: outputs = F.leaky_relu(linear(outputs)) return self.activation(self.classifier(outputs)) - From 3fc2d37e7f98f33b1e33f247493132865f5df06f Mon Sep 17 00:00:00 2001 From: SevKod Date: Mon, 15 May 2023 15:21:36 +0200 Subject: [PATCH 03/17] add support of all Torchaudio self-supverised models to PyanNet, including layer selection. Created a block (in replacement of the old WavLM one) called "selfsup.py" which loads and apply a specific SSL Torchaudio model, depending on PyanNet's input parameter. User can now also choose a specific layer which will then be used for feature extraction. Ex : seg_model = PyanNet(task=seg, model = "HUBERT_BASE", layer = 5) This will load "HUBERT_BASE" model and select the 6th layer for the feature extraction. If layer is not specified, will automatically use the first one (layer 0). All available models can be found at : https://pytorch.org/audio/main/pipelines.html --- .../models/blocks/{wavlm.py => selfsup.py} | 33 ++++++++++++++----- pyannote/audio/models/segmentation/PyanNet.py | 30 ++++++++++------- 2 files changed, 44 insertions(+), 19 deletions(-) rename pyannote/audio/models/blocks/{wavlm.py => selfsup.py} (53%) diff --git a/pyannote/audio/models/blocks/wavlm.py b/pyannote/audio/models/blocks/selfsup.py similarity index 53% rename from pyannote/audio/models/blocks/wavlm.py rename to pyannote/audio/models/blocks/selfsup.py index 5c79cb286..29bef9c4b 100644 --- a/pyannote/audio/models/blocks/wavlm.py +++ b/pyannote/audio/models/blocks/selfsup.py @@ -20,25 +20,42 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. - from typing import Optional import torch import torch.nn as nn import torch.nn.functional as F -from transformers import AutoModel - -class WavLM(nn.Module): +import torchaudio +from torchaudio.models import wav2vec2_model, Wav2Vec2Model +from torchaudio.pipelines import Wav2Vec2Bundle - def __init__(self): - super().__init__() +#All torchaudio Self-Sup. models can be found at https://pytorch.org/audio/main/pipelines.html +#ex : WAVLM_BASE, HUBERT_BASE, WAV2VEC2_BASE - self.wvlm = AutoModel.from_pretrained('microsoft/wavlm-base') #Load the model +class SelfSupModel(nn.Module): + def __init__(self, model_name,layer_nb): + super().__init__() + self.model_name = model_name + print("\nThe selected Self-Supervised Model is "+ model_name+".\n") + SelfSupModel.__name__ = model_name #Overwrite the class name to that of the selected model + bundle = getattr(torchaudio.pipelines, model_name) + self.feat_size = bundle._params['encoder_embed_dim'] #Get the encoder feature size + torch.hub.set_dir("./models") + self.ssl_model = bundle.get_model() #Load the model + + if layer_nb == None : + print("\nLayer number not specified. Default to the first one (layer 0).\n") + self.layer_nb = 0 + else : + self.layer_nb = layer_nb + print("\nSelected frozen layer is "+ str(layer_nb) +". 
\n") + def forward(self, waveforms: torch.Tensor) -> torch.Tensor: waveforms = torch.squeeze(waveforms,1) #waveforms : (batch, channel, sample) -> (batch,sample) with torch.no_grad(): - outputs = self.wvlm(waveforms).extract_features #Compute the features and extract last hidden layer weights + features, _ = self.ssl_model.extract_features(waveforms) #Compute the features and extract last hidden layer weights + outputs = features[self.layer_nb] return (outputs) diff --git a/pyannote/audio/models/segmentation/PyanNet.py b/pyannote/audio/models/segmentation/PyanNet.py index 9ebaddbe0..7891da485 100644 --- a/pyannote/audio/models/segmentation/PyanNet.py +++ b/pyannote/audio/models/segmentation/PyanNet.py @@ -9,7 +9,7 @@ from pyannote.audio.core.model import Model from pyannote.audio.core.task import Task from pyannote.audio.models.blocks.sincnet import SincNet -from pyannote.audio.models.blocks.wavlm import WavLM +from pyannote.audio.models.blocks.selfsup import SelfSupModel from pyannote.audio.utils.params import merge_dict @@ -52,6 +52,7 @@ class PyanNet(Model): def __init__( self, model: str = None, + layer: int = None, sincnet: dict = None, lstm: dict = None, linear: dict = None, @@ -69,14 +70,21 @@ def __init__( linear = merge_dict(self.LINEAR_DEFAULTS, linear) self.save_hyperparameters("sincnet", "lstm", "linear") self.model = model - - if model == "wavlm": - self.wavlm = WavLM() - feat_size = 512 + + #All torchaudio Self-Sup. models can be found at https://pytorch.org/audio/main/pipelines.html + print("\n##################################################################") + if model != None : + print("### A self-supervised model is used for the feature extraction ###") + print("##################################################################") + self.SelfSupervised = SelfSupModel(model,layer) + #feat_size = 768 + feat_size = self.SelfSupervised.feat_size else : self.sincnet = SincNet(**self.hparams.sincnet) - feat_size = 60 - + print("### The SincNet module is used for the feature extraction ### ") + feat_size = 60 + + print("##################################################################\n") monolithic = lstm["monolithic"] if monolithic: multi_layer_lstm = dict(lstm) @@ -152,20 +160,20 @@ def forward(self, waveforms: torch.Tensor) -> torch.Tensor: ------- scores : (batch, frame, classes) """ - if self.model == "wavlm" : - outputs = self.wavlm(waveforms) + if self.model != None : + outputs = self.SelfSupervised(waveforms) else : outputs = self.sincnet(waveforms) if self.hparams.lstm["monolithic"]: - if self.model == "wavlm": + if self.model != None : outputs, _ = self.lstm(outputs) else: outputs, _ = self.lstm( rearrange(outputs, "batch feature frame -> batch frame feature") ) else: - if self.model != "wavlm": + if self.model == None : outputs = rearrange(outputs, "batch feature frame -> batch frame feature") for i, lstm in enumerate(self.lstm): outputs, _ = lstm(outputs) From 1e370fcab384b1914fc4c44f747e505976984892 Mon Sep 17 00:00:00 2001 From: SevKod Date: Fri, 26 May 2023 10:51:46 +0200 Subject: [PATCH 04/17] add support of ssl models from huggingface to pyannote using PyanHugg class Can use pre-trained ssl models from huggingface using PyanHugg class. Tested (and working) models are : - "microsoft/wavlm-base" - "microsoft/wavlm-large" - "facebook/hubert-base-ls960" - "facebook/wav2vec2-base-960h" Class supports model and layer selection (as well as cache location for the downloaded model and configuration file). 
Ex : seg_model = PyanHugg(task=seg, selfsupervised={ 'model' : 'microsoft/wavlm-base', 'layer' : 2, 'cache' : 'mod_location/'}) --- pyannote/audio/models/blocks/selfsup.py | 53 +++-- .../audio/models/segmentation/PyanHugg.py | 211 ++++++++++++++++++ pyannote/audio/models/segmentation/PyanNet.py | 67 +++--- .../audio/models/segmentation/__init__.py | 3 +- 4 files changed, 278 insertions(+), 56 deletions(-) create mode 100644 pyannote/audio/models/segmentation/PyanHugg.py diff --git a/pyannote/audio/models/blocks/selfsup.py b/pyannote/audio/models/blocks/selfsup.py index 29bef9c4b..1461a38bd 100644 --- a/pyannote/audio/models/blocks/selfsup.py +++ b/pyannote/audio/models/blocks/selfsup.py @@ -25,37 +25,46 @@ import torch import torch.nn as nn import torch.nn.functional as F -import torchaudio -from torchaudio.models import wav2vec2_model, Wav2Vec2Model -from torchaudio.pipelines import Wav2Vec2Bundle - -#All torchaudio Self-Sup. models can be found at https://pytorch.org/audio/main/pipelines.html -#ex : WAVLM_BASE, HUBERT_BASE, WAV2VEC2_BASE +from transformers import AutoModel, Wav2Vec2FeatureExtractor, AutoConfig class SelfSupModel(nn.Module): - def __init__(self, model_name,layer_nb): + def __init__(self, model,layer, cache): super().__init__() - self.model_name = model_name - print("\nThe selected Self-Supervised Model is "+ model_name+".\n") - SelfSupModel.__name__ = model_name #Overwrite the class name to that of the selected model - bundle = getattr(torchaudio.pipelines, model_name) - self.feat_size = bundle._params['encoder_embed_dim'] #Get the encoder feature size - torch.hub.set_dir("./models") - self.ssl_model = bundle.get_model() #Load the model + self.model = model + print("\nThe selected Self-Supervised Model from HuggingFace is "+ model+".\n") + SelfSupModel.__name__ = model.rsplit('/', 1)[1] #Overwrite the class name to that of the selected model + if cache is not None : + print("Model and configuration file location is : "+str(cache)) + config = AutoConfig.from_pretrained(model, cache_dir = cache) + config.cache_dir= cache + else : + config = AutoConfig.from_pretrained(model) + + config.output_hidden_states = True + - if layer_nb == None : + self.ssl_model = AutoModel.from_pretrained(model, config = config, cache_dir = cache) #Load the model + self.ssl_model.eval() + + self.feat_size = config.hidden_size #Get the encoder feature size + self.processor = Wav2Vec2FeatureExtractor.from_pretrained(model, return_tensors="pt") + + if layer == None : print("\nLayer number not specified. Default to the first one (layer 0).\n") - self.layer_nb = 0 + self.layer = 0 else : - self.layer_nb = layer_nb - print("\nSelected frozen layer is "+ str(layer_nb) +". \n") - + self.layer = layer + print("\nSelected frozen layer is "+ str(layer) +". 
\n") + def forward(self, waveforms: torch.Tensor) -> torch.Tensor: - waveforms = torch.squeeze(waveforms,1) #waveforms : (batch, channel, sample) -> (batch,sample) + if self.processor.do_normalize == True : + waveforms = F.layer_norm(waveforms, waveforms.shape) + with torch.no_grad(): - features, _ = self.ssl_model.extract_features(waveforms) #Compute the features and extract last hidden layer weights - outputs = features[self.layer_nb] + features = self.ssl_model(waveforms) #Compute the features and extract last hidden layer weights + outputs = features.hidden_states[self.layer + 1] + return (outputs) diff --git a/pyannote/audio/models/segmentation/PyanHugg.py b/pyannote/audio/models/segmentation/PyanHugg.py new file mode 100644 index 000000000..ae490b1a5 --- /dev/null +++ b/pyannote/audio/models/segmentation/PyanHugg.py @@ -0,0 +1,211 @@ +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange +from pyannote.core.utils.generators import pairwise + +from pyannote.audio.core.model import Model +from pyannote.audio.core.task import Task +from pyannote.audio.models.blocks.sincnet import SincNet +from pyannote.audio.models.blocks.selfsup import SelfSupModel +from pyannote.audio.utils.params import merge_dict + + +class PyanHugg(Model): + """PyanHugg segmentation model + + Self-Supervised Model (or SincNet if specified) > LSTM > Feed forward > Classifier + + All HuggingFace Self-Sup. models can be found at https://huggingface.co/models + Tested (and currently working) models are : + - "microsoft/wavlm-base" + - "microsoft/wavlm-large" + - "facebook/hubert-base-ls960" + - "facebook/wav2vec2-base-960h" + + Parameters + ---------- + sample_rate : int, optional + Audio sample rate. Defaults to 16kHz (16000). + num_channels : int, optional + Number of channels. Defaults to mono (1). + + selfsupervised : dict, optional + Keyword arugments passed to the selfsupervised block. + Defaults to { + "model": "microsoft/wavlm-base", + "layer": 4, + "cache": None, + }. If "model" is specified as "sincnet", SincNet block will be used instead. + sincnet : dict, optional + Keyword arugments passed to the SincNet block. + Defaults to {"stride": 1}. + lstm : dict, optional + Keyword arguments passed to the LSTM layer. + Defaults to {"hidden_size": 128, "num_layers": 2, "bidirectional": True}, + i.e. two bidirectional layers with 128 units each. + Set "monolithic" to False to split monolithic multi-layer LSTM into multiple mono-layer LSTMs. + This may proove useful for probing LSTM internals. + linear : dict, optional + Keyword arugments used to initialize linear layers + Defaults to {"hidden_size": 128, "num_layers": 2}, + i.e. two linear layers with 128 units each. 
+ """ + + + + SINCNET_DEFAULTS = {"stride": 10} + SSL_DEFAULTS = { + "model": "microsoft/wavlm-base", + "layer": 4, + "cache": None, + } + LSTM_DEFAULTS = { + "hidden_size": 128, + "num_layers": 2, + "bidirectional": True, + "monolithic": True, + "dropout": 0.0, + } + LINEAR_DEFAULTS = {"hidden_size": 128, "num_layers": 2} + + def __init__( + self, + selfsupervised: dict = None, + sincnet: dict = None, + lstm: dict = None, + linear: dict = None, + sample_rate: int = 16000, + num_channels: int = 1, + task: Optional[Task] = None, + ): + + super().__init__(sample_rate=sample_rate, num_channels=num_channels, task=task) + + selfsupervised = merge_dict(self.SSL_DEFAULTS, selfsupervised) + sincnet = merge_dict(self.SINCNET_DEFAULTS, sincnet) + sincnet["sample_rate"] = sample_rate + lstm = merge_dict(self.LSTM_DEFAULTS, lstm) + lstm["batch_first"] = True + linear = merge_dict(self.LINEAR_DEFAULTS, linear) + if (selfsupervised["model"] == "sincnet") : + self.save_hyperparameters("sincnet", "lstm", "linear") + else : + self.save_hyperparameters("selfsupervised", "lstm", "linear") + + self.model = selfsupervised["model"] + + #All HuggingFace Self-Sup. models can be found at https://huggingface.co/models + print("\n##################################################################") + if selfsupervised["model"] is not "sincnet" : + print("### A self-supervised model is used for the feature extraction ###") + print("##################################################################") + self.selfsupervised = SelfSupModel(**self.hparams.selfsupervised) + feat_size = self.selfsupervised.feat_size + else : + self.sincnet = SincNet(**self.hparams.sincnet) + print("### The SincNet module is used for the feature extraction ### ") + feat_size = 60 + + print("##################################################################\n") + monolithic = lstm["monolithic"] + if monolithic: + multi_layer_lstm = dict(lstm) + del multi_layer_lstm["monolithic"] + self.lstm = nn.LSTM(feat_size, **multi_layer_lstm) + + else: + num_layers = lstm["num_layers"] + if num_layers > 1: + self.dropout = nn.Dropout(p=lstm["dropout"]) + + one_layer_lstm = dict(lstm) + one_layer_lstm["num_layers"] = 1 + one_layer_lstm["dropout"] = 0.0 + del one_layer_lstm["monolithic"] + + self.lstm = nn.ModuleList( + [ + nn.LSTM( + feat_size + if i == 0 + else lstm["hidden_size"] * (2 if lstm["bidirectional"] else 1), + **one_layer_lstm + ) + for i in range(num_layers) + ] + ) + + if linear["num_layers"] < 1: + return + + lstm_out_features: int = self.hparams.lstm["hidden_size"] * ( + 2 if self.hparams.lstm["bidirectional"] else 1 + ) + self.linear = nn.ModuleList( + [ + nn.Linear(in_features, out_features) + for in_features, out_features in pairwise( + [ + lstm_out_features, + ] + + [self.hparams.linear["hidden_size"]] + * self.hparams.linear["num_layers"] + ) + ] + ) + + def build(self): + + if self.hparams.linear["num_layers"] > 0: + in_features = self.hparams.linear["hidden_size"] + else: + in_features = self.hparams.lstm["hidden_size"] * ( + 2 if self.hparams.lstm["bidirectional"] else 1 + ) + + if self.specifications.powerset: + out_features = self.specifications.num_powerset_classes + else: + out_features = len(self.specifications.classes) + + self.classifier = nn.Linear(in_features, out_features) + self.activation = self.default_activation() + + def forward(self, waveforms: torch.Tensor) -> torch.Tensor: + """Pass forward + + Parameters + ---------- + waveforms : (batch, channel, sample) + + Returns + ------- + scores : (batch, frame, classes) + 
""" + if self.model != "sincnet" : + outputs = self.selfsupervised(waveforms) + else : + outputs = self.sincnet(waveforms) + if self.hparams.lstm["monolithic"]: + if self.model != "sincnet" : + outputs, _ = self.lstm(outputs) + else: + outputs, _ = self.lstm( + rearrange(outputs, "batch feature frame -> batch frame feature") + ) + else: + if self.model == "sincnet" : + outputs = rearrange(outputs, "batch feature frame -> batch frame feature") + for i, lstm in enumerate(self.lstm): + outputs, _ = lstm(outputs) + if i + 1 < self.hparams.lstm["num_layers"]: + outputs = self.dropout(outputs) + + if self.hparams.linear["num_layers"] > 0: + for linear in self.linear: + outputs = F.leaky_relu(linear(outputs)) + + return self.activation(self.classifier(outputs)) diff --git a/pyannote/audio/models/segmentation/PyanNet.py b/pyannote/audio/models/segmentation/PyanNet.py index 7891da485..1b68a32a9 100644 --- a/pyannote/audio/models/segmentation/PyanNet.py +++ b/pyannote/audio/models/segmentation/PyanNet.py @@ -1,3 +1,26 @@ +# MIT License +# +# Copyright (c) 2020 CNRS +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + + from typing import Optional import torch @@ -9,7 +32,6 @@ from pyannote.audio.core.model import Model from pyannote.audio.core.task import Task from pyannote.audio.models.blocks.sincnet import SincNet -from pyannote.audio.models.blocks.selfsup import SelfSupModel from pyannote.audio.utils.params import merge_dict @@ -51,8 +73,6 @@ class PyanNet(Model): def __init__( self, - model: str = None, - layer: int = None, sincnet: dict = None, lstm: dict = None, linear: dict = None, @@ -69,27 +89,14 @@ def __init__( lstm["batch_first"] = True linear = merge_dict(self.LINEAR_DEFAULTS, linear) self.save_hyperparameters("sincnet", "lstm", "linear") - self.model = model - - #All torchaudio Self-Sup. 
models can be found at https://pytorch.org/audio/main/pipelines.html - print("\n##################################################################") - if model != None : - print("### A self-supervised model is used for the feature extraction ###") - print("##################################################################") - self.SelfSupervised = SelfSupModel(model,layer) - #feat_size = 768 - feat_size = self.SelfSupervised.feat_size - else : - self.sincnet = SincNet(**self.hparams.sincnet) - print("### The SincNet module is used for the feature extraction ### ") - feat_size = 60 - - print("##################################################################\n") + + self.sincnet = SincNet(**self.hparams.sincnet) + monolithic = lstm["monolithic"] if monolithic: multi_layer_lstm = dict(lstm) del multi_layer_lstm["monolithic"] - self.lstm = nn.LSTM(feat_size, **multi_layer_lstm) + self.lstm = nn.LSTM(60, **multi_layer_lstm) else: num_layers = lstm["num_layers"] @@ -104,7 +111,7 @@ def __init__( self.lstm = nn.ModuleList( [ nn.LSTM( - feat_size + 60 if i == 0 else lstm["hidden_size"] * (2 if lstm["bidirectional"] else 1), **one_layer_lstm @@ -160,21 +167,15 @@ def forward(self, waveforms: torch.Tensor) -> torch.Tensor: ------- scores : (batch, frame, classes) """ - if self.model != None : - outputs = self.SelfSupervised(waveforms) - else : - outputs = self.sincnet(waveforms) + + outputs = self.sincnet(waveforms) if self.hparams.lstm["monolithic"]: - if self.model != None : - outputs, _ = self.lstm(outputs) - else: - outputs, _ = self.lstm( - rearrange(outputs, "batch feature frame -> batch frame feature") - ) + outputs, _ = self.lstm( + rearrange(outputs, "batch feature frame -> batch frame feature") + ) else: - if self.model == None : - outputs = rearrange(outputs, "batch feature frame -> batch frame feature") + outputs = rearrange(outputs, "batch feature frame -> batch frame feature") for i, lstm in enumerate(self.lstm): outputs, _ = lstm(outputs) if i + 1 < self.hparams.lstm["num_layers"]: diff --git a/pyannote/audio/models/segmentation/__init__.py b/pyannote/audio/models/segmentation/__init__.py index 82e149853..91b79e68a 100644 --- a/pyannote/audio/models/segmentation/__init__.py +++ b/pyannote/audio/models/segmentation/__init__.py @@ -21,5 +21,6 @@ # SOFTWARE. from .PyanNet import PyanNet +from .PyanHugg import PyanHugg -__all__ = ["PyanNet"] +__all__ = ["PyanNet","PyanHugg"] From e170eed64b53c8a661952293c73778967864b8cd Mon Sep 17 00:00:00 2001 From: SevKod Date: Wed, 31 May 2023 10:00:52 +0200 Subject: [PATCH 05/17] remove support for sincnet block --- .../audio/models/segmentation/PyanHugg.py | 48 +++++-------------- 1 file changed, 12 insertions(+), 36 deletions(-) diff --git a/pyannote/audio/models/segmentation/PyanHugg.py b/pyannote/audio/models/segmentation/PyanHugg.py index ae490b1a5..4400c748b 100644 --- a/pyannote/audio/models/segmentation/PyanHugg.py +++ b/pyannote/audio/models/segmentation/PyanHugg.py @@ -8,7 +8,6 @@ from pyannote.audio.core.model import Model from pyannote.audio.core.task import Task -from pyannote.audio.models.blocks.sincnet import SincNet from pyannote.audio.models.blocks.selfsup import SelfSupModel from pyannote.audio.utils.params import merge_dict @@ -16,7 +15,7 @@ class PyanHugg(Model): """PyanHugg segmentation model - Self-Supervised Model (or SincNet if specified) > LSTM > Feed forward > Classifier + Self-Supervised Model > LSTM > Feed forward > Classifier All HuggingFace Self-Sup. 
models can be found at https://huggingface.co/models Tested (and currently working) models are : @@ -38,10 +37,7 @@ class PyanHugg(Model): "model": "microsoft/wavlm-base", "layer": 4, "cache": None, - }. If "model" is specified as "sincnet", SincNet block will be used instead. - sincnet : dict, optional - Keyword arugments passed to the SincNet block. - Defaults to {"stride": 1}. + } lstm : dict, optional Keyword arguments passed to the LSTM layer. Defaults to {"hidden_size": 128, "num_layers": 2, "bidirectional": True}, @@ -56,7 +52,6 @@ class PyanHugg(Model): - SINCNET_DEFAULTS = {"stride": 10} SSL_DEFAULTS = { "model": "microsoft/wavlm-base", "layer": 4, @@ -74,7 +69,6 @@ class PyanHugg(Model): def __init__( self, selfsupervised: dict = None, - sincnet: dict = None, lstm: dict = None, linear: dict = None, sample_rate: int = 16000, @@ -85,30 +79,21 @@ def __init__( super().__init__(sample_rate=sample_rate, num_channels=num_channels, task=task) selfsupervised = merge_dict(self.SSL_DEFAULTS, selfsupervised) - sincnet = merge_dict(self.SINCNET_DEFAULTS, sincnet) - sincnet["sample_rate"] = sample_rate lstm = merge_dict(self.LSTM_DEFAULTS, lstm) lstm["batch_first"] = True linear = merge_dict(self.LINEAR_DEFAULTS, linear) - if (selfsupervised["model"] == "sincnet") : - self.save_hyperparameters("sincnet", "lstm", "linear") - else : - self.save_hyperparameters("selfsupervised", "lstm", "linear") + self.save_hyperparameters("selfsupervised", "lstm", "linear") self.model = selfsupervised["model"] + + #All HuggingFace Self-Sup. models can be found at https://huggingface.co/models print("\n##################################################################") - if selfsupervised["model"] is not "sincnet" : - print("### A self-supervised model is used for the feature extraction ###") - print("##################################################################") - self.selfsupervised = SelfSupModel(**self.hparams.selfsupervised) - feat_size = self.selfsupervised.feat_size - else : - self.sincnet = SincNet(**self.hparams.sincnet) - print("### The SincNet module is used for the feature extraction ### ") - feat_size = 60 - + print("### A self-supervised model is used for the feature extraction ###") + print("##################################################################") + self.selfsupervised = SelfSupModel(**self.hparams.selfsupervised) + feat_size = self.selfsupervised.feat_size print("##################################################################\n") monolithic = lstm["monolithic"] if monolithic: @@ -185,20 +170,11 @@ def forward(self, waveforms: torch.Tensor) -> torch.Tensor: ------- scores : (batch, frame, classes) """ - if self.model != "sincnet" : - outputs = self.selfsupervised(waveforms) - else : - outputs = self.sincnet(waveforms) + outputs = self.selfsupervised(waveforms) + if self.hparams.lstm["monolithic"]: - if self.model != "sincnet" : - outputs, _ = self.lstm(outputs) - else: - outputs, _ = self.lstm( - rearrange(outputs, "batch feature frame -> batch frame feature") - ) + outputs, _ = self.lstm(outputs) else: - if self.model == "sincnet" : - outputs = rearrange(outputs, "batch feature frame -> batch frame feature") for i, lstm in enumerate(self.lstm): outputs, _ = lstm(outputs) if i + 1 < self.hparams.lstm["num_layers"]: From 9f81c3061d93683acdc713abeaa262e23b070ff1 Mon Sep 17 00:00:00 2001 From: SevKod Date: Mon, 5 Jun 2023 09:34:52 +0200 Subject: [PATCH 06/17] Remove unnecessary computation for unused deeper layers. 
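The one-line change below caps config.num_hidden_layers at the selected layer, so transformer blocks above the layer used for feature extraction are never instantiated or run. A minimal standalone sketch of the idea (model name and layer index are illustrative, taken from the defaults used elsewhere in this PR):

import torch
from transformers import AutoConfig, AutoModel

layer = 4  # hidden layer used for feature extraction (0-indexed)

config = AutoConfig.from_pretrained("microsoft/wavlm-base")
config.output_hidden_states = True
config.num_hidden_layers = layer + 1  # drop encoder layers that would never be used

ssl_model = AutoModel.from_pretrained("microsoft/wavlm-base", config=config).eval()

waveforms = torch.randn(2, 16000)  # (batch, sample): two 1-second 16 kHz waveforms
with torch.no_grad():
    hidden_states = ssl_model(waveforms).hidden_states

# hidden_states[0] is the transformer input; hidden_states[i] is the output of
# encoder layer i, so index layer + 1 is the deepest layer that was kept.
features = hidden_states[layer + 1]  # (batch, frame, config.hidden_size)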
--- pyannote/audio/models/blocks/selfsup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyannote/audio/models/blocks/selfsup.py b/pyannote/audio/models/blocks/selfsup.py index 1461a38bd..5ad9b2d86 100644 --- a/pyannote/audio/models/blocks/selfsup.py +++ b/pyannote/audio/models/blocks/selfsup.py @@ -42,7 +42,7 @@ def __init__(self, model,layer, cache): config = AutoConfig.from_pretrained(model) config.output_hidden_states = True - + config.num_hidden_layers = layer + 1 self.ssl_model = AutoModel.from_pretrained(model, config = config, cache_dir = cache) #Load the model self.ssl_model.eval() From e5330fcbcff8d3c655a8f296d9201e6bb0176d7c Mon Sep 17 00:00:00 2001 From: SevKod Date: Tue, 20 Jun 2023 14:54:29 +0200 Subject: [PATCH 07/17] add support for fairseq pretrained ssl models Can load a fairseq ckpt from a pretrained model (which is converted to torchaudio wav2vec2 format) --- pyannote/audio/models/blocks/selfsup.py | 79 +++++++++++++------ .../audio/models/segmentation/PyanHugg.py | 4 +- 2 files changed, 55 insertions(+), 28 deletions(-) diff --git a/pyannote/audio/models/blocks/selfsup.py b/pyannote/audio/models/blocks/selfsup.py index 5ad9b2d86..d5868b26c 100644 --- a/pyannote/audio/models/blocks/selfsup.py +++ b/pyannote/audio/models/blocks/selfsup.py @@ -25,46 +25,73 @@ import torch import torch.nn as nn import torch.nn.functional as F +import fairseq +from fairseq import checkpoint_utils from transformers import AutoModel, Wav2Vec2FeatureExtractor, AutoConfig +from fairseq import checkpoint_utils +from torchaudio.models.wav2vec2.utils import import_fairseq_model class SelfSupModel(nn.Module): - def __init__(self, model,layer, cache): + def __init__(self, model,layer, cache,fairseq_ckpt): super().__init__() - self.model = model - print("\nThe selected Self-Supervised Model from HuggingFace is "+ model+".\n") - SelfSupModel.__name__ = model.rsplit('/', 1)[1] #Overwrite the class name to that of the selected model - if cache is not None : - print("Model and configuration file location is : "+str(cache)) - config = AutoConfig.from_pretrained(model, cache_dir = cache) - config.cache_dir= cache + if fairseq_ckpt != None : + #Load the fairseq checkpoint + models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([fairseq_ckpt]) + model = models[0] + model.eval() + model_name = model.__class__.__name__ + print("\nThe pre trained model "+model_name+" from fairseq is loaded.") + + SelfSupModel.__name__ = model_name + #Convert the fairseq model to torchaudio to facilitate feature extraction from any layer. 
+ model = import_fairseq_model(model).eval() + self.ssl_model = model + self.feat_size = 768 + self.pretraining = True + #TODO : Remove unused encoders from the architecture + else : - config = AutoConfig.from_pretrained(model) - - config.output_hidden_states = True - config.num_hidden_layers = layer + 1 - - self.ssl_model = AutoModel.from_pretrained(model, config = config, cache_dir = cache) #Load the model - self.ssl_model.eval() - - self.feat_size = config.hidden_size #Get the encoder feature size - self.processor = Wav2Vec2FeatureExtractor.from_pretrained(model, return_tensors="pt") + self.model = model + print("\nThe selected Self-Supervised Model from HuggingFace is "+ model+".\n") + SelfSupModel.__name__ = model.rsplit('/', 1)[1] #Overwrite the class name to that of the selected model + if cache is not None : + print("Model and configuration file location is : "+str(cache)) + config = AutoConfig.from_pretrained(model, cache_dir = cache) + config.cache_dir= cache + else : + config = AutoConfig.from_pretrained(model) + + config.output_hidden_states = True + config.num_hidden_layers = layer + 1 + + self.ssl_model = AutoModel.from_pretrained(model, config = config, cache_dir = cache) #Load the model + self.ssl_model.eval() + self.feat_size = config.hidden_size #Get the encoder feature size + self.processor = Wav2Vec2FeatureExtractor.from_pretrained(model, return_tensors="pt") + self.pretraining = False #If a pretrained model from fairseq is loaded instead, set to False + if layer == None : print("\nLayer number not specified. Default to the first one (layer 0).\n") self.layer = 0 else : self.layer = layer print("\nSelected frozen layer is "+ str(layer) +". \n") - + def forward(self, waveforms: torch.Tensor) -> torch.Tensor: waveforms = torch.squeeze(waveforms,1) #waveforms : (batch, channel, sample) -> (batch,sample) - if self.processor.do_normalize == True : - waveforms = F.layer_norm(waveforms, waveforms.shape) - - with torch.no_grad(): - features = self.ssl_model(waveforms) #Compute the features and extract last hidden layer weights - - outputs = features.hidden_states[self.layer + 1] + if self.pretraining == False : + if self.processor.do_normalize == True : + waveforms = F.layer_norm(waveforms, waveforms.shape) + + with torch.no_grad(): + features = self.ssl_model(waveforms) #Compute the features and extract last hidden layer weights + outputs = features.hidden_states[self.layer + 1] + else : + with torch.no_grad(): + feat,_ = self.ssl_model.extract_features(waveforms) + outputs = feat[self.layer] + return (outputs) diff --git a/pyannote/audio/models/segmentation/PyanHugg.py b/pyannote/audio/models/segmentation/PyanHugg.py index 4400c748b..063cfa95a 100644 --- a/pyannote/audio/models/segmentation/PyanHugg.py +++ b/pyannote/audio/models/segmentation/PyanHugg.py @@ -56,6 +56,7 @@ class PyanHugg(Model): "model": "microsoft/wavlm-base", "layer": 4, "cache": None, + "fairseq_ckpt": None, } LSTM_DEFAULTS = { "hidden_size": 128, @@ -171,7 +172,6 @@ def forward(self, waveforms: torch.Tensor) -> torch.Tensor: scores : (batch, frame, classes) """ outputs = self.selfsupervised(waveforms) - if self.hparams.lstm["monolithic"]: outputs, _ = self.lstm(outputs) else: @@ -183,5 +183,5 @@ def forward(self, waveforms: torch.Tensor) -> torch.Tensor: if self.hparams.linear["num_layers"] > 0: for linear in self.linear: outputs = F.leaky_relu(linear(outputs)) - + return self.activation(self.classifier(outputs)) From 7a21fc9058de96a260a5abbd0d90e09d293857a5 Mon Sep 17 00:00:00 2001 From: SevKod Date: Tue, 
20 Jun 2023 16:00:21 +0200 Subject: [PATCH 08/17] fairseq dependency only used if needed --- pyannote/audio/models/blocks/selfsup.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pyannote/audio/models/blocks/selfsup.py b/pyannote/audio/models/blocks/selfsup.py index d5868b26c..802db8f50 100644 --- a/pyannote/audio/models/blocks/selfsup.py +++ b/pyannote/audio/models/blocks/selfsup.py @@ -25,10 +25,7 @@ import torch import torch.nn as nn import torch.nn.functional as F -import fairseq -from fairseq import checkpoint_utils from transformers import AutoModel, Wav2Vec2FeatureExtractor, AutoConfig -from fairseq import checkpoint_utils from torchaudio.models.wav2vec2.utils import import_fairseq_model class SelfSupModel(nn.Module): @@ -36,6 +33,8 @@ class SelfSupModel(nn.Module): def __init__(self, model,layer, cache,fairseq_ckpt): super().__init__() if fairseq_ckpt != None : + import fairseq + from fairseq import checkpoint_utils #Load the fairseq checkpoint models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([fairseq_ckpt]) model = models[0] From 328505c5462e9e70797f943e024e6f70427548c3 Mon Sep 17 00:00:00 2001 From: SevKod Date: Wed, 5 Jul 2023 16:13:28 +0200 Subject: [PATCH 09/17] Remove unnecessary computation for unused deeper layers (regarding a fairseq model) --- pyannote/audio/models/blocks/selfsup.py | 27 ++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/pyannote/audio/models/blocks/selfsup.py b/pyannote/audio/models/blocks/selfsup.py index 802db8f50..e3fe257a7 100644 --- a/pyannote/audio/models/blocks/selfsup.py +++ b/pyannote/audio/models/blocks/selfsup.py @@ -21,7 +21,6 @@ # SOFTWARE. from typing import Optional - import torch import torch.nn as nn import torch.nn.functional as F @@ -41,14 +40,15 @@ def __init__(self, model,layer, cache,fairseq_ckpt): model.eval() model_name = model.__class__.__name__ print("\nThe pre trained model "+model_name+" from fairseq is loaded.") + SelfSupModel.__name__ = model_name - SelfSupModel.__name__ = model_name #Convert the fairseq model to torchaudio to facilitate feature extraction from any layer. + if model.__class__.__name__ is not "Wav2Vec2Model" : + model.__class__.__name__ = "Wav2Vec2Model" + self.feat_size = model.cfg.encoder_embed_dim model = import_fairseq_model(model).eval() self.ssl_model = model - self.feat_size = 768 self.pretraining = True - #TODO : Remove unused encoders from the architecture else : self.model = model @@ -77,7 +77,19 @@ def __init__(self, model,layer, cache,fairseq_ckpt): else : self.layer = layer print("\nSelected frozen layer is "+ str(layer) +". 
\n") - + + #self.feat_layer_mean = [7,8,9] + + def mean_mat(self,features,feat_list): + num_feat = len(feat_list) + stack_feat = [] + for layer in feat_list: + stack_feat.append(features[layer]) + stack_feat = torch.stack(stack_feat) + feat_sum = torch.sum(stack_feat, dim=0) + mean_feat = feat_sum / num_feat + return(mean_feat) + def forward(self, waveforms: torch.Tensor) -> torch.Tensor: waveforms = torch.squeeze(waveforms,1) #waveforms : (batch, channel, sample) -> (batch,sample) if self.pretraining == False : @@ -85,12 +97,13 @@ def forward(self, waveforms: torch.Tensor) -> torch.Tensor: waveforms = F.layer_norm(waveforms, waveforms.shape) with torch.no_grad(): - features = self.ssl_model(waveforms) #Compute the features and extract last hidden layer weights + features = self.ssl_model(waveforms) #Compute the features and extract hidden layers outputs = features.hidden_states[self.layer + 1] else : with torch.no_grad(): - feat,_ = self.ssl_model.extract_features(waveforms) + feat,_ = self.ssl_model.extract_features(waveforms,None,self.layer+1) + outputs = feat[self.layer] return (outputs) From cbd01a30841490ea79b45f8df15420adabf600cf Mon Sep 17 00:00:00 2001 From: SevKod Date: Mon, 10 Jul 2023 17:24:26 +0200 Subject: [PATCH 10/17] Remove HuggingFace and fairseq dependencies from self-sup --- pyannote/audio/models/blocks/selfsup.py | 107 ++++++++---------- .../segmentation/{PyanHugg.py => PyanSup.py} | 46 ++++---- .../audio/models/segmentation/__init__.py | 4 +- 3 files changed, 72 insertions(+), 85 deletions(-) rename pyannote/audio/models/segmentation/{PyanHugg.py => PyanSup.py} (86%) diff --git a/pyannote/audio/models/blocks/selfsup.py b/pyannote/audio/models/blocks/selfsup.py index e3fe257a7..4ee36e16e 100644 --- a/pyannote/audio/models/blocks/selfsup.py +++ b/pyannote/audio/models/blocks/selfsup.py @@ -22,63 +22,65 @@ from typing import Optional import torch +import torchaudio import torch.nn as nn import torch.nn.functional as F -from transformers import AutoModel, Wav2Vec2FeatureExtractor, AutoConfig -from torchaudio.models.wav2vec2.utils import import_fairseq_model +from torchaudio.models.wav2vec2 import wav2vec2_model +from collections import OrderedDict class SelfSupModel(nn.Module): - def __init__(self, model,layer, cache,fairseq_ckpt): + def __init__(self,checkpoint,name,layer,cfg): super().__init__() - if fairseq_ckpt != None : - import fairseq - from fairseq import checkpoint_utils - #Load the fairseq checkpoint - models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([fairseq_ckpt]) - model = models[0] - model.eval() - model_name = model.__class__.__name__ - print("\nThe pre trained model "+model_name+" from fairseq is loaded.") - SelfSupModel.__name__ = model_name + print("A checkpoint from a Self-Supervised Model is used for training") + if torch.cuda.is_available(): + ckpt = torch.load(checkpoint) + else: + ckpt = torch.load(checkpoint,map_location=torch.device('cpu')) + #Check if the checkpoint is from an already finetuned Diarization model (containing SSL), or from a SSL pretrained model only + if 'pyannote.audio' in ckpt: #1: Check if there is a Segmentation model attached onto or not + name,config,ordered_dict = self.dict_finetune(ckpt) + + else: #Otherwise, load the dictionary of the SSL checkpoint + print("The checkpoint is a pretrained SSL model to use for Segmentation.\nBuilding the SSL model.") + name,config,ordered_dict = self.dict_pretrained(ckpt) - #Convert the fairseq model to torchaudio to facilitate feature extraction from any layer. 
- if model.__class__.__name__ is not "Wav2Vec2Model" : - model.__class__.__name__ = "Wav2Vec2Model" - self.feat_size = model.cfg.encoder_embed_dim - model = import_fairseq_model(model).eval() - self.ssl_model = model - self.pretraining = True - - else : - self.model = model - print("\nThe selected Self-Supervised Model from HuggingFace is "+ model+".\n") - SelfSupModel.__name__ = model.rsplit('/', 1)[1] #Overwrite the class name to that of the selected model - if cache is not None : - print("Model and configuration file location is : "+str(cache)) - config = AutoConfig.from_pretrained(model, cache_dir = cache) - config.cache_dir= cache - else : - config = AutoConfig.from_pretrained(model) - - config.output_hidden_states = True - config.num_hidden_layers = layer + 1 - - self.ssl_model = AutoModel.from_pretrained(model, config = config, cache_dir = cache) #Load the model - self.ssl_model.eval() - - self.feat_size = config.hidden_size #Get the encoder feature size - self.processor = Wav2Vec2FeatureExtractor.from_pretrained(model, return_tensors="pt") - self.pretraining = False #If a pretrained model from fairseq is loaded instead, set to False + self.model_name = name + SelfSupModel.__name__ = self.model_name #Assign name of the class + + model = wav2vec2_model(**config) #Assign config to the model + model.load_state_dict(ordered_dict) #Assign state dict to the model + self.config = config #Assign the configuration + self.ssl_model = model.eval() + self.feat_size = config['encoder_embed_dim'] if layer == None : - print("\nLayer number not specified. Default to the first one (layer 0).\n") + print("\nLayer number not specified. Default to layer 1.\n") self.layer = 0 else : self.layer = layer - print("\nSelected frozen layer is "+ str(layer) +". \n") + print("\nSelected layer is "+ str(layer) +". \n") + + def dict_finetune(self, ckpt): + #Need to reconstruct the dictionary + #Get dict + print("The checkpoint is used for finetuning. 
\nThe attached SSL model will be used for feature extraction !") + dict_modules = list(ckpt['state_dict'].keys()) #Get the list of ssl modules + ssl_modules = [key for key in dict_modules if 'selfsupervised' in key] #Extract only the SSL parts + weights = [ckpt['state_dict'][key] for key in ssl_modules] #Get the weights corresponding to the modules + modules_torchaudio = ['.'.join(key.split('.')[2:]) for key in ssl_modules] #Get a new list which contains only torchaudio keywords + ordered_dict = OrderedDict((key,weight) for key,weight in zip(modules_torchaudio,weights)) #Recreate the state_dict + config = ckpt['hyper_parameters']['selfsupervised']['cfg'] #Get config + name = ckpt['hyper_parameters']['selfsupervised']['name'] #Get model name + + return(name,config,ordered_dict) + + def dict_pretrained(self, ckpt): + ordered_dict = ckpt['state_dict'] #Get dict + config = ckpt['config'] #Get config + name = ckpt['model_name'] #Get model name - #self.feat_layer_mean = [7,8,9] + return(ckpt['model_name'],ckpt['config'],ckpt['state_dict']) def mean_mat(self,features,feat_list): num_feat = len(feat_list) @@ -92,18 +94,9 @@ def mean_mat(self,features,feat_list): def forward(self, waveforms: torch.Tensor) -> torch.Tensor: waveforms = torch.squeeze(waveforms,1) #waveforms : (batch, channel, sample) -> (batch,sample) - if self.pretraining == False : - if self.processor.do_normalize == True : - waveforms = F.layer_norm(waveforms, waveforms.shape) - - with torch.no_grad(): - features = self.ssl_model(waveforms) #Compute the features and extract hidden layers - - outputs = features.hidden_states[self.layer + 1] - else : - with torch.no_grad(): - feat,_ = self.ssl_model.extract_features(waveforms,None,self.layer+1) - - outputs = feat[self.layer] + + with torch.no_grad(): + feat,_ = self.ssl_model.extract_features(waveforms,None,self.layer+1) + outputs = feat[self.layer] return (outputs) diff --git a/pyannote/audio/models/segmentation/PyanHugg.py b/pyannote/audio/models/segmentation/PyanSup.py similarity index 86% rename from pyannote/audio/models/segmentation/PyanHugg.py rename to pyannote/audio/models/segmentation/PyanSup.py index 063cfa95a..0558d54ba 100644 --- a/pyannote/audio/models/segmentation/PyanHugg.py +++ b/pyannote/audio/models/segmentation/PyanSup.py @@ -12,18 +12,11 @@ from pyannote.audio.utils.params import merge_dict -class PyanHugg(Model): +class PyanSup(Model): """PyanHugg segmentation model Self-Supervised Model > LSTM > Feed forward > Classifier - - All HuggingFace Self-Sup. models can be found at https://huggingface.co/models - Tested (and currently working) models are : - - "microsoft/wavlm-base" - - "microsoft/wavlm-large" - - "facebook/hubert-base-ls960" - - "facebook/wav2vec2-base-960h" - + Parameters ---------- sample_rate : int, optional @@ -32,11 +25,11 @@ class PyanHugg(Model): Number of channels. Defaults to mono (1). selfsupervised : dict, optional - Keyword arugments passed to the selfsupervised block. + Keyword arugments passed to the selfsupervised block. name and cfg are used to reconstruct the feature extractor from the dictionary of the checkpoint. Layer corresponds to the layer that serves for the feature extraction. Defaults to { - "model": "microsoft/wavlm-base", - "layer": 4, - "cache": None, + "name": None, + "layer": None, + "cfg": None, } lstm : dict, optional Keyword arguments passed to the LSTM layer. 
@@ -53,10 +46,9 @@ class PyanHugg(Model): SSL_DEFAULTS = { - "model": "microsoft/wavlm-base", - "layer": 4, - "cache": None, - "fairseq_ckpt": None, + "name": None, + "layer": None, + "cfg": None, } LSTM_DEFAULTS = { "hidden_size": 128, @@ -69,6 +61,7 @@ class PyanHugg(Model): def __init__( self, + ckpt: str = None, selfsupervised: dict = None, lstm: dict = None, linear: dict = None, @@ -83,19 +76,21 @@ def __init__( lstm = merge_dict(self.LSTM_DEFAULTS, lstm) lstm["batch_first"] = True linear = merge_dict(self.LINEAR_DEFAULTS, linear) - self.save_hyperparameters("selfsupervised", "lstm", "linear") - - self.model = selfsupervised["model"] + self.save_hyperparameters("lstm", "linear") #A first merge is done using the default parameters specified - - - #All HuggingFace Self-Sup. models can be found at https://huggingface.co/models print("\n##################################################################") print("### A self-supervised model is used for the feature extraction ###") print("##################################################################") - self.selfsupervised = SelfSupModel(**self.hparams.selfsupervised) + + self.selfsupervised = SelfSupModel(ckpt,**selfsupervised) + selfsupervised['name'] = self.selfsupervised.model_name + selfsupervised['cfg'] = self.selfsupervised.config + + self.save_hyperparameters("selfsupervised") feat_size = self.selfsupervised.feat_size + print("##################################################################\n") + monolithic = lstm["monolithic"] if monolithic: multi_layer_lstm = dict(lstm) @@ -141,8 +136,7 @@ def __init__( * self.hparams.linear["num_layers"] ) ] - ) - + ) def build(self): if self.hparams.linear["num_layers"] > 0: diff --git a/pyannote/audio/models/segmentation/__init__.py b/pyannote/audio/models/segmentation/__init__.py index 91b79e68a..a78204114 100644 --- a/pyannote/audio/models/segmentation/__init__.py +++ b/pyannote/audio/models/segmentation/__init__.py @@ -21,6 +21,6 @@ # SOFTWARE. from .PyanNet import PyanNet -from .PyanHugg import PyanHugg +from .PyanSup import PyanSup -__all__ = ["PyanNet","PyanHugg"] +__all__ = ["PyanNet","PyanSup"] From d7e920318d2ee460be4ef68b93a664e3d6f484ff Mon Sep 17 00:00:00 2001 From: SevKod Date: Wed, 12 Jul 2023 17:04:45 +0200 Subject: [PATCH 11/17] add support for torchaudio self sup models --- pyannote/audio/models/blocks/selfsup.py | 59 ++++++++++++------- pyannote/audio/models/segmentation/PyanSup.py | 7 ++- 2 files changed, 43 insertions(+), 23 deletions(-) diff --git a/pyannote/audio/models/blocks/selfsup.py b/pyannote/audio/models/blocks/selfsup.py index 4ee36e16e..019be955b 100644 --- a/pyannote/audio/models/blocks/selfsup.py +++ b/pyannote/audio/models/blocks/selfsup.py @@ -19,41 +19,50 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
- +import sys from typing import Optional import torch import torchaudio import torch.nn as nn import torch.nn.functional as F -from torchaudio.models.wav2vec2 import wav2vec2_model from collections import OrderedDict +from torchaudio.models.wav2vec2 import wav2vec2_model, wavlm_model +from torchaudio.pipelines import Wav2Vec2Bundle class SelfSupModel(nn.Module): - def __init__(self,checkpoint,name,layer,cfg): + def __init__(self,checkpoint,torchaudio_ssl,torchaudio_cache,name,layer,cfg): super().__init__() - print("A checkpoint from a Self-Supervised Model is used for training") - if torch.cuda.is_available(): - ckpt = torch.load(checkpoint) - else: - ckpt = torch.load(checkpoint,map_location=torch.device('cpu')) - #Check if the checkpoint is from an already finetuned Diarization model (containing SSL), or from a SSL pretrained model only - if 'pyannote.audio' in ckpt: #1: Check if there is a Segmentation model attached onto or not - name,config,ordered_dict = self.dict_finetune(ckpt) - - else: #Otherwise, load the dictionary of the SSL checkpoint - print("The checkpoint is a pretrained SSL model to use for Segmentation.\nBuilding the SSL model.") - name,config,ordered_dict = self.dict_pretrained(ckpt) + if torchaudio_ssl: + if checkpoint: + raise ValueError("Error : Cannot specify both a checkpoint and a torchaudio model !") + print("\nThe Self-Supervised Model "+str(torchaudio_ssl)+" is loaded from torchaudio.\n") + name,config,ordered_dict = self.dict_torchaudio(torchaudio_ssl,torchaudio_cache) + else: + print("A checkpoint from a Self-Supervised Model is used for training.") + if torch.cuda.is_available(): + ckpt = torch.load(checkpoint) + else: + ckpt = torch.load(checkpoint,map_location=torch.device('cpu')) + #Check if the checkpoint is from an already finetuned Diarization model (containing SSL), or from a SSL pretrained model only + if 'pyannote.audio' in ckpt: #1: Check if there is a Segmentation model attached onto or not + print("The checkpoint is used for finetuning. \nThe attached SSL model will be used for feature extraction !") + name,config,ordered_dict = self.dict_finetune(ckpt) + + else: #Otherwise, load the dictionary of the SSL checkpoint + print("The checkpoint is a pretrained SSL model to use for Segmentation.\nBuilding the SSL model.") + name,config,ordered_dict = self.dict_pretrained(ckpt) self.model_name = name SelfSupModel.__name__ = self.model_name #Assign name of the class - - model = wav2vec2_model(**config) #Assign config to the model + if name is "WAVLM_BASE" or "WAVLM_LARGE": #Only wavlm_model has two additional arguments + model = wavlm_model(**config) + else: + model = wav2vec2_model(**config) model.load_state_dict(ordered_dict) #Assign state dict to the model self.config = config #Assign the configuration self.ssl_model = model.eval() - self.feat_size = config['encoder_embed_dim'] - + self.feat_size = config['encoder_embed_dim'] #Get feature output dimension if layer == None : print("\nLayer number not specified. Default to layer 1.\n") self.layer = 0 @@ -64,7 +73,6 @@ def __init__(self,checkpoint,name,layer,cfg): def dict_finetune(self, ckpt): #Need to reconstruct the dictionary #Get dict - print("The checkpoint is used for finetuning. 
\nThe attached SSL model will be used for feature extraction !") dict_modules = list(ckpt['state_dict'].keys()) #Get the list of ssl modules ssl_modules = [key for key in dict_modules if 'selfsupervised' in key] #Extract only the SSL parts weights = [ckpt['state_dict'][key] for key in ssl_modules] #Get the weights corresponding to the modules @@ -82,6 +90,17 @@ def dict_pretrained(self, ckpt): return(ckpt['model_name'],ckpt['config'],ckpt['state_dict']) + def dict_torchaudio(self,torchaudio_ssl,torchaudio_cache): + bundle = getattr(torchaudio.pipelines, torchaudio_ssl) + #Name is torchaudio_ssl + name = torchaudio_ssl #Get name + config = bundle._params #Get config + if torchaudio_cache: + torch.hub.set_dir(torchaudio_cache) #Set cache + ordered_dict = bundle.get_model().state_dict() #Get the dict + + return(name,config,ordered_dict) + def mean_mat(self,features,feat_list): num_feat = len(feat_list) stack_feat = [] diff --git a/pyannote/audio/models/segmentation/PyanSup.py b/pyannote/audio/models/segmentation/PyanSup.py index 0558d54ba..f8c5ab174 100644 --- a/pyannote/audio/models/segmentation/PyanSup.py +++ b/pyannote/audio/models/segmentation/PyanSup.py @@ -62,6 +62,8 @@ class PyanSup(Model): def __init__( self, ckpt: str = None, + torchaudio_ssl: str = None, #Specify a torchaudio SSL model (list here : https://pytorch.org/audio/main/pipelines.html). Ex : "WAVLM_BASE","HUBERT_BASE","WAV2VEC2_LARGE",... + torchaudio_cache: str = None, #Specify location of the model selfsupervised: dict = None, lstm: dict = None, linear: dict = None, @@ -81,11 +83,9 @@ def __init__( print("\n##################################################################") print("### A self-supervised model is used for the feature extraction ###") print("##################################################################") - - self.selfsupervised = SelfSupModel(ckpt,**selfsupervised) + self.selfsupervised = SelfSupModel(ckpt,torchaudio_ssl,torchaudio_cache,**selfsupervised) selfsupervised['name'] = self.selfsupervised.model_name selfsupervised['cfg'] = self.selfsupervised.config - self.save_hyperparameters("selfsupervised") feat_size = self.selfsupervised.feat_size @@ -179,3 +179,4 @@ def forward(self, waveforms: torch.Tensor) -> torch.Tensor: outputs = F.leaky_relu(linear(outputs)) return self.activation(self.classifier(outputs)) + From 81aafdd6d2e2738c12dd1e5b8a1901decb44d931 Mon Sep 17 00:00:00 2001 From: SevKod Date: Thu, 13 Jul 2023 10:32:56 +0200 Subject: [PATCH 12/17] fixed bug condition of wavlm_base and wavlm_large --- pyannote/audio/models/blocks/selfsup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyannote/audio/models/blocks/selfsup.py b/pyannote/audio/models/blocks/selfsup.py index 019be955b..bf3ddedad 100644 --- a/pyannote/audio/models/blocks/selfsup.py +++ b/pyannote/audio/models/blocks/selfsup.py @@ -55,7 +55,8 @@ def __init__(self,checkpoint,torchaudio_ssl,torchaudio_cache,name,layer,cfg): name,config,ordered_dict = self.dict_pretrained(ckpt) self.model_name = name SelfSupModel.__name__ = self.model_name #Assign name of the class - if name is "WAVLM_BASE" or "WAVLM_LARGE": #Only wavlm_model has two additional arguments + print("show name"+str(name)) + if name is "WAVLM_BASE" or name is "WAVLM_LARGE": #Only wavlm_model has two additional arguments model = wavlm_model(**config) else: model = wav2vec2_model(**config) From b9c89b69c88860ee2df086a7ca7145ae7ef42f4a Mon Sep 17 00:00:00 2001 From: SevKod Date: Wed, 2 Aug 2023 10:31:20 +0200 Subject: [PATCH 13/17] add layer-wise pooling 
and finetuning (still wip) --- pyannote/audio/models/blocks/selfsup.py | 115 +++++++++++++----- pyannote/audio/models/segmentation/PyanSup.py | 11 +- 2 files changed, 93 insertions(+), 33 deletions(-) diff --git a/pyannote/audio/models/blocks/selfsup.py b/pyannote/audio/models/blocks/selfsup.py index bf3ddedad..60da5efd2 100644 --- a/pyannote/audio/models/blocks/selfsup.py +++ b/pyannote/audio/models/blocks/selfsup.py @@ -19,11 +19,14 @@ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. + import sys +import re from typing import Optional import torch import torchaudio import torch.nn as nn +from torch.nn.functional import normalize import torch.nn.functional as F from collections import OrderedDict from torchaudio.models.wav2vec2 import wav2vec2_model, wavlm_model @@ -31,11 +34,11 @@ class SelfSupModel(nn.Module): - def __init__(self,checkpoint,torchaudio_ssl,torchaudio_cache,name,layer,cfg): + def __init__(self,checkpoint=None,torchaudio_ssl=None,torchaudio_cache=None,finetune=None,average_layers=None,average_all=None,name=None,layer=None,cfg=None): super().__init__() if torchaudio_ssl: if checkpoint: - raise ValueError("Error : Cannot specify both a checkpoint and a torchaudio model !") + raise ValueError("Error : Cannot specify both a checkpoint and a torchaudio model.") print("\nThe Self-Supervised Model "+str(torchaudio_ssl)+" is loaded from torchaudio.\n") name,config,ordered_dict = self.dict_torchaudio(torchaudio_ssl,torchaudio_cache) @@ -47,30 +50,69 @@ def __init__(self,checkpoint,torchaudio_ssl,torchaudio_cache,name,layer,cfg): ckpt = torch.load(checkpoint,map_location=torch.device('cpu')) #Check if the checkpoint is from an already finetuned Diarization model (containing SSL), or from a SSL pretrained model only if 'pyannote.audio' in ckpt: #1: Check if there is a Segmentation model attached onto or not - print("The checkpoint is used for finetuning. \nThe attached SSL model will be used for feature extraction !") + print("The checkpoint is used for finetuning. \nThe attached SSL model will be used for feature extraction.") name,config,ordered_dict = self.dict_finetune(ckpt) else: #Otherwise, load the dictionary of the SSL checkpoint print("The checkpoint is a pretrained SSL model to use for Segmentation.\nBuilding the SSL model.") name,config,ordered_dict = self.dict_pretrained(ckpt) + + # Layer-wise pooling (same way as SUPERB) + if not average_all: + if not average_layers : + if layer is None : + print("\nLayer number not specified. Default to layer 1.\n") + + self.layer = 1 + else : + + self.layer = layer + print("\nSelected layer is "+ str(layer) +". 
\n") + else: + print("Layers "+str(average_layers)+" selected for layer-wise pooling.") + + self.W = nn.Parameter(torch.randn(len(average_layers))) #Set specific number of learnable weights + + self.average_layers = average_layers + + self.layer = max(average_layers) + else: + print("All layers are selected for layer-wise pooling.") + + self.W = nn.Parameter(torch.randn(config['encoder_num_layers'])) #Set max number of learnable weights + + self.average_layers = list(range(config['encoder_num_layers'])) + + self.layer = config['encoder_num_layers'] + + if finetune: #Finetuning not working + print("Self-supervised model is unfrozen.") + #config['encoder_ff_interm_dropout'] = 0.3 + config['encoder_layer_norm_first'] = True + else : + print("Self-supervised model is frozen.") + + config['encoder_num_layers'] = self.layer + ordered_dict = self.remove_layers_dict(ordered_dict,self.layer) #Remove weights from unused transformer encoders self.model_name = name + self.finetune = finetune #Assign mode + self.average_layers = average_layers + self.feat_size = config['encoder_embed_dim'] #Get feature output dimension + self.config = config #Assign the configuration SelfSupModel.__name__ = self.model_name #Assign name of the class - print("show name"+str(name)) + if name is "WAVLM_BASE" or name is "WAVLM_LARGE": #Only wavlm_model has two additional arguments model = wavlm_model(**config) else: model = wav2vec2_model(**config) model.load_state_dict(ordered_dict) #Assign state dict to the model - self.config = config #Assign the configuration - self.ssl_model = model.eval() - self.feat_size = config['encoder_embed_dim'] #Get feature output dimension - if layer == None : - print("\nLayer number not specified. Default to layer 1.\n") - self.layer = 0 - else : - self.layer = layer - print("\nSelected layer is "+ str(layer) +". 
\n") + + if finetune: + self.ssl_model = model.train() + else: + self.ssl_model = model.eval() + def dict_finetune(self, ckpt): #Need to reconstruct the dictionary #Get dict @@ -101,22 +143,39 @@ def dict_torchaudio(self,torchaudio_ssl,torchaudio_cache): ordered_dict = bundle.get_model().state_dict() #Get the dict return(name,config,ordered_dict) + def remove_layers_dict(self,state_dict,layer): + keys_to_delete = [] + for key in state_dict.keys(): + if "transformer.layers" in key: + nb = int(re.findall(r'\d+',key)[0]) + if nb>(layer-1): + keys_to_delete.append(key) + for key in keys_to_delete: + del state_dict[key] + + return(state_dict) - def mean_mat(self,features,feat_list): - num_feat = len(feat_list) - stack_feat = [] - for layer in feat_list: - stack_feat.append(features[layer]) - stack_feat = torch.stack(stack_feat) - feat_sum = torch.sum(stack_feat, dim=0) - mean_feat = feat_sum / num_feat - return(mean_feat) + def avg_pool(self,scalars,feat_list): + sum = 0 + for i in range(0,len(feat_list)): + sum = sum + scalars[i]*feat_list[i] + return(sum) def forward(self, waveforms: torch.Tensor) -> torch.Tensor: waveforms = torch.squeeze(waveforms,1) #waveforms : (batch, channel, sample) -> (batch,sample) - - with torch.no_grad(): - feat,_ = self.ssl_model.extract_features(waveforms,None,self.layer+1) - - outputs = feat[self.layer] - return (outputs) + if self.finetune: + feat,_ = self.ssl_model.extract_features(waveforms,None,self.layer) + else: + with torch.no_grad(): + feat,_ = self.ssl_model.extract_features(waveforms,None,self.layer) + if self.average_layers: + feat_learn_list = [] + for index in self.average_layers: + feat_learn_list.append(feat[index-1]) + w = self.W.softmax(-1) + outputs = self.avg_pool(w,feat_learn_list) + #print(w) + #print(outputs.size()) + else: + outputs = feat[self.layer-1] + return (outputs) \ No newline at end of file diff --git a/pyannote/audio/models/segmentation/PyanSup.py b/pyannote/audio/models/segmentation/PyanSup.py index f8c5ab174..7809fc1ba 100644 --- a/pyannote/audio/models/segmentation/PyanSup.py +++ b/pyannote/audio/models/segmentation/PyanSup.py @@ -48,6 +48,8 @@ class PyanSup(Model): SSL_DEFAULTS = { "name": None, "layer": None, + "average_layers": None, + "average_all": False, "cfg": None, } LSTM_DEFAULTS = { @@ -58,12 +60,12 @@ class PyanSup(Model): "dropout": 0.0, } LINEAR_DEFAULTS = {"hidden_size": 128, "num_layers": 2} - def __init__( self, ckpt: str = None, - torchaudio_ssl: str = None, #Specify a torchaudio SSL model (list here : https://pytorch.org/audio/main/pipelines.html). Ex : "WAVLM_BASE","HUBERT_BASE","WAV2VEC2_LARGE",... 
- torchaudio_cache: str = None, #Specify location of the model + torchaudio_ssl: str = None, + torchaudio_cache: str = None, + finetune: bool = False, selfsupervised: dict = None, lstm: dict = None, linear: dict = None, @@ -83,7 +85,7 @@ def __init__( print("\n##################################################################") print("### A self-supervised model is used for the feature extraction ###") print("##################################################################") - self.selfsupervised = SelfSupModel(ckpt,torchaudio_ssl,torchaudio_cache,**selfsupervised) + self.selfsupervised = SelfSupModel(ckpt,torchaudio_ssl,torchaudio_cache,finetune,**selfsupervised) selfsupervised['name'] = self.selfsupervised.model_name selfsupervised['cfg'] = self.selfsupervised.config self.save_hyperparameters("selfsupervised") @@ -179,4 +181,3 @@ def forward(self, waveforms: torch.Tensor) -> torch.Tensor: outputs = F.leaky_relu(linear(outputs)) return self.activation(self.classifier(outputs)) - From cedf042439badd4760fda16370bdba9c9858f8d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Wed, 13 Sep 2023 22:39:14 +0200 Subject: [PATCH 14/17] feat: add SSeRiouSS architecture --- .../cli/train_config/model/SSeRiouSS.yaml | 13 + .../audio/models/segmentation/SSeRiouSS.py | 234 ++++++++++++++++++ .../audio/models/segmentation/__init__.py | 3 +- 3 files changed, 249 insertions(+), 1 deletion(-) create mode 100644 pyannote/audio/cli/train_config/model/SSeRiouSS.yaml create mode 100644 pyannote/audio/models/segmentation/SSeRiouSS.py diff --git a/pyannote/audio/cli/train_config/model/SSeRiouSS.yaml b/pyannote/audio/cli/train_config/model/SSeRiouSS.yaml new file mode 100644 index 000000000..73f7f963a --- /dev/null +++ b/pyannote/audio/cli/train_config/model/SSeRiouSS.yaml @@ -0,0 +1,13 @@ +# @package _group_ +_target_: pyannote.audio.models.segmentation.SSeRiouSS +wav2vec: WAVLM_BASE +wav2vec_layer: -1 +lstm: + hidden_size: 128 + num_layers: 4 + bidirectional: true + monolithic: true + dropout: 0.5 +linear: + hidden_size: 128 + num_layers: 2 diff --git a/pyannote/audio/models/segmentation/SSeRiouSS.py b/pyannote/audio/models/segmentation/SSeRiouSS.py new file mode 100644 index 000000000..7cd545177 --- /dev/null +++ b/pyannote/audio/models/segmentation/SSeRiouSS.py @@ -0,0 +1,234 @@ +# MIT License +# +# Copyright (c) 2023- CNRS +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ + +from typing import Optional, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchaudio +from pyannote.core.utils.generators import pairwise + +from pyannote.audio.core.model import Model +from pyannote.audio.core.task import Task +from pyannote.audio.utils.params import merge_dict + + +class SSeRiouSS(Model): + """Self-Supervised Representation for Speaker Segmentation + + wav2vec > LSTM > Feed forward > Classifier + + Parameters + ---------- + sample_rate : int, optional + Audio sample rate. Defaults to 16kHz (16000). + num_channels : int, optional + Number of channels. Defaults to mono (1). + wav2vec: dict or str, optional + Defaults to "WAVLM_BASE". + wav2vec_layer: int, optional + Index of layer to use as input to the LSTM. + Defaults (-1) to use average of all layers (with learnable weights). + lstm : dict, optional + Keyword arguments passed to the LSTM layer. + Defaults to {"hidden_size": 128, "num_layers": 4, "bidirectional": True}, + i.e. two bidirectional layers with 128 units each. + Set "monolithic" to False to split monolithic multi-layer LSTM into multiple mono-layer LSTMs. + This may proove useful for probing LSTM internals. + linear : dict, optional + Keyword arugments used to initialize linear layers + Defaults to {"hidden_size": 128, "num_layers": 2}, + i.e. two linear layers with 128 units each. + """ + + WAV2VEC_DEFAULTS = "WAVLM_BASE" + + LSTM_DEFAULTS = { + "hidden_size": 128, + "num_layers": 4, + "bidirectional": True, + "monolithic": True, + "dropout": 0.0, + } + LINEAR_DEFAULTS = {"hidden_size": 128, "num_layers": 2} + + def __init__( + self, + wav2vec: Union[dict, str] = None, + wav2vec_layer: int = -1, + lstm: dict = None, + linear: dict = None, + sample_rate: int = 16000, + num_channels: int = 1, + task: Optional[Task] = None, + ): + super().__init__(sample_rate=sample_rate, num_channels=num_channels, task=task) + + if isinstance(wav2vec, str): + # `wav2vec` is one of the supported pipelines from torchaudio (e.g. "WAVLM_BASE") + if hasattr(torchaudio.pipelines, wav2vec): + bundle = getattr(torchaudio.pipelines, wav2vec) + if sample_rate != bundle._sample_rate: + raise ValueError( + f"Expected {bundle._sample_rate}Hz, found {sample_rate}Hz." + ) + wav2vec_dim = bundle._params["encoder_embed_dim"] + wav2vec_num_layers = bundle._params["encoder_num_layers"] + self.wav2vec = bundle.get_model() + + # `wav2vec` is a path to a self-supervised representation checkpoint + else: + _checkpoint = torch.load(wav2vec) + wav2vec = _checkpoint.pop("config") + self.wav2vec = torchaudio.models.wav2vec2_model(**wav2vec) + state_dict = _checkpoint.pop("state_dict") + self.wav2vec.load_state_dict(state_dict) + wav2vec_dim = wav2vec["encoder_embed_dim"] + wav2vec_num_layers = wav2vec["encoder_num_layers"] + + # `wav2vec` is a config dictionary understood by `wav2vec2_model` + # this branch is typically used by Model.from_pretrained(...) 
+ elif isinstance(wav2vec, dict): + self.wav2vec = torchaudio.models.wav2vec2_model(**wav2vec) + wav2vec_dim = wav2vec["encoder_embed_dim"] + wav2vec_num_layers = wav2vec["encoder_num_layers"] + + if wav2vec_layer < 0: + self.wav2vec_weights = nn.Parameter( + data=torch.ones(wav2vec_num_layers), requires_grad=True + ) + + lstm = merge_dict(self.LSTM_DEFAULTS, lstm) + lstm["batch_first"] = True + linear = merge_dict(self.LINEAR_DEFAULTS, linear) + + self.save_hyperparameters("wav2vec", "wav2vec_layer", "lstm", "linear") + + monolithic = lstm["monolithic"] + if monolithic: + multi_layer_lstm = dict(lstm) + del multi_layer_lstm["monolithic"] + self.lstm = nn.LSTM(wav2vec_dim, **multi_layer_lstm) + + else: + num_layers = lstm["num_layers"] + if num_layers > 1: + self.dropout = nn.Dropout(p=lstm["dropout"]) + + one_layer_lstm = dict(lstm) + one_layer_lstm["num_layers"] = 1 + one_layer_lstm["dropout"] = 0.0 + del one_layer_lstm["monolithic"] + + self.lstm = nn.ModuleList( + [ + nn.LSTM( + wav2vec_dim + if i == 0 + else lstm["hidden_size"] * (2 if lstm["bidirectional"] else 1), + **one_layer_lstm, + ) + for i in range(num_layers) + ] + ) + + if linear["num_layers"] < 1: + return + + lstm_out_features: int = self.hparams.lstm["hidden_size"] * ( + 2 if self.hparams.lstm["bidirectional"] else 1 + ) + self.linear = nn.ModuleList( + [ + nn.Linear(in_features, out_features) + for in_features, out_features in pairwise( + [ + lstm_out_features, + ] + + [self.hparams.linear["hidden_size"]] + * self.hparams.linear["num_layers"] + ) + ] + ) + + def build(self): + if self.hparams.linear["num_layers"] > 0: + in_features = self.hparams.linear["hidden_size"] + else: + in_features = self.hparams.lstm["hidden_size"] * ( + 2 if self.hparams.lstm["bidirectional"] else 1 + ) + + if isinstance(self.specifications, tuple): + raise ValueError("SSeRiouSS model does not support multi-tasking.") + + if self.specifications.powerset: + out_features = self.specifications.num_powerset_classes + else: + out_features = len(self.specifications.classes) + + self.classifier = nn.Linear(in_features, out_features) + self.activation = self.default_activation() + + def forward(self, waveforms: torch.Tensor) -> torch.Tensor: + """Pass forward + + Parameters + ---------- + waveforms : (batch, channel, sample) + + Returns + ------- + scores : (batch, frame, classes) + """ + + num_layers = ( + None if self.hparams.wav2vec_layer < 0 else self.hparams.wav2vec_layer + ) + + with torch.no_grad(): + outputs, _ = self.wav2vec.extract_features( + waveforms.squeeze(1), num_layers=num_layers + ) + + if num_layers is None: + outputs = torch.stack(outputs, dim=-1) @ F.softmax( + self.wav2vec_weights, dim=0 + ) + else: + outputs = outputs[-1] + + if self.hparams.lstm["monolithic"]: + outputs, _ = self.lstm(outputs) + else: + for i, lstm in enumerate(self.lstm): + outputs, _ = lstm(outputs) + if i + 1 < self.hparams.lstm["num_layers"]: + outputs = self.dropout(outputs) + + if self.hparams.linear["num_layers"] > 0: + for linear in self.linear: + outputs = F.leaky_relu(linear(outputs)) + + return self.activation(self.classifier(outputs)) diff --git a/pyannote/audio/models/segmentation/__init__.py b/pyannote/audio/models/segmentation/__init__.py index a78204114..d73442721 100644 --- a/pyannote/audio/models/segmentation/__init__.py +++ b/pyannote/audio/models/segmentation/__init__.py @@ -22,5 +22,6 @@ from .PyanNet import PyanNet from .PyanSup import PyanSup +from .SSeRiouSS import SSeRiouSS -__all__ = ["PyanNet","PyanSup"] +__all__ = ["PyanNet", 
"PyanSup", "SSeRiouSS"] From 06641bf9da67f73a55cf9c20a1c9089621a290f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Wed, 13 Sep 2023 22:40:47 +0200 Subject: [PATCH 15/17] chore: remove old PyanSup --- pyannote/audio/models/segmentation/PyanSup.py | 183 ------------------ .../audio/models/segmentation/__init__.py | 5 +- 2 files changed, 2 insertions(+), 186 deletions(-) delete mode 100644 pyannote/audio/models/segmentation/PyanSup.py diff --git a/pyannote/audio/models/segmentation/PyanSup.py b/pyannote/audio/models/segmentation/PyanSup.py deleted file mode 100644 index 7809fc1ba..000000000 --- a/pyannote/audio/models/segmentation/PyanSup.py +++ /dev/null @@ -1,183 +0,0 @@ -from typing import Optional - -import torch -import torch.nn as nn -import torch.nn.functional as F -from einops import rearrange -from pyannote.core.utils.generators import pairwise - -from pyannote.audio.core.model import Model -from pyannote.audio.core.task import Task -from pyannote.audio.models.blocks.selfsup import SelfSupModel -from pyannote.audio.utils.params import merge_dict - - -class PyanSup(Model): - """PyanHugg segmentation model - - Self-Supervised Model > LSTM > Feed forward > Classifier - - Parameters - ---------- - sample_rate : int, optional - Audio sample rate. Defaults to 16kHz (16000). - num_channels : int, optional - Number of channels. Defaults to mono (1). - - selfsupervised : dict, optional - Keyword arugments passed to the selfsupervised block. name and cfg are used to reconstruct the feature extractor from the dictionary of the checkpoint. Layer corresponds to the layer that serves for the feature extraction. - Defaults to { - "name": None, - "layer": None, - "cfg": None, - } - lstm : dict, optional - Keyword arguments passed to the LSTM layer. - Defaults to {"hidden_size": 128, "num_layers": 2, "bidirectional": True}, - i.e. two bidirectional layers with 128 units each. - Set "monolithic" to False to split monolithic multi-layer LSTM into multiple mono-layer LSTMs. - This may proove useful for probing LSTM internals. - linear : dict, optional - Keyword arugments used to initialize linear layers - Defaults to {"hidden_size": 128, "num_layers": 2}, - i.e. two linear layers with 128 units each. 
- """ - - - - SSL_DEFAULTS = { - "name": None, - "layer": None, - "average_layers": None, - "average_all": False, - "cfg": None, - } - LSTM_DEFAULTS = { - "hidden_size": 128, - "num_layers": 2, - "bidirectional": True, - "monolithic": True, - "dropout": 0.0, - } - LINEAR_DEFAULTS = {"hidden_size": 128, "num_layers": 2} - def __init__( - self, - ckpt: str = None, - torchaudio_ssl: str = None, - torchaudio_cache: str = None, - finetune: bool = False, - selfsupervised: dict = None, - lstm: dict = None, - linear: dict = None, - sample_rate: int = 16000, - num_channels: int = 1, - task: Optional[Task] = None, - ): - - super().__init__(sample_rate=sample_rate, num_channels=num_channels, task=task) - - selfsupervised = merge_dict(self.SSL_DEFAULTS, selfsupervised) - lstm = merge_dict(self.LSTM_DEFAULTS, lstm) - lstm["batch_first"] = True - linear = merge_dict(self.LINEAR_DEFAULTS, linear) - self.save_hyperparameters("lstm", "linear") #A first merge is done using the default parameters specified - - print("\n##################################################################") - print("### A self-supervised model is used for the feature extraction ###") - print("##################################################################") - self.selfsupervised = SelfSupModel(ckpt,torchaudio_ssl,torchaudio_cache,finetune,**selfsupervised) - selfsupervised['name'] = self.selfsupervised.model_name - selfsupervised['cfg'] = self.selfsupervised.config - self.save_hyperparameters("selfsupervised") - feat_size = self.selfsupervised.feat_size - - print("##################################################################\n") - - monolithic = lstm["monolithic"] - if monolithic: - multi_layer_lstm = dict(lstm) - del multi_layer_lstm["monolithic"] - self.lstm = nn.LSTM(feat_size, **multi_layer_lstm) - - else: - num_layers = lstm["num_layers"] - if num_layers > 1: - self.dropout = nn.Dropout(p=lstm["dropout"]) - - one_layer_lstm = dict(lstm) - one_layer_lstm["num_layers"] = 1 - one_layer_lstm["dropout"] = 0.0 - del one_layer_lstm["monolithic"] - - self.lstm = nn.ModuleList( - [ - nn.LSTM( - feat_size - if i == 0 - else lstm["hidden_size"] * (2 if lstm["bidirectional"] else 1), - **one_layer_lstm - ) - for i in range(num_layers) - ] - ) - - if linear["num_layers"] < 1: - return - - lstm_out_features: int = self.hparams.lstm["hidden_size"] * ( - 2 if self.hparams.lstm["bidirectional"] else 1 - ) - self.linear = nn.ModuleList( - [ - nn.Linear(in_features, out_features) - for in_features, out_features in pairwise( - [ - lstm_out_features, - ] - + [self.hparams.linear["hidden_size"]] - * self.hparams.linear["num_layers"] - ) - ] - ) - def build(self): - - if self.hparams.linear["num_layers"] > 0: - in_features = self.hparams.linear["hidden_size"] - else: - in_features = self.hparams.lstm["hidden_size"] * ( - 2 if self.hparams.lstm["bidirectional"] else 1 - ) - - if self.specifications.powerset: - out_features = self.specifications.num_powerset_classes - else: - out_features = len(self.specifications.classes) - - self.classifier = nn.Linear(in_features, out_features) - self.activation = self.default_activation() - - def forward(self, waveforms: torch.Tensor) -> torch.Tensor: - """Pass forward - - Parameters - ---------- - waveforms : (batch, channel, sample) - - Returns - ------- - scores : (batch, frame, classes) - """ - outputs = self.selfsupervised(waveforms) - if self.hparams.lstm["monolithic"]: - outputs, _ = self.lstm(outputs) - else: - for i, lstm in enumerate(self.lstm): - outputs, _ = lstm(outputs) - if i + 1 < 
self.hparams.lstm["num_layers"]: - outputs = self.dropout(outputs) - - if self.hparams.linear["num_layers"] > 0: - for linear in self.linear: - outputs = F.leaky_relu(linear(outputs)) - - return self.activation(self.classifier(outputs)) diff --git a/pyannote/audio/models/segmentation/__init__.py b/pyannote/audio/models/segmentation/__init__.py index d73442721..9f6f5f6e3 100644 --- a/pyannote/audio/models/segmentation/__init__.py +++ b/pyannote/audio/models/segmentation/__init__.py @@ -1,6 +1,6 @@ # MIT License # -# Copyright (c) 2020 CNRS +# Copyright (c) 2020- CNRS # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -21,7 +21,6 @@ # SOFTWARE. from .PyanNet import PyanNet -from .PyanSup import PyanSup from .SSeRiouSS import SSeRiouSS -__all__ = ["PyanNet", "PyanSup", "SSeRiouSS"] +__all__ = ["PyanNet", "SSeRiouSS"] From 31d08a40cfe123e8c5f945670dfa46b551474ea3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Fri, 15 Sep 2023 16:23:00 +0200 Subject: [PATCH 16/17] chore: remove now replaced SelfSupModel block --- pyannote/audio/models/blocks/selfsup.py | 181 ------------------------ 1 file changed, 181 deletions(-) delete mode 100644 pyannote/audio/models/blocks/selfsup.py diff --git a/pyannote/audio/models/blocks/selfsup.py b/pyannote/audio/models/blocks/selfsup.py deleted file mode 100644 index 60da5efd2..000000000 --- a/pyannote/audio/models/blocks/selfsup.py +++ /dev/null @@ -1,181 +0,0 @@ -# MIT License -# -# Copyright (c) 2020 CNRS -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
- -import sys -import re -from typing import Optional -import torch -import torchaudio -import torch.nn as nn -from torch.nn.functional import normalize -import torch.nn.functional as F -from collections import OrderedDict -from torchaudio.models.wav2vec2 import wav2vec2_model, wavlm_model -from torchaudio.pipelines import Wav2Vec2Bundle - -class SelfSupModel(nn.Module): - - def __init__(self,checkpoint=None,torchaudio_ssl=None,torchaudio_cache=None,finetune=None,average_layers=None,average_all=None,name=None,layer=None,cfg=None): - super().__init__() - if torchaudio_ssl: - if checkpoint: - raise ValueError("Error : Cannot specify both a checkpoint and a torchaudio model.") - - print("\nThe Self-Supervised Model "+str(torchaudio_ssl)+" is loaded from torchaudio.\n") - name,config,ordered_dict = self.dict_torchaudio(torchaudio_ssl,torchaudio_cache) - else: - print("A checkpoint from a Self-Supervised Model is used for training.") - if torch.cuda.is_available(): - ckpt = torch.load(checkpoint) - else: - ckpt = torch.load(checkpoint,map_location=torch.device('cpu')) - #Check if the checkpoint is from an already finetuned Diarization model (containing SSL), or from a SSL pretrained model only - if 'pyannote.audio' in ckpt: #1: Check if there is a Segmentation model attached onto or not - print("The checkpoint is used for finetuning. \nThe attached SSL model will be used for feature extraction.") - name,config,ordered_dict = self.dict_finetune(ckpt) - - else: #Otherwise, load the dictionary of the SSL checkpoint - print("The checkpoint is a pretrained SSL model to use for Segmentation.\nBuilding the SSL model.") - name,config,ordered_dict = self.dict_pretrained(ckpt) - - # Layer-wise pooling (same way as SUPERB) - if not average_all: - if not average_layers : - if layer is None : - print("\nLayer number not specified. Default to layer 1.\n") - - self.layer = 1 - else : - - self.layer = layer - print("\nSelected layer is "+ str(layer) +". 
\n") - else: - print("Layers "+str(average_layers)+" selected for layer-wise pooling.") - - self.W = nn.Parameter(torch.randn(len(average_layers))) #Set specific number of learnable weights - - self.average_layers = average_layers - - self.layer = max(average_layers) - else: - print("All layers are selected for layer-wise pooling.") - - self.W = nn.Parameter(torch.randn(config['encoder_num_layers'])) #Set max number of learnable weights - - self.average_layers = list(range(config['encoder_num_layers'])) - - self.layer = config['encoder_num_layers'] - - if finetune: #Finetuning not working - print("Self-supervised model is unfrozen.") - #config['encoder_ff_interm_dropout'] = 0.3 - config['encoder_layer_norm_first'] = True - else : - print("Self-supervised model is frozen.") - - config['encoder_num_layers'] = self.layer - ordered_dict = self.remove_layers_dict(ordered_dict,self.layer) #Remove weights from unused transformer encoders - self.model_name = name - self.finetune = finetune #Assign mode - self.average_layers = average_layers - self.feat_size = config['encoder_embed_dim'] #Get feature output dimension - self.config = config #Assign the configuration - SelfSupModel.__name__ = self.model_name #Assign name of the class - - if name is "WAVLM_BASE" or name is "WAVLM_LARGE": #Only wavlm_model has two additional arguments - model = wavlm_model(**config) - else: - model = wav2vec2_model(**config) - model.load_state_dict(ordered_dict) #Assign state dict to the model - - if finetune: - self.ssl_model = model.train() - else: - self.ssl_model = model.eval() - - - def dict_finetune(self, ckpt): - #Need to reconstruct the dictionary - #Get dict - dict_modules = list(ckpt['state_dict'].keys()) #Get the list of ssl modules - ssl_modules = [key for key in dict_modules if 'selfsupervised' in key] #Extract only the SSL parts - weights = [ckpt['state_dict'][key] for key in ssl_modules] #Get the weights corresponding to the modules - modules_torchaudio = ['.'.join(key.split('.')[2:]) for key in ssl_modules] #Get a new list which contains only torchaudio keywords - ordered_dict = OrderedDict((key,weight) for key,weight in zip(modules_torchaudio,weights)) #Recreate the state_dict - config = ckpt['hyper_parameters']['selfsupervised']['cfg'] #Get config - name = ckpt['hyper_parameters']['selfsupervised']['name'] #Get model name - - return(name,config,ordered_dict) - - def dict_pretrained(self, ckpt): - ordered_dict = ckpt['state_dict'] #Get dict - config = ckpt['config'] #Get config - name = ckpt['model_name'] #Get model name - - return(ckpt['model_name'],ckpt['config'],ckpt['state_dict']) - - def dict_torchaudio(self,torchaudio_ssl,torchaudio_cache): - bundle = getattr(torchaudio.pipelines, torchaudio_ssl) - #Name is torchaudio_ssl - name = torchaudio_ssl #Get name - config = bundle._params #Get config - if torchaudio_cache: - torch.hub.set_dir(torchaudio_cache) #Set cache - ordered_dict = bundle.get_model().state_dict() #Get the dict - - return(name,config,ordered_dict) - def remove_layers_dict(self,state_dict,layer): - keys_to_delete = [] - for key in state_dict.keys(): - if "transformer.layers" in key: - nb = int(re.findall(r'\d+',key)[0]) - if nb>(layer-1): - keys_to_delete.append(key) - for key in keys_to_delete: - del state_dict[key] - - return(state_dict) - - def avg_pool(self,scalars,feat_list): - sum = 0 - for i in range(0,len(feat_list)): - sum = sum + scalars[i]*feat_list[i] - return(sum) - - def forward(self, waveforms: torch.Tensor) -> torch.Tensor: - waveforms = torch.squeeze(waveforms,1) 
#waveforms : (batch, channel, sample) -> (batch,sample) - if self.finetune: - feat,_ = self.ssl_model.extract_features(waveforms,None,self.layer) - else: - with torch.no_grad(): - feat,_ = self.ssl_model.extract_features(waveforms,None,self.layer) - if self.average_layers: - feat_learn_list = [] - for index in self.average_layers: - feat_learn_list.append(feat[index-1]) - w = self.W.softmax(-1) - outputs = self.avg_pool(w,feat_learn_list) - #print(w) - #print(outputs.size()) - else: - outputs = feat[self.layer-1] - return (outputs) \ No newline at end of file From 421ba03f80ecebfb118542ade22618de6d3a01c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20BREDIN?= Date: Fri, 15 Sep 2023 16:23:10 +0200 Subject: [PATCH 17/17] doc: update changelog --- CHANGELOG.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e7d220fd..a6f2161c5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,15 +32,16 @@ ### Features and improvements + - feat(task): add [powerset](https://www.isca-speech.org/archive/interspeech_2023/plaquet23_interspeech.html) support to `SpeakerDiarization` task - feat(task): add support for multi-task models + - feat(task): add support for label scope in speaker diarization task + - feat(task): add support for missing classes in multi-label segmentation task + - feat(model): add segmentation model based on torchaudio self-supervised representation - feat(pipeline): send pipeline to device with `pipeline.to(device)` - - feat(pipeline): make `segmentation_batch_size` and `embedding_batch_size` mutable in `SpeakerDiarization` pipeline (they now default to `1`) - - feat(task): add [powerset](https://arxiv.org/PLACEHOLDER) support to `SpeakerDiarization` task - feat(pipeline): add `return_embeddings` option to `SpeakerDiarization` pipeline + - feat(pipeline): make `segmentation_batch_size` and `embedding_batch_size` mutable in `SpeakerDiarization` pipeline (they now default to `1`) - feat(pipeline): add progress hook to pipelines - feat(pipeline): check version compatibility at load time - - feat(task): add support for label scope in speaker diarization task - - feat(task): add support for missing classes in multi-label segmentation task - improve(task): load metadata as tensors rather than pyannote.core instances - improve(task): improve error message on missing specifications
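
Taken together, the layer-wise pooling added to the SelfSupModel block in PATCH 13 and the `wav2vec_layer: -1` path of the final SSeRiouSS model perform the same operation: a softmax-weighted average of the per-layer hidden states returned by torchaudio's `extract_features`. The sketch below shows that pooling in isolation with a frozen WAVLM_BASE bundle; the `weights` parameter and the random waveform are stand-ins (the real models learn `wav2vec_weights` / `W` jointly with the LSTM head), and the printed shape is only indicative.

import torch
import torch.nn.functional as F
import torchaudio

# Frozen self-supervised encoder, as in SSeRiouSS (downloads the bundle on first use).
bundle = torchaudio.pipelines.WAVLM_BASE
ssl_model = bundle.get_model().eval()

# (batch, channel, sample) -> (batch, sample), mirroring the squeeze done in the models.
waveforms = torch.randn(2, 1, int(5 * bundle.sample_rate)).squeeze(1)

with torch.no_grad():
    # num_layers=None returns the output of every transformer layer,
    # each of shape (batch, frame, encoder_embed_dim).
    layer_outputs, _ = ssl_model.extract_features(waveforms, lengths=None, num_layers=None)

# Stand-in for the learnable per-layer weights (wav2vec_weights in SSeRiouSS, W in SelfSupModel).
weights = torch.nn.Parameter(torch.ones(len(layer_outputs)))

# (batch, frame, feature, layer) @ softmax(layer) -> (batch, frame, feature)
pooled = torch.stack(layer_outputs, dim=-1) @ F.softmax(weights, dim=0)
print(pooled.shape)  # e.g. torch.Size([2, 249, 768]) for 5 s of 16 kHz audio

With a fixed layer instead (`wav2vec_layer: 4` in SSeRiouSS.yaml, or `layer` in the old SSL_DEFAULTS), the same call becomes `extract_features(waveforms, None, 4)` and only the last element of the returned list is kept.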
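
When `wav2vec` is a path rather than a torchaudio pipeline name, SSeRiouSS pops a `config` dictionary and a `state_dict` from the loaded file and rebuilds the encoder with `torchaudio.models.wav2vec2_model`. A checkpoint with that layout can presumably be produced from any wav2vec2-style bundle along these lines (WAV2VEC2_BASE is chosen here because the rebuild goes through `wav2vec2_model`, which does not accept the extra WavLM arguments; the output filename is only an example):

import torch
import torchaudio

bundle = torchaudio.pipelines.WAV2VEC2_BASE

torch.save(
    {
        # `_params` is the same (private) attribute the patches read the configuration from.
        "config": dict(bundle._params),
        "state_dict": bundle.get_model().state_dict(),
    },
    "wav2vec2_base_ssl.pt",
)

The resulting file would then be passed as `SSeRiouSS(wav2vec="wav2vec2_base_ssl.pt", ...)`.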
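
The `remove_layers_dict` helper of the now-removed SelfSupModel block avoided building transformer layers above the last one actually used: it dropped their weights from the state dict and lowered `encoder_num_layers` before reconstructing the encoder. A rough sketch of the same trick, assuming a WAVLM_BASE bundle and an arbitrary cut after four layers:

import re
import torchaudio
from torchaudio.models.wav2vec2 import wavlm_model

bundle = torchaudio.pipelines.WAVLM_BASE
full_state_dict = bundle.get_model().state_dict()
config = dict(bundle._params)  # same private attribute used by dict_torchaudio

keep = 4  # keep only transformer layers 0..3 (hypothetical choice)

# Drop the weights of every transformer layer that will never be run.
trimmed = {
    key: value
    for key, value in full_state_dict.items()
    if not ("transformer.layers" in key and int(re.findall(r"\d+", key)[0]) > keep - 1)
}

config["encoder_num_layers"] = keep
small_model = wavlm_model(**config)  # WavLM bundles need wavlm_model, not wav2vec2_model
small_model.load_state_dict(trimmed)
small_model.eval()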