clean repo

virgile-blg · Jun 28, 2022 · fcaacad · fcaacad
1 parent c090a62
commit fcaacad
Show file tree

Hide file tree

Showing 9 changed files with 168 additions and 58 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,3 @@
 __pycache__/*
-notebooks/__pycache__/*
+data_processing/__pycache__/*
+data_processing/.ipynb_checkpoints/*
diff --git a/README.md b/README.md
@@ -1,2 +1,24 @@
-# VAD
-VAD Challenge  - Sonos
+## VAD Challenge  - Sonos
+
+This project implements a Voice Activity Detection algorithm based on the paper: __Sofer, A., & Chazan, S. E. (2022). CNN self-attention voice activity detector. arXiv preprint arXiv:2203.02944.__
+
+The `data_processing` folder primarily contains a notebook that was used to process the annotations and to perform data augmentation.
+`data_utils.py` contains helper functions for annotation processing.
+`energy_vad.py` contains code that I did not write myself. It is an implementation of an energy-based VAD found on [GitHub](https://github.com/idnavid/py_vad_tool) that I used to extract noise signals from Librispeech samples.
+
+The algorithm training pipeline is organised as follows:
+
+- `data.py` implements the PyTorch dataset together with the Lightning DataModule
+- `modules.py` implements the PyTorch neural network model
+- `model.py` implements the Lightning Module for training
+- `train.py` is the main script to start training
+- `inference.py` is a simple script to test the model on real-world audio
+- the `config` folder gathers the YAML files holding the experiment hyperparameters. `baseline_sa_cnn.yml` is the hyperparameter set described in the paper, while `128_mels.yml` is a slightly modified version.
+
+You will also find some artifacts created by the training:
+
+- the `checkpoints` folder holds the saved model checkpoints, containing weights, optimizer state, hyperparameters, etc.
+- `tb_logs` contains the Tensorboard logs
+
+
+
diff --git a/checkpoints/128_mels/hparams.yml b/checkpoints/128_mels/hparams.yml
@@ -0,0 +1,38 @@
+data:
+  batch_size: 128
+  data_dir: /home/virgile/data/vad_data_augmented
+  hop_length: 512
+  n_frames: 256
+  n_mels: 128
+  n_workers: 4
+  nfft: 1048
+  norm: false
+  pin_memory: false
+  sr: 16000
+  valid_percent: 0.85
+model:
+  cnn_channels: 32
+  dff: 512
+  embed_dim: 256
+  n_feat: 128
+  num_heads: 16
+model_checkpoint:
+  filename: VAD-{epoch:02d}
+  monitor: val_loss
+  save_last: true
+trainer:
+  accumulate_grad_batches: 1
+  auto_lr_find: false
+  fast_dev_run: false
+  gpus: '1'
+  max_epochs: 100
+  precision: 32
+  profiler: false
+  val_check_interval: 1.0
+training:
+  lr: 0.0003
+  optim: Adam
+  weight_decay: 1.0e-05
+xp_config:
+  dataset: sonos-vad
+  model_type: VAD
diff --git a/data.py b/data.py
@@ -8,23 +8,35 @@
 
 EPS = 1e-8
 
-def get_frame_targets(audio_path, total_frames, hop_length, sr=16000):
-
+def get_frame_targets(audio_path:str, total_frames:int, hop_length:int, sr:int=16000)->th.Tensor:
+    """Aligns groundtruth annotation in seconds to the spectrogram time axis.
+       Returns a binary Tensor array of the size of the spectrogram length.
+
+    Args:
+        audio_path (str): path to the audio file
+        total_frames (int): total frame of the spectrogram
+        hop_length (int): hop length parameter for the spectrogram
+        sr (int, optional): sample rate. Defaults to 16000.
+
+    Returns:
+        th.Tensor: binary Tensor array for groundtruth
+    """
     df = pd.read_csv(audio_path.replace('.wav', '.csv'))
     gt = th.zeros(total_frames)
 
     cur_frame = 0
     for i in df.index:
-        utt_len = int(round(df.iloc[i].utt_time / (hop_length/sr)))
+        utt_len = int(round(df.iloc[i].utt_time / (hop_length / sr)))
 
-        gt[cur_frame:cur_frame+utt_len] = df.iloc[i].speech 
+        gt[cur_frame:cur_frame + utt_len] = df.iloc[i].speech 
         cur_frame += utt_len
 
     return gt.unsqueeze(0)
 
 
 class MelVADDataset(th.utils.data.Dataset):
-    def __init__(self, path_list, n_frames, nfft, hop_length, n_mels, sr, norm=False):
+    def __init__(self, path_list:list, n_frames:int, nfft:int, hop_length:int, n_mels:int, sr:int, norm:bool=False)->th.utils.data.Dataset:
+
         self.path_list = path_list
         self.sr = sr
         self.mel_spec =  torchaudio.transforms.MelSpectrogram(n_fft=nfft, hop_length=hop_length, n_mels=n_mels)

diff --git a/data_processing/data_processing.ipynb b/data_processing/data_processing.ipynb
diff --git a/inference.py b/inference.py
@@ -57,7 +57,7 @@ def plot_result(self, audio_path):
     parser = argparse.ArgumentParser()
     parser.add_argument('input_file', type=str, help='path to file to predict VAD')
     parser.add_argument('-p', '--plot_results', action='store_true', default=False, help='Plot spectrogram and model predictions')
-    parser.add_argument('-c', '--ckpt_folder', default='./checkpoints/128_mels', help='Plot spectrogram and model predictions')
+    parser.add_argument('-c', '--ckpt_folder', default='./checkpoints/128_mels', help='Path to model checkpoint')
     args = parser.parse_args()
 
     predictor = VADPredictor(ckpt_folder=args.ckpt_folder, device='cpu')

diff --git a/model.py b/model.py
@@ -5,7 +5,6 @@
 
 from modules import *
 from data import *
-from utils import *
 
 
 class VAD(pl.LightningModule):
@@ -27,6 +26,7 @@ def forward(self, x):
         return probs
 
     def configure_optimizers(self):
+
         optim_type = self.hparams.training["optim"]
         assert  optim_type in ['Adam', 'SDG']
 
@@ -36,6 +36,7 @@ def configure_optimizers(self):
             return th.optim.SGD(self.model.parameters() ,lr=self.hparams.training["lr"], weight_decay=self.hparams.training["weight_decay"])
 
     def training_step(self, batch, batch_idx):
+
         x, t = batch['spectro'], batch['targets'].squeeze(1)
         probs = self.forward(x).squeeze(-1)
         loss = self.loss(probs, t)

diff --git a/modules.py b/modules.py
@@ -55,10 +55,10 @@ def forward(self, x):
         x = x.permute(0, 2, 1)
         x = res = self.fc1(x)
         x, _ = self.self_attention(x, x, x)
-        x += res
+        x += res # Residual connection
         x = res = self.layer_norm1(x)
         x = self.mlp(x)
-        x += res
+        x += res # Residual connection
         x = self.layer_norm2(x)
         x = self.fc2(x)
-        return x
+        return x
diff --git a/testing.ipynb b/testing.ipynb