Update to silero v5

rhasspy · Jul 8, 2024 · e7db821 · e7db821
1 parent fc1e3f7
commit e7db821
Show file tree

Hide file tree

Showing 6 changed files with 89 additions and 21 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # Changelog
 
+## 2.0.0
+
+- Update to Silero VAD v5
+- Audio size **must** be 512 samples in `process_chunk`
+- Add `process_chunks` to handle longer audio
+
 ## 1.0.0
 
 - Initial version
diff --git a/pysilero_vad/__init__.py b/pysilero_vad/__init__.py
@@ -1,18 +1,25 @@
 import logging
 from pathlib import Path
-from typing import Final, Union
+from typing import Final, Iterable, Union
 
 import numpy as np
 import onnxruntime
 
-_RATE: Final = 16000
+_RATE: Final = 16000  # Hz
 _MAX_WAV: Final = 32767
 _DIR = Path(__file__).parent
 _DEFAULT_ONNX_PATH = _DIR / "models" / "silero_vad.onnx"
+_CONTEXT_SIZE: Final = 64  # samples of previous audio prepended to each chunk (at 16 kHz)
+_CHUNK_SAMPLES: Final = 512
+_CHUNK_BYTES: Final = _CHUNK_SAMPLES * 2  # 16-bit
 
 _LOGGER = logging.getLogger()
 
 
+class InvalidChunkSizeError(Exception):
+    """Error raised when chunk size is not correct."""
+
+
 class SileroVoiceActivityDetector:
     """Detects speech/silence using Silero VAD.
 
@@ -30,31 +37,71 @@ def __init__(self, onnx_path: Union[str, Path] = _DEFAULT_ONNX_PATH) -> None:
             onnx_path, providers=["CPUExecutionProvider"], sess_options=opts
         )
 
-        self._h = np.zeros((2, 1, 64)).astype("float32")
-        self._c = np.zeros((2, 1, 64)).astype("float32")
+        self._context = np.zeros((1, _CONTEXT_SIZE), dtype=np.float32)
+        self._state = np.zeros((2, 1, 128), dtype=np.float32)
+        self._sr = np.array(_RATE, dtype=np.int64)
+
+    @staticmethod
+    def chunk_samples() -> int:
+        """Return number of samples required for an audio chunk."""
+        return _CHUNK_SAMPLES
+
+    @staticmethod
+    def chunk_bytes() -> int:
+        """Return number of bytes required for an audio chunk."""
+        return _CHUNK_BYTES
 
     def reset(self) -> None:
         """Reset state."""
-        self._h = np.zeros((2, 1, 64)).astype("float32")
-        self._c = np.zeros((2, 1, 64)).astype("float32")
+        # Clear the carried-over audio context together with the recurrent
+        # state; otherwise the tail of the previous stream would be prepended
+        # to the first chunk processed after the reset.
+        self._context = np.zeros((1, _CONTEXT_SIZE), dtype=np.float32)
+        self._state = np.zeros((2, 1, 128), dtype=np.float32)
 
     def __call__(self, audio: bytes) -> float:
-        """Return probability of speech in audio [0-1].
+        """Return probability of speech [0-1] in a single audio chunk.
 
-        Audio must be 16Khz 16-bit mono PCM.
+        Audio *must* be 512 samples of 16Khz 16-bit mono PCM.
+        """
+        return self.process_chunk(audio)
+
+    def process_chunk(self, audio: bytes) -> float:
+        """Return probability of speech [0-1] in a single audio chunk.
+
+        Audio *must* be 512 samples of 16 kHz 16-bit mono PCM.
+
+        Raises InvalidChunkSizeError if the audio is not exactly one chunk
+        long. The last samples of each chunk are kept and prepended to the
+        next chunk as context.
         """
+        if len(audio) != _CHUNK_BYTES:
+            # Window size is fixed at 512 samples in v5
+            raise InvalidChunkSizeError(
+                f"Expected {_CHUNK_BYTES} bytes of audio, got {len(audio)}"
+            )
+
         audio_array = np.frombuffer(audio, dtype=np.int16).astype(np.float32) / _MAX_WAV
 
-        # Add batch dimension
-        audio_array = np.expand_dims(audio_array, 0)
+        # Add batch dimension and context
+        audio_array = np.concatenate(
+            (self._context, audio_array[np.newaxis, :]), axis=1
+        )
+        self._context = audio_array[:, -_CONTEXT_SIZE:]
 
         ort_inputs = {
-            "input": audio_array,
-            "h": self._h,
-            "c": self._c,
-            "sr": np.array(_RATE, dtype=np.int64),
+            "input": audio_array[:, : _CHUNK_SAMPLES + _CONTEXT_SIZE],
+            "state": self._state,
+            "sr": self._sr,
         }
         ort_outs = self.session.run(None, ort_inputs)
-        out, self._h, self._c = ort_outs
+        out, self._state = ort_outs
 
         return out.squeeze()
+
+    def process_chunks(self, audio: bytes) -> Iterable[float]:
+        """Yield probability of speech [0-1] for each full chunk of audio.
+
+        Audio must be 16 kHz 16-bit mono PCM and at least one chunk
+        (512 samples) long. It is split into consecutive 512-sample chunks;
+        trailing bytes that do not fill a whole chunk are ignored.
+
+        This is a generator, so InvalidChunkSizeError for too-short audio is
+        raised when iteration starts, not when the method is called.
+        """
+        num_audio_bytes = len(audio)
+        if num_audio_bytes < _CHUNK_BYTES:
+            # Window size is fixed at 512 samples in v5
+            raise InvalidChunkSizeError(
+                f"Expected at least {_CHUNK_BYTES} bytes of audio, "
+                f"got {num_audio_bytes}"
+            )
+
+        audio_idx = 0
+
+        # Use <= so the final complete chunk (and the only chunk of audio that
+        # is exactly one chunk long) is processed instead of being dropped.
+        while (audio_idx + _CHUNK_BYTES) <= num_audio_bytes:
+            yield self.process_chunk(audio[audio_idx : audio_idx + _CHUNK_BYTES])
+            audio_idx += _CHUNK_BYTES
diff --git a/pysilero_vad/models/silero_vad.onnx b/pysilero_vad/models/silero_vad.onnx
diff --git a/requirements.txt b/requirements.txt
@@ -1,2 +1,2 @@
-onnxruntime>=1.10.0,<2
-numpy<1.26
+onnxruntime>=1.18.0,<2
+numpy<2
diff --git a/setup.py b/setup.py
@@ -25,7 +25,7 @@
 
 setup(
     name="pysilero_vad",
-    version="1.0.0",
+    version="2.0.0",
     description="Pre-packaged voice activity detector using silero-vad",
     long_description=long_description,
     long_description_content_type="text/markdown",

diff --git a/tests/test_vad.py b/tests/test_vad.py
@@ -2,7 +2,8 @@
 from pathlib import Path
 from typing import Union
 
-from pysilero_vad import SileroVoiceActivityDetector
+import pytest
+from pysilero_vad import SileroVoiceActivityDetector, InvalidChunkSizeError
 
 _DIR = Path(__file__).parent
 
@@ -20,10 +21,24 @@ def _load_wav(wav_path: Union[str, Path]) -> bytes:
 def test_silence() -> None:
     """Test VAD on recorded silence."""
     vad = SileroVoiceActivityDetector()
-    assert vad(_load_wav(_DIR / "silence.wav")) < 0.5
+    assert all(p < 0.5 for p in vad.process_chunks(_load_wav(_DIR / "silence.wav")))
 
 
 def test_speech() -> None:
     """Test VAD on recorded speech."""
     vad = SileroVoiceActivityDetector()
-    assert vad(_load_wav(_DIR / "speech.wav")) >= 0.5
+    assert any(p >= 0.5 for p in vad.process_chunks(_load_wav(_DIR / "speech.wav")))
+
+def test_invalid_chunk_size() -> None:
+    """Test that chunk size must be 512 samples."""
+    vad = SileroVoiceActivityDetector()
+
+    # Should work
+    vad(bytes(SileroVoiceActivityDetector.chunk_bytes()))
+
+    # Should fail
+    with pytest.raises(InvalidChunkSizeError):
+        vad(bytes(SileroVoiceActivityDetector.chunk_bytes() * 2))
+
+    with pytest.raises(InvalidChunkSizeError):
+        vad(bytes(SileroVoiceActivityDetector.chunk_bytes() // 2))