Skip to content

Commit

Permalink
Update to silero v5
Browse files Browse the repository at this point in the history
  • Loading branch information
synesthesiam committed Jul 8, 2024
1 parent fc1e3f7 commit e7db821
Show file tree
Hide file tree
Showing 6 changed files with 89 additions and 21 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Changelog

## 2.0.0

- Update to Silero VAD v5
- Audio size **must** be 512 samples in `process_chunk`
- Add `process_chunks` to handle longer audio

## 1.0.0

- Initial version
77 changes: 62 additions & 15 deletions pysilero_vad/__init__.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,25 @@
import logging
from pathlib import Path
from typing import Final, Union
from typing import Final, Iterable, Union

import numpy as np
import onnxruntime

_RATE: Final = 16000
_RATE: Final = 16000 # Hz
_MAX_WAV: Final = 32767
_DIR = Path(__file__).parent
_DEFAULT_ONNX_PATH = _DIR / "models" / "silero_vad.onnx"
_CONTEXT_SIZE: Final = 64 # samples of context carried over from the previous chunk (16Khz)
_CHUNK_SAMPLES: Final = 512
_CHUNK_BYTES: Final = _CHUNK_SAMPLES * 2 # 16-bit

_LOGGER = logging.getLogger()


class InvalidChunkSizeError(Exception):
"""Error raised when chunk size is not correct."""


class SileroVoiceActivityDetector:
"""Detects speech/silence using Silero VAD.
Expand All @@ -30,31 +37,71 @@ def __init__(self, onnx_path: Union[str, Path] = _DEFAULT_ONNX_PATH) -> None:
onnx_path, providers=["CPUExecutionProvider"], sess_options=opts
)

self._h = np.zeros((2, 1, 64)).astype("float32")
self._c = np.zeros((2, 1, 64)).astype("float32")
self._context = np.zeros((1, _CONTEXT_SIZE), dtype=np.float32)
self._state = np.zeros((2, 1, 128), dtype=np.float32)
self._sr = np.array(_RATE, dtype=np.int64)

@staticmethod
def chunk_samples() -> int:
"""Return number of samples required for an audio chunk."""
return _CHUNK_SAMPLES

@staticmethod
def chunk_bytes() -> int:
"""Return number of bytes required for an audio chunk."""
return _CHUNK_BYTES

def reset(self) -> None:
    """Reset all state carried over between audio chunks.

    Call this before feeding audio from a new, unrelated stream so that
    nothing from the previous stream influences the next prediction.
    """
    # Recurrent tensors fed back into the model on every call.
    self._h = np.zeros((2, 1, 64)).astype("float32")
    self._c = np.zeros((2, 1, 64)).astype("float32")
    self._state = np.zeros((2, 1, 128)).astype("float32")

    # process_chunk prepends the tail of the previous chunk to the next
    # one. Without clearing it here, the first chunk after a reset would
    # still be prefixed with samples from the old stream. zeros_like keeps
    # the buffer's existing shape and dtype.
    self._context = np.zeros_like(self._context)

def __call__(self, audio: bytes) -> float:
"""Return probability of speech in audio [0-1].
"""Return probability of speech [0-1] in a single audio chunk.
Audio must be 16Khz 16-bit mono PCM.
Audio *must* be 512 samples of 16Khz 16-bit mono PCM.
"""
return self.process_chunk(audio)

def process_chunk(self, audio: bytes) -> float:
"""Return probability of speech [0-1] in a single audio chunk.
Audio *must* be 512 samples of 16Khz 16-bit mono PCM.
"""
if len(audio) != _CHUNK_BYTES:
# Window size is fixed at 512 samples in v5
raise InvalidChunkSizeError

audio_array = np.frombuffer(audio, dtype=np.int16).astype(np.float32) / _MAX_WAV

# Add batch dimension
audio_array = np.expand_dims(audio_array, 0)
# Add batch dimension and context
audio_array = np.concatenate(
(self._context, audio_array[np.newaxis, :]), axis=1
)
self._context = audio_array[:, -_CONTEXT_SIZE:]

# ort_inputs = {"input": audio_array, "state": self._state, "sr": self._sr}
ort_inputs = {
"input": audio_array,
"h": self._h,
"c": self._c,
"sr": np.array(_RATE, dtype=np.int64),
"input": audio_array[:, : _CHUNK_SAMPLES + _CONTEXT_SIZE],
"state": self._state,
"sr": self._sr,
}
ort_outs = self.session.run(None, ort_inputs)
out, self._h, self._c = ort_outs
out, self._state = ort_outs

return out.squeeze()

def process_chunks(self, audio: bytes) -> Iterable[float]:
    """Yield the probability of speech [0-1] for each complete chunk of audio.

    Audio must be 16Khz 16-bit mono PCM. It is split into consecutive
    chunks of ``_CHUNK_BYTES`` bytes, and each chunk is passed to
    ``process_chunk`` in order. Trailing bytes that do not fill a whole
    chunk are ignored.

    This is a generator: nothing runs (including the size check) until
    the result is iterated.

    Raises:
        InvalidChunkSizeError: If ``audio`` is shorter than one chunk.
    """
    num_audio_bytes = len(audio)
    if num_audio_bytes < _CHUNK_BYTES:
        # Window size is fixed at 512 samples in v5
        raise InvalidChunkSizeError

    # The last valid start offset is the one that leaves exactly one full
    # chunk. The bound is inclusive so the final chunk is not dropped when
    # the audio length is an exact multiple of the chunk size.
    last_start = num_audio_bytes - _CHUNK_BYTES
    for audio_idx in range(0, last_start + 1, _CHUNK_BYTES):
        yield self.process_chunk(audio[audio_idx : audio_idx + _CHUNK_BYTES])
Binary file modified pysilero_vad/models/silero_vad.onnx
Binary file not shown.
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
onnxruntime>=1.10.0,<2
numpy<1.26
onnxruntime>=1.18.0,<2
numpy<2
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@

setup(
name="pysilero_vad",
version="1.0.0",
version="2.0.0",
description="Pre-packaged voice activity detector using silero-vad",
long_description=long_description,
long_description_content_type="text/markdown",
Expand Down
21 changes: 18 additions & 3 deletions tests/test_vad.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
from pathlib import Path
from typing import Union

from pysilero_vad import SileroVoiceActivityDetector
import pytest
from pysilero_vad import SileroVoiceActivityDetector, InvalidChunkSizeError

_DIR = Path(__file__).parent

Expand All @@ -20,10 +21,24 @@ def _load_wav(wav_path: Union[str, Path]) -> bytes:
def test_silence() -> None:
"""Test VAD on recorded silence."""
vad = SileroVoiceActivityDetector()
assert vad(_load_wav(_DIR / "silence.wav")) < 0.5
assert all(p < 0.5 for p in vad.process_chunks(_load_wav(_DIR / "silence.wav")))


def test_speech() -> None:
"""Test VAD on recorded speech."""
vad = SileroVoiceActivityDetector()
assert vad(_load_wav(_DIR / "speech.wav")) >= 0.5
assert any(p >= 0.5 for p in vad.process_chunks(_load_wav(_DIR / "speech.wav")))

def test_invalid_chunk_size() -> None:
"""Test that chunk size must be 512 samples."""
vad = SileroVoiceActivityDetector()

# Should work
vad(bytes(SileroVoiceActivityDetector.chunk_bytes()))

# Should fail
with pytest.raises(InvalidChunkSizeError):
vad(bytes(SileroVoiceActivityDetector.chunk_bytes() * 2))

with pytest.raises(InvalidChunkSizeError):
vad(bytes(SileroVoiceActivityDetector.chunk_bytes() // 2))

0 comments on commit e7db821

Please sign in to comment.