Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimized Background Noise Augmentation for Large Background Files #360

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 24 additions & 22 deletions audiomentations/augmentations/add_background_noise.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import numpy as np
from numpy.typing import NDArray
import librosa

from audiomentations.core.audio_loading_utils import load_sound_file
from audiomentations.core.transforms_interface import BaseWaveformTransform
Expand Down Expand Up @@ -67,6 +68,7 @@ def __init__(
:param p: The probability of applying this transform
:param lru_cache_size: Maximum size of the LRU cache for storing noise files in memory
"""

super().__init__(p)
self.sound_file_paths = find_audio_files_in_paths(sounds_path)
self.sound_file_paths = [str(p) for p in self.sound_file_paths]
Expand All @@ -92,42 +94,42 @@ def __init__(
AddBackgroundNoise._load_sound
)
self.noise_transform = noise_transform
self.time_info_arr = np.zeros(shape = (len(self.sound_file_paths,)),dtype=np.float32)
self.time_info_arr.fill(-1.0)

@staticmethod
def _load_sound(file_path, sample_rate, offset=0.0, duration=None):
    """
    Load a sound file, or only a segment of it, via load_sound_file.

    :param file_path: Path of the audio file to load
    :param sample_rate: Target sample rate, forwarded to load_sound_file
    :param offset: Where in the file to start reading, in seconds. Forwarded to
        load_sound_file. The default 0.0 starts at the beginning of the file.
    :param duration: How much audio to read, in seconds. Forwarded to
        load_sound_file. The default None reads to the end of the file.
    :return: Whatever load_sound_file returns; callers unpack it as a
        (samples, sample_rate) pair
    """
    # NOTE(review): __init__ appears to wrap this function in an LRU cache. With a
    # randomly drawn float offset in the arguments, cache hits are presumably rare
    # - confirm the cache is still useful for segment loads.
    return load_sound_file(file_path, sample_rate, offset=offset, duration=duration)

def randomize_parameters(self, samples: NDArray[np.float32], sample_rate: int):
    """
    Draw the random parameters for one application of the transform.

    When the transform should be applied, this stores "snr_db", "rms_db",
    "noise_file_path", "offset" and "duration" in self.parameters. "offset" and
    "duration" are expressed in seconds and describe the segment of the noise file
    to load, so the whole noise file never has to be read into memory here.

    :param samples: The input audio the noise will be added to
    :param sample_rate: Sample rate of the input audio
    """
    super().randomize_parameters(samples, sample_rate)

    if not self.parameters["should_apply"]:
        return

    self.parameters["snr_db"] = random.uniform(self.min_snr_db, self.max_snr_db)
    self.parameters["rms_db"] = random.uniform(
        self.min_absolute_rms_db, self.max_absolute_rms_db
    )

    # Pick by index (rather than random.choice) because the same index is needed
    # to look up the cached duration of the chosen file.
    file_idx = random.randint(0, len(self.sound_file_paths) - 1)
    self.parameters["noise_file_path"] = self.sound_file_paths[file_idx]

    # time_info_arr holds -1.0 for every file whose duration has not been measured
    # yet. Measure it on first use only and reuse the stored value afterwards.
    if self.time_info_arr[file_idx] < 0.0:
        self.time_info_arr[file_idx] = librosa.get_duration(
            path=self.parameters["noise_file_path"]
        )

    noise_file_seconds = float(self.time_info_arr[file_idx])
    signal_seconds = len(samples) / sample_rate

    # If the noise file is shorter than the signal, the only valid offset is 0.0.
    min_noise_offset = 0.0
    max_noise_offset = max(0.0, noise_file_seconds - signal_seconds)

    self.parameters["offset"] = random.uniform(min_noise_offset, max_noise_offset)
    self.parameters["duration"] = signal_seconds

def apply(self, samples: NDArray[np.float32], sample_rate: int):

def apply(self, samples: NDArray[np.float32], sample_rate: int) -> NDArray[np.float32]:
noise_sound, _ = self._load_sound(
self.parameters["noise_file_path"], sample_rate
noise_sound,_ = self._load_sound(
self.parameters["noise_file_path"], sample_rate,offset=self.parameters['offset'],duration=self.parameters['duration']
)
noise_sound = noise_sound[
self.parameters["noise_start_index"] : self.parameters["noise_end_index"]
]

if self.noise_transform:
noise_sound = self.noise_transform(noise_sound, sample_rate)
Expand Down
5 changes: 2 additions & 3 deletions audiomentations/core/audio_loading_utils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import warnings

import librosa
import numpy as np


def load_sound_file(file_path, sample_rate, mono=True, resample_type="auto"):
def load_sound_file(file_path, sample_rate, mono=True, resample_type="auto", offset = 0.0, duration = None):
"""
Load an audio file as a floating point time series. Audio will be automatically
resampled to the given sample rate.
Expand All @@ -17,7 +16,7 @@ def load_sound_file(file_path, sample_rate, mono=True, resample_type="auto"):
"""
file_path = str(file_path)
samples, actual_sample_rate = librosa.load(
str(file_path), sr=None, mono=mono, dtype=np.float32
str(file_path), sr=None, mono=mono, dtype=np.float32, offset = offset, duration = duration
)

if sample_rate is not None and actual_sample_rate != sample_rate:
Expand Down