# Add-on packages
import numpy as np
# Short-time Fourier transform
from librosa import stft  # If this import fails, uninstall and reinstall soundfile: "pip uninstall soundfile" then "pip install soundfile"
from librosa.feature import melspectrogram
# Project packages
from library.conversions import ms_to_samples
from library.audio_io import read_wav
from library.endpointer import speech_detector
from library.visualize import plot_spectrogram


def format_training_features(utterences, corpus, encoder, adv_ms, len_ms):
    """
    description
        Formats features and labels for training.
        Utterances must be given in increasing categorical order,
        i.e. index 0 of utterences contains all audio files that
        correspond to the first category in the corpus, index 1
        the second category, and so on.
    parameters
        utterences - audio files containing the utterances from which
            features will be extracted, grouped by category
        corpus - the corpus used to maintain categories and speakers
        encoder - strategy for how labels should be encoded
        adv_ms - frame advance in milliseconds
        len_ms - frame length in milliseconds
    returns
        samples - all samples
        labels - encoded labels, in corresponding order to the samples
    """
    samples = []
    labels = []
    current_category = 0
    for speaker_utterences in utterences:
        speaker_id = corpus.category_to_speaker(current_category)
        for file in speaker_utterences:
            # Retrieve features and labels from the audio file
            features, new_labels = get_features(file, adv_ms, len_ms, speaker_id, debug=False)
            # Format labels to be accepted by the encoder and produce encoded labels
            new_labels = encoder.transform(new_labels.reshape(-1, 1)).A
            samples.append(features)
            labels.append(new_labels)
        # Move on to the next category
        current_category += 1

    # Concatenate to match the format expected for training.
    # Concatenating once at the end saves on memory allocation operations.
    samples = np.concatenate(samples)
    labels = np.concatenate(labels)
    return samples, labels
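

# Usage sketch (an assumption, not part of the original pipeline): the
# trailing .A in format_training_features implies encoder.transform()
# returns a SciPy sparse matrix, as a scikit-learn OneHotEncoder does by
# default.  Everything below (n_speakers, the file lists, the corpus
# object) is a hypothetical placeholder, not this project's actual data.
#
#   from sklearn.preprocessing import OneHotEncoder
#
#   n_speakers = 3  # hypothetical number of speakers/categories
#   # Fit the encoder on the full set of speaker labels that get_features will emit
#   encoder = OneHotEncoder()
#   encoder.fit(np.arange(n_speakers, dtype=float).reshape(-1, 1))
#
#   # One list of audio files per category, in increasing categorical order:
#   # utterences = [["spk0_a.wav", "spk0_b.wav"], ["spk1_a.wav"], ["spk2_a.wav"]]
#   # samples, labels = format_training_features(utterences, corpus, encoder,
#   #                                            adv_ms=10, len_ms=20)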


def get_features(filename: str, adv_ms: float, len_ms: float, label: int,
                 spectral_means_subtraction=True,
                 feature="dB",
                 debug=False):
    """
    Extract frame-level spectral features for the speech portions of an audio file.

    :param filename: path of the audio file to read
    :param adv_ms: frame advance in milliseconds
    :param len_ms: frame length in milliseconds
    :param label: speaker number
    :param spectral_means_subtraction: apply spectral means subtraction (noise reduction)
    :param feature: feature to be computed:
        dB - spectrogram log mag^2 power in dB
        mel - perceptually filtered spectrogram using a Mel filterbank;
              the number of Mel filters depends on the sample rate
              (18 for Fs <= 8000 Hz, 24 otherwise)
    :param debug: if True, plot the spectrogram with speech regions marked
    :return: (features, labels) - speech frames as examples X features,
        and a speaker label for each frame
    """
    data = read_wav(filename)  # Read data into memory

    # Find where the speech is
    speechI = speech_detector(data.samples, data.Fs, adv_ms, len_ms)

    # Convert frame advance and length from milliseconds to samples
    adv_N = ms_to_samples(data.Fs, adv_ms)
    len_N = ms_to_samples(data.Fs, len_ms)
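    # For example, assuming ms_to_samples converts a duration to a sample
    # count at the given rate, adv_ms=10 and len_ms=20 at Fs=16000 would
    # give adv_N=160 and len_N=320 samples (hypothetical values).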
    # Convert samples to floating point (librosa requirement) and
    # compute the discrete Fourier transform of each frame.
    # Default uses a Hann window.
    pcm = data.samples.astype(float).T

    # stft returns the complex DFT of each frame
    eps = 1e-6  # add a small epsilon to prevent log 0
    specgram = stft(y=pcm, hop_length=adv_N,
                    win_length=len_N, n_fft=len_N)
    # Magnitude spectrogram
    mag_specgram = np.abs(specgram)

    if feature == "dB":
        # Convert magnitude to dB: 20 log10 |X| = 10 log10 |X|^2
        specgram = 20 * np.log10(mag_specgram + eps)
        features = specgram
    elif feature == "mel":
        # Apply a Mel filterbank to the magnitude spectrogram
        filters = 18 if data.Fs <= 8000 else 24
        melgram = melspectrogram(S=mag_specgram, sr=data.Fs,
                                 n_mels=filters, n_fft=len_N)
        # Convert to dB
        melgram = 10 * np.log10(melgram + eps)
        features = melgram
    else:
        raise ValueError(f"Bad feature specification {feature}")
    features = np.squeeze(features)  # Remove singleton dimension

    # Spectrogram is now frequency X frames
    noiseI = np.logical_not(speechI)  # indicator for noise frames

    # Spectral means subtraction (noise reduction technique)
    if spectral_means_subtraction:
        # Find the mean of each frequency bin across the noise regions and
        # normalize the spectrogram
        mean = np.mean(features[:, noiseI], axis=1)
        # We can only subtract a vector if it matches the last axis,
        # so we transpose to frames X frequency, subtract the mean,
        # and transpose back
        features = (features.T - mean).T

    if debug:
        # Need to fix for Mel frequencies; currently displays everything as linear
        plot_spectrogram(features, adv_ms, len_ms, speechI)

    # Delta features approximate derivatives of the features.
    # This is not used in this set of experiments.
    delta = 0
    if delta:
        # Only implemented for delta == 1; will probably break with longer deltas
        # Compute the difference +/- delta steps
        padded = np.pad(features, ((0,), (delta,)), mode='edge')
        slope = padded[:, delta + 1:] - padded[:, 0:padded.shape[1] - (delta + 1)]
        features = np.concatenate((features, slope), axis=0)

    # Drop portions of the spectrogram that are associated with noise
    speech_features = np.delete(features, noiseI, axis=1)

    # Create label array - there must be a speaker label for each
    # frame of speech associated with the speaker
    labels = np.ones(speech_features.shape[1]) * label

    # TensorFlow expects examples X features (frequencies)
    return speech_features.T, labels
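

# Minimal usage sketch (an assumption, not part of the original experiments):
# extract features from a single WAV file given on the command line and report
# the resulting shape.  The frame parameters and feature type below are
# illustrative choices, not values mandated by this project.
if __name__ == "__main__":
    import sys

    if len(sys.argv) > 1:
        wav_path = sys.argv[1]  # hypothetical path to a speech recording
        feats, labs = get_features(wav_path, adv_ms=10, len_ms=20, label=0,
                                   feature="mel", debug=False)
        print(f"Extracted {feats.shape[0]} speech frames x "
              f"{feats.shape[1]} features from {wav_path}")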