diff --git a/docs/build/doctrees/environment.pickle b/docs/build/doctrees/environment.pickle new file mode 100644 index 0000000..26e3304 Binary files /dev/null and b/docs/build/doctrees/environment.pickle differ diff --git a/docs/build/doctrees/getting_started.doctree b/docs/build/doctrees/getting_started.doctree new file mode 100644 index 0000000..ab9bb8a Binary files /dev/null and b/docs/build/doctrees/getting_started.doctree differ diff --git a/docs/build/doctrees/index.doctree b/docs/build/doctrees/index.doctree new file mode 100644 index 0000000..5d22ff1 Binary files /dev/null and b/docs/build/doctrees/index.doctree differ diff --git a/docs/build/doctrees/index_pitchshift.doctree b/docs/build/doctrees/index_pitchshift.doctree new file mode 100644 index 0000000..ad1b360 Binary files /dev/null and b/docs/build/doctrees/index_pitchshift.doctree differ diff --git a/docs/build/doctrees/index_tsm.doctree b/docs/build/doctrees/index_tsm.doctree new file mode 100644 index 0000000..2f41e2d Binary files /dev/null and b/docs/build/doctrees/index_tsm.doctree differ diff --git a/docs/build/doctrees/index_utils.doctree b/docs/build/doctrees/index_utils.doctree new file mode 100644 index 0000000..e8b6c74 Binary files /dev/null and b/docs/build/doctrees/index_utils.doctree differ diff --git a/docs/build/html/.buildinfo b/docs/build/html/.buildinfo new file mode 100644 index 0000000..147fd86 --- /dev/null +++ b/docs/build/html/.buildinfo @@ -0,0 +1,4 @@ +# Sphinx build info version 1 +# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. +config: 333cbe205267b0df993670ab9c9c68af +tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/docs/build/html/_modules/index.html b/docs/build/html/_modules/index.html new file mode 100644 index 0000000..04aa94a --- /dev/null +++ b/docs/build/html/_modules/index.html @@ -0,0 +1,112 @@ + + +
+ + +
+"""
+Description: libtsm pitch-shifting functions
+Contributors: Sebastian Rosenzweig, Simon Schwär, Jonathan Driedger, Meinard Müller
+License: The MIT license, https://opensource.org/licenses/MIT
+This file is part of libtsm (https://www.audiolabs-erlangen.de/resources/MIR/2021-DAFX-AdaptivePitchShifting)
+"""
+
+import numpy as np
+import scipy as sc
+import scipy.signal
+import scipy.interpolate
+from fractions import Fraction as frac
+from .tsm import hps_tsm
+from .utils import normalize_length
+
+
def pitch_shift_original(x, n, Fs=22050) -> np.ndarray:
    """
    Fixed pitch shift via time-scale modification followed by resampling.

    The signal is first stretched with `hps_tsm` by the factor ``(2 ** (1 / 12)) ** (n / 100)`` and then resampled
    with a polyphase filter using a rational approximation (denominator <= 100) of ``Fs / round(alpha * Fs)``.
    The code closely follows the Matlab implementation.

    Parameters
    ----------
    x : np.ndarray [shape=(N, )], real - valued
        Signal to be transformed

    n : int
        Amount of pitch shifting to be applied, given in cents. Positive n indicates pitch rising, negative n a pitch
        lowering

    Fs : int
        Sampling rate of the input audio signal x

    Returns
    -------
    y : np.ndarray, real - valued
        The pitch-shifted output signal, as returned by `normalize_length` for a target length of N samples
    """
    signal = x.reshape(-1, 1) if len(x.shape) == 1 else x

    # stretch factor corresponding to a shift of n cents
    semitone_ratio = np.power(2, 1 / 12)
    alpha = np.power(semitone_ratio, (n / 100))
    stretched = hps_tsm(signal, alpha, Fs=Fs)

    # rational resampling ratio up/down that undoes the change in duration
    ratio = frac(str(Fs / np.around(alpha * Fs))).limit_denominator(100)
    up = int(ratio.numerator)
    down = int(ratio.denominator)
    resampled = sc.signal.resample_poly(stretched, up, down, axis=0)  # deviations from Matlab's resample()

    return normalize_length(resampled, signal.shape[0])
+
+
def pitch_shift(x, p, t_p=None, Fs=22050, order="res-tsm", **kwargs) -> np.ndarray:
    """
    (Non-linear) pitch-shifting via time-scale modification and resampling.

    Parameters
    ----------
    x : np.ndarray [shape=(N, )], real - valued
        Signal to be transformed. For 2-D input only the first column is resampled.

    p : float or array-like [shape=(M,)], real - valued
        Amount of pitch shifting to be applied, given in cents. Positive p indicates pitch rising, negative p a pitch
        lowering.

    t_p : array-like [shape=(M,)], real - valued
        Array of time instances in seconds for adaptive pitch shifting, same length as p. If t_p is None, a fixed
        pitch-shift is assumed. If the time axis does not start at 0 or does not end at the last sample of x,
        an anchor with a shift of 0 cents is added at the missing end.

    Fs : int
        Sampling rate of the input audio signal x

    order : Order of TSM and resampling, either "res-tsm" or "tsm-res".

    **kwargs : Parameters for hps_tsm

    Returns
    -------
    y : np.ndarray [shape=(L,1)], real - valued
        The pitch-shifted output signal, cropped to at most N samples

    Raises
    ------
    ValueError
        If p is an array and t_p is missing or of different length, or if order is not one of the two
        supported values.
    """
    x = np.asarray(x)
    if x.ndim == 1:
        x = x.reshape(-1, 1)

    num_samples = x.shape[0]
    t_x = np.linspace(0, (num_samples - 1) / Fs, num_samples)

    # treat 0-d arrays like plain scalars
    if isinstance(p, np.ndarray) and p.ndim == 0:
        p = p.item()
    fixed_shift = np.isscalar(p)

    if not fixed_shift:
        if t_p is None:
            raise ValueError("t must be specified if p is an array!")
        if len(p) != len(t_p):
            raise ValueError("t must have the same length as p!")

        # accept any array-like (lists, tuples, integer arrays)
        p = np.asarray(p, dtype=float)
        t_p = np.asarray(t_p, dtype=float)

        if t_p[0] != 0:  # time axis should start with 0
            t_p = np.insert(t_p, 0, 0)
            p = np.insert(p, 0, 0)
        if t_p[-1] != t_x[-1]:  # time axis should end with the last time instance
            t_p = np.append(t_p, t_x[-1])
            p = np.append(p, 0)

    # account for sign change when order of resampling and TSM is exchanged
    if order == "res-tsm":
        alpha = 2 ** (-p / 1200)
    elif order == "tsm-res":
        alpha = 2 ** (p / 1200)
    else:
        raise ValueError("Order must be either res-tsm or tsm-res!")

    # convert pitch shift in cents to (non-linear) time-stretch function tau, given in seconds
    if fixed_shift:
        tau = np.array([[0, 0], [num_samples - 1, num_samples * alpha - 1]]) / Fs
    else:
        tau = np.zeros((len(alpha), 2))
        tau[:, 0] = t_p
        # each segment [t_p[i-1], t_p[i]] is stretched by alpha[i-1]; accumulate the stretched durations
        tau[1:, 1] = np.cumsum(np.diff(t_p) * alpha[:-1])

    # anchor points in samples; for "res-tsm" the TSM has to undo the stretch, so source and target are swapped
    anchor_points = np.ceil(tau * Fs).astype(int)
    if order == "res-tsm":
        anchor_points = np.flip(anchor_points, axis=1)
    # only keep unique target indices
    _, first_occurrence = np.unique(anchor_points[:, 1], return_index=True)
    anchor_points = anchor_points[first_occurrence, :]

    if order == "res-tsm":
        # (Non-linear) resampling
        warp = sc.interpolate.interp1d(tau[:, 0], tau[:, 1], kind='linear', fill_value="extrapolate")
        time_input = warp(t_x)
        resampler = sc.interpolate.interp1d(time_input, x[:, 0], kind='cubic', fill_value="extrapolate")
        t_res = np.arange(0, tau[-1, 1] + 1 / Fs, 1 / Fs)
        y_ps = resampler(t_res)

        # Time-Scale Modification
        y_ps = hps_tsm(y_ps, anchor_points, Fs=Fs, **kwargs)

    else:  # order == "tsm-res"
        # Time-Scale Modification
        y_tsm = hps_tsm(x, anchor_points, Fs=Fs, **kwargs)

        # (Non-linear) resampling
        time_output = np.linspace(0, (y_tsm.shape[0] - 1) / Fs, y_tsm.shape[0])
        warp = sc.interpolate.interp1d(tau[:, 1], tau[:, 0], kind='linear', fill_value="extrapolate")
        time_input = warp(time_output)
        resampler = sc.interpolate.interp1d(time_input, y_tsm[:, 0], kind='cubic', fill_value="extrapolate")
        y_ps = resampler(t_x)

    # crop if pitch-shifted signal is longer than x
    return y_ps.reshape(-1, 1)[:num_samples, :]
+
+
+"""
+Description: libtsm time-scale modification functions
+Contributors: Sebastian Rosenzweig, Simon Schwär, Jonathan Driedger, Meinard Müller
+License: The MIT license, https://opensource.org/licenses/MIT
+This file is part of libtsm (https://www.audiolabs-erlangen.de/resources/MIR/2021-DAFX-AdaptivePitchShifting)
+"""
+
+import numpy as np
+import scipy.interpolate
+from .utils import win, stft, istft, cross_corr, hps, find_peaks
+
+
def pv_tsm(x, alpha, syn_hop=512, win_length=2048, win_beta=1, Fs=22050, zero_pad=0, restore_energy=False,
           fft_shift=False, phase_locking=False) -> np.ndarray:
    """Time scale modification based on a phase vocoder

    Rescales the time-axis of the input signal x according to the time-stretch function alpha
    without altering the pitch of x.

    Parameters
    ----------
    x : np.ndarray [shape=(N, C)], real-valued
        Signal to be transformed, second dimension is an optional channel dimension

    alpha : float or np.ndarray [shape=(S, 2)]
        Time stretch function, given by a constant (float) or a set of S anchor points (int).
        A valid anchor point sequence
        (1) contains only non-negative values,
        (2) both sequences along the first axis are strictly increasing,
        (3) starts with position (m, 0), where m is an integer >= 0.
        These conditions are checked with assertions.
        See `libtsm.ensure_validity` for one way to ensure that condition (2) is fulfilled.

    syn_hop : int
        hop size of the synthesis window

    win_length : int
        length of analysis and synthesis window for STFT

    win_beta : int
        exponent of sin^beta window

    Fs : int
        Sampling rate of the input audio signal x

    zero_pad : int
        For FFT. Number of zeros padded to the window to increase the fft size

    restore_energy : bool
        For FFT. When True, rescales every windowed synthesis frame to compensate for synthesis energy leakage

    fft_shift : bool
        For FFT. When True, applies a circular shift to each frame of half its length, prior to computing the FFT

    phase_locking : bool
        when True, applies identity phase locking

    Returns
    -------
    y : np.ndarray [shape=(L, C)], real - valued
        The time-scale modified output signal
    """
    # Only the length of the zero-padded window is needed in this function
    half_pad = int(np.floor(zero_pad / 2))
    fft_len = len(win(win_length, win_beta)) + 2 * half_pad
    fft_len_half = fft_len // 2

    if len(x.shape) == 1:
        x = x.reshape(-1, 1)
    num_of_chan = x.shape[1]

    # Time-stretch function as anchor points (input position, output position)
    if np.isscalar(alpha):
        last_in = int(x.shape[0]) - 1
        last_out = int(np.ceil(alpha * x.shape[0])) - 1
        anchor_points = np.array([[0, 0], [last_in, last_out]])
    else:
        anchor_points = alpha.astype(int)

    output_length = anchor_points[-1, 1] + 1
    # positions of the synthesis windows in the output
    syn_win_pos = np.arange(0, output_length + fft_len_half, syn_hop)

    # map every synthesis position back to the input to obtain the analysis positions
    out_to_in = scipy.interpolate.interp1d(anchor_points[:, 1], anchor_points[:, 0], kind='linear',
                                           fill_value='extrapolate')
    ana_win_pos = np.round(out_to_in(syn_win_pos)).astype(int)
    ana_hop = np.append([0], np.diff(ana_win_pos))  # analysis hop sizes

    # check conditions
    assert anchor_points[0, 1] == 0, "First sample for target sequence must be zero."
    assert anchor_points[0, 0] >= 0, "All anchor points must be non-negative."
    assert np.all(ana_hop[1:] > 0), "The anchor point sequences must be stricly increasing for both source and target."

    # center frequencies of the first fft_len/2+1 bins in 'oscillations per frame' and the
    # corresponding phase advances per sample
    bins = np.arange(0, fft_len // 2 + 1)
    omega = 2 * np.pi * bins / fft_len

    y = np.zeros((output_length, num_of_chan))

    for c in range(num_of_chan):
        X, _, _ = stft(x[:, c], ana_hop=ana_win_pos, win_length=win_length, win_beta=win_beta, Fs=Fs,
                       num_of_frames=-1, fft_shift=fft_shift, zero_pad=zero_pad)

        # Phase adaptation: Y is the spectrogram of the output, initialised with the first input frame
        Y = np.zeros(X.shape, dtype=complex)
        Y[:, 0] = X[:, 0]

        for i in range(1, X.shape[1]):
            hop_in = ana_hop[i]
            ph_curr = np.angle(X[:, i])  # phases of the current input frame
            ph_last = np.angle(X[:, i - 1])  # phases of the last input frame

            # heterodyned phase increments: measured minus expected phase advance, reduced to -pi:pi
            hpi = (ph_curr - ph_last) - omega * hop_in
            hpi = hpi - 2 * np.pi * np.round(hpi / (2 * np.pi))

            # instantaneous phase advances per synthesis hop
            ipa_hop = (omega + hpi / hop_in) * syn_hop
            ph_syn = np.angle(Y[:, i - 1])  # phases of the last synthesized frame

            # theta rotates the phases of the current input frame such that no phase discontinuities occur
            # when re-synthesizing the resulting spectrogram with the synthesis hop size
            if phase_locking:
                # identity phase locking: all bins in the region of influence of a spectral peak are
                # rotated by the angle computed for the peak's bin
                peaks, region_start, region_end = find_peaks(X[:, i])
                theta = np.zeros(Y[:, i].shape)
                for n, peak in enumerate(peaks):
                    theta[region_start[n]:region_end[n]] = ph_syn[peak] + ipa_hop[peak] - ph_curr[peak]
            else:
                # standard phase vocoder: the phase continuity of every bin is preserved separately
                theta = ph_syn + ipa_hop - ph_curr

            Y[:, i] = np.exp(1j * theta) * X[:, i]

        # ISTFT
        y_c = istft(Y, syn_hop=syn_hop, win_length=win_length, win_beta=win_beta, zero_pad=zero_pad, num_of_iter=1,
                    orig_sig_len=output_length, restore_energy=restore_energy, fft_shift=fft_shift)
        y[:, c] = y_c[:, 0]

    return y
+
+
def wsola_tsm(x, alpha, syn_hop=512, win_length=1024, win_beta=2, tol=512) -> np.ndarray:
    """
    Waveform Similarity Overlap and Add (WSOLA) algorithm that rescales the time-axis of the input signal x
    according to the time-stretch function alpha without altering the pitch of x.

    Parameters
    ----------
    x : np.ndarray [shape=(N,num_of_chan)], real - valued
        Signal to be transformed

    alpha : float or np.ndarray [shape=(S, 2)]
        Time stretch function, given by a constant (float) or a set of S anchor points (int).
        A valid anchor point sequence
        (1) contains only non-negative values,
        (2) both sequences along the first axis are strictly increasing,
        (3) starts with position (m, 0), where m is an integer >= 0.
        These conditions are checked with assertions.
        See `libtsm.ensure_validity` for one way to ensure that condition (2) is fulfilled.

    syn_hop : int
        hop size of the synthesis window

    win_length : int
        length of the analysis and synthesis window

    win_beta : int
        exponent of sin^beta window

    tol : int
        Amount of samples the window will be shifted to avoid phase discontinuities when overlap-adding to form the
        output signal

    Returns
    -------
    y : np.ndarray [shape=(L,num_of_chan)], real - valued
        The time-scale modified output signal
    """
    # Pre-calculations
    w = win(win_length, win_beta)
    win_len = len(w)
    win_len_half = int(np.around(win_len / 2))

    if len(x.shape) == 1:
        x = x.reshape(-1, 1)
    num_of_chan = x.shape[1]

    # Time-stretch function as anchor points (input position, output position)
    if np.isscalar(alpha):
        anchor_points = np.array([[0, 0], [int(x.shape[0]) - 1, int(np.ceil(alpha * x.shape[0])) - 1]])
    else:
        anchor_points = alpha.astype(int)

    output_length = anchor_points[-1, 1] + 1
    # positions of the synthesis windows in the output
    syn_win_pos = np.arange(0, output_length + win_len_half, syn_hop)

    out_to_in = scipy.interpolate.interp1d(anchor_points[:, 1], anchor_points[:, 0], kind='linear',
                                           fill_value='extrapolate')
    ana_win_pos = np.round(out_to_in(syn_win_pos)).astype(int)  # positions of the analysis windows in the input
    ana_hop = np.append([0], ana_win_pos[1:] - ana_win_pos[:-1])  # analysis hop sizes

    # check conditions
    assert anchor_points[0, 1] == 0, "First sample for target sequence must be zero."
    assert anchor_points[0, 0] >= 0, "All anchor points must be non-negative."
    assert np.all(ana_hop[1:] > 0), "The anchor point sequences must be stricly increasing for both source and target."

    # WSOLA
    y = np.zeros((output_length, num_of_chan))  # initialize output
    min_fac = np.min(syn_hop / ana_hop[1:])  # the minimal local stretching factor
    # to avoid that we access x outside its range, we need to zero pad it appropriately
    x = np.pad(x, [(win_len_half + tol, int(np.ceil(1 / min_fac)) * win_len + tol), (0, 0)])
    ana_win_pos += tol  # compensate for the extra 'tol' padded zeros at the beginning of x

    frame_offsets = np.arange(win_len)  # sample offsets inside one window
    search_offsets = np.arange(-tol, win_len + tol)  # offsets of the search region incl. tolerance

    for c in range(num_of_chan):  # loop over channels
        x_c = x[:, c]
        y_c = np.zeros((output_length + 2 * win_len, 1))  # output signal (still including padding)
        ow = np.zeros((output_length + 2 * win_len, 1))  # sum of overlapping windows
        delay = 0  # shift of the current analysis window position

        for i in range(len(ana_win_pos) - 1):
            # OLA of the current frame, taken at its nominal position shifted by 'delay'
            syn_ran = syn_win_pos[i] + frame_offsets
            ana_ran = ana_win_pos[i] + delay + frame_offsets
            y_c[syn_ran, 0] += x_c[ana_ran] * w
            ow[syn_ran, 0] += w

            # 'natural progression' of the last copied audio segment
            nat_prog = x_c[ana_ran + syn_hop]
            # region where the next analysis window could be located (including the tolerance region)
            x_next = x_c[ana_win_pos[i + 1] + search_offsets]

            # pick the shift of the next window that maximizes the cross correlation
            cc = cross_corr(x_next, nat_prog, win_len)
            delay = tol - np.argmax(cc)

        # Process the last frame. 'delay' was estimated for the final analysis window, so the frame has to be
        # read at ana_win_pos[-1]; the range is covered by the search region accessed in the last iteration.
        last_syn = syn_win_pos[-1]
        last_ana = ana_win_pos[-1] + delay
        y_c[last_syn:last_syn + win_len, 0] += x_c[last_ana:last_ana + win_len] * w
        ow[last_syn:last_syn + win_len, 0] += w

        # re-normalize the signal by dividing by the added windows
        ow[ow < 10 ** (-3)] = 1  # avoid potential division by zero
        y_c /= ow

        # remove zero-padding at the beginning and at the end
        y_c = y_c[win_len_half:]
        y_c = y_c[0:output_length]

        y[:, c] = y_c[:, 0]

    return y
+
+
def hps_tsm(x, alpha, Fs=22050, hps_ana_hop=256, hps_win_length=1024, hps_win_beta=2, hps_zero_pad=0,
            hps_fil_len_harm=10, hps_fil_len_perc=10, pv_syn_hop=512, pv_win_length=2048, pv_win_beta=2, pv_zero_pad=0,
            pv_restore_energy=False, pv_fft_shift=False, ola_syn_hop=128, ola_win_length=256, ola_win_beta=2) \
        -> np.ndarray:
    """
    Time Scale Modification algorithm based on Harmonic - Percussive separation. The signal is split into a
    harmonic and a percussive component (binary masking). The harmonic component is stretched with the phase
    vocoder (`pv_tsm`, identity phase locking enabled), the percussive component with `wsola_tsm` using a
    tolerance of 0, and the two results are added.

    Parameters
    ----------
    x : np.ndarray [shape=(N, )], real - valued
        Signal to be transformed

    alpha : float or np.ndarray [shape=(S, 2)]
        Time stretch function, given by a constant (float) or a set of S anchor points (int).
        A valid anchor point sequence
        (1) contains only non-negative values,
        (2) both sequences along the first axis are strictly increasing,
        (3) starts with position (m, 0), where m is an integer >= 0.
        These conditions are checked by the underlying TSM functions.
        See `libtsm.ensure_validity` for one way to ensure that condition (2) is fulfilled.

    Fs : int
        Sampling rate

    hps_ana_hop : int
        hop size for HPS

    hps_win_length : int
        window length for HPS

    hps_win_beta : int
        exponent of sin^beta window

    hps_zero_pad : int
        For FFT. Number of zeros padded to the window to increase the fft size

    hps_fil_len_harm: int
        Length of the median filter in time direction.

    hps_fil_len_perc: int
        Length of the median filter in frequency direction.

    pv_syn_hop : int
        hop size for synthesis windows of phase vocoder

    pv_win_length : int
        window length for phase vocoder

    pv_win_beta : int
        exponent of sin^beta window

    pv_zero_pad : int
        phase vocoder zero padding

    pv_restore_energy : bool
        restore energy of signal in phase vocoder

    pv_fft_shift : bool
        fft shift in phase vocoder

    ola_syn_hop : int
        synthesis hop size of OLA

    ola_win_length : int
        window length for OLA

    ola_win_beta : int
        exponent of sin^beta window

    Returns
    -------
    y : np.ndarray, real - valued
        The time-scale modified output signal (sum of the stretched harmonic and percussive components)
    """
    # Harmonic-Percussive Separation
    harmonic, percussive = hps(x, ana_hop=hps_ana_hop, win_length=hps_win_length, win_beta=hps_win_beta, Fs=Fs,
                               zero_pad=hps_zero_pad, fil_len_harm=hps_fil_len_harm,
                               fil_len_perc=hps_fil_len_perc, masking_mode='binary')

    # Phase vocoder with identity phase locking for the harmonic part
    stretched_harmonic = pv_tsm(harmonic, alpha=alpha, syn_hop=pv_syn_hop, win_length=pv_win_length,
                                win_beta=pv_win_beta, Fs=Fs, zero_pad=pv_zero_pad,
                                restore_energy=pv_restore_energy, fft_shift=pv_fft_shift, phase_locking=True)

    # OLA for the percussive part: WSOLA without tolerance region
    stretched_percussive = wsola_tsm(percussive, alpha=alpha, syn_hop=ola_syn_hop, win_length=ola_win_length,
                                     win_beta=ola_win_beta, tol=0)

    # Synthesis
    return stretched_harmonic + stretched_percussive
+
+
def pv_int_tsm(x, alpha, syn_hop=512, win_length=2048, win_beta=2, Fs=22050, zero_pad=-1, restore_energy=False,
               fft_shift=True) -> np.ndarray:
    """
    Phase Vocoder Time scale modification algorithm, that rescales the time-axis of the input signal x
    by an integer factor alpha without altering the pitch of x. The phases of the input spectrogram are
    multiplied by alpha and the result is re-synthesized with a synthesis hop that is alpha times the
    analysis hop.

    Parameters
    ----------
    x : np.ndarray [shape=(N,)] or [shape=(N, C)], real - valued
        Signal to be transformed

    alpha : int
        Time stretch factor, a scalar integer value >= 1

    syn_hop : int
        hop size of the synthesis window; increased to the next multiple of alpha if necessary

    win_length : int
        length of analysis and synthesis window for STFT

    win_beta : int
        exponent of sin^beta window

    Fs : int
        Sampling rate of the input audio signal x

    zero_pad : int
        For FFT. Number of zeros padded to the window to increase the fft size. The default of -1 selects
        ``alpha * win_length // 2``.

    restore_energy : bool
        For FFT. When True, rescales every windowed synthesis frame to compensate for synthesis energy leakage

    fft_shift: bool
        For FFT. When True, applies a circular shift to each frame of half its length, prior to computing the FFT

    Returns
    -------
    y : np.ndarray [shape=(alpha * N, C)], real - valued
        The time-scale modified output signal

    Raises
    ------
    ValueError
        If alpha is not a scalar integer value >= 1.
    """
    # Validate alpha before it is used for any derived quantity (zero padding, hop sizes)
    if not (np.isscalar(alpha) and np.mod(alpha, 1) == 0 and alpha >= 1):
        raise ValueError("alpha needs to be an integer >= 1 !")
    alpha = int(alpha)

    # Pre-Calculations
    window = win(win_length, win_beta)

    if len(x.shape) == 1:
        x = x.reshape(-1, 1)
    num_of_chan = x.shape[1]

    if zero_pad == -1:
        zero_pad = alpha * window.shape[0] // 2

    # length of the zero-padded window
    win_len = window.shape[0] + 2 * int(np.floor(zero_pad / 2))
    win_len_half = int(np.round(win_len / 2))

    # the synthesis hop has to be a multiple of alpha so that the analysis hop is an integer
    remainder = syn_hop % alpha
    if remainder != 0:
        syn_hop = syn_hop + (alpha - remainder)

    output_length = alpha * int(x.shape[0])
    # positions of the synthesis windows in the output and of the analysis windows in the input
    output_window_pos = np.arange(0, output_length + win_len_half, syn_hop)
    input_window_pos = output_window_pos // alpha

    y = np.zeros((output_length, num_of_chan))

    for c in range(num_of_chan):
        # stft
        X, _, _ = stft(x[:, c], ana_hop=input_window_pos, win_length=win_length, win_beta=win_beta, Fs=Fs,
                       num_of_frames=-1, fft_shift=fft_shift, zero_pad=zero_pad)

        # Phase adaption: keep the magnitudes, scale all phases by alpha
        Y = np.abs(X) * np.exp(1j * alpha * np.angle(X))

        # istft
        y_c = istft(Y, syn_hop=syn_hop, win_length=win_length, win_beta=win_beta, Fs=Fs, zero_pad=zero_pad,
                    num_of_iter=1, orig_sig_len=output_length, restore_energy=restore_energy, fft_shift=fft_shift)

        y[:, c] = y_c[:, 0]

    return y
+
+
def two_step_tsm(x, alpha, Fs=22050, order='exact-coarse') -> np.ndarray:
    """
    Time Scale Modification algorithm, where the signal is stretched by the integer and decimal part
    of the time stretch function using two different algorithms: `pv_int_tsm` for the integer ("coarse")
    factor ``max(1, round(alpha))`` and `hps_tsm` for the remaining ("exact") factor.

    Parameters
    ----------
    x : np.ndarray [shape=(N, )] or [shape=(N, num_of_chan)], real - valued
        Signal to be transformed

    alpha : float
        Scalar time stretch factor

    Fs : int
        Sampling rate of the input audio signal x

    order : 'exact-coarse' or 'coarse-exact'
        Decides which of the two time stretching functions will be computed first, coarse corresponding to the integer
        part

    Returns
    -------
    y : np.ndarray [shape=(L,num_of_chan)], real - valued
        The time-scale modified output signal

    Raises
    ------
    ValueError
        If order is not one of the two supported values.
    """
    if order not in ('exact-coarse', 'coarse-exact'):
        raise ValueError("Invalid order!")

    if len(x.shape) == 1:
        x = x.reshape(-1, 1)

    alpha_rough = int(np.max([1, np.round(alpha)]))
    alpha_exact = alpha / alpha_rough

    # The intermediate signal is handed over with all of its channels so that the output keeps num_of_chan
    if order == 'exact-coarse':
        y_exact = hps_tsm(x, alpha_exact, Fs=Fs)
        return pv_int_tsm(y_exact, alpha_rough, Fs=Fs)

    y_coarse = pv_int_tsm(x, alpha_rough, Fs=Fs)
    return hps_tsm(y_coarse, alpha_exact, Fs=Fs)
+
+"""
+Description: libtsm utility functions
+Contributors: Sebastian Rosenzweig, Simon Schwär, Jonathan Driedger, Meinard Müller
+License: The MIT license, https://opensource.org/licenses/MIT
+This file is part of libtsm (https://www.audiolabs-erlangen.de/resources/MIR/2021-DAFX-AdaptivePitchShifting)
+"""
+
+import numpy as np
+import scipy as sc
+import scipy.signal
+from typing import Tuple
+
+
def ensure_validity(alpha, syn_hop=128):
    """Remove points from a sequence of anchor points to ensure a valid TSM input

    This function removes any anchor points that lead to a non-monotonous increase of the analysis window and
    returns a new sequence of anchor points that is a valid TSM input. A point is removed when the rounded slope
    (target difference divided by source difference) towards it from the previous point exceeds `syn_hop`. The
    check is repeated on the reduced sequence until no such point is left; the first point is always kept.

    Note that, depending on the data, this is not a sensible way to ensure a valid TSM input. Often, it may be
    advisable to use coarser and semantically meaningful positions for the anchor points.

    Parameters
    ----------
    alpha : np.ndarray [shape=(S, 2)]
        Time stretch function, given by a set of S anchor points (int).

    syn_hop : int
        (smallest) hop size of the synthesis window (default: 128)

    Returns
    -------
    alpha : np.ndarray [shape=(S', 2)], S' <= S
        The remaining anchor points. The input array itself is returned if nothing had to be removed.
    """
    # Iterate instead of recursing: a long chain of steep points needs one pass per removed point and
    # would otherwise exceed the interpreter's recursion limit.
    while True:
        # repeated source positions give inf/nan slopes, which compare False below and are removed
        with np.errstate(divide='ignore', invalid='ignore'):
            slope = np.round(np.diff(alpha[:, 1]) / np.diff(alpha[:, 0]))

        not_too_steep = np.concatenate(([True], slope <= syn_hop))
        if np.all(not_too_steep):
            return alpha

        alpha = alpha[not_too_steep, :]
+
+
def win(win_len, beta) -> np.ndarray:
    """
    Generates a sin^beta window, i.e. ``sin(pi * n / win_len) ** beta`` for ``n = 0, ..., win_len - 1``.

    Parameters
    ----------
    win_len : int
        length of the window

    beta : int
        Exponent of the window

    Returns
    -------
    w : np.ndarray [shape=(win_len, )]
        The desired window
    """
    sample_idx = np.arange(0, win_len)
    half_sine = np.sin(np.pi * sample_idx / win_len)

    return np.power(half_sine, beta)
+
+
def hps(x, ana_hop=256, win_length=1024, win_beta=2, Fs=22050, zero_pad=0, fil_len_harm=10, fil_len_perc=10,
        masking_mode='binary') -> Tuple[np.ndarray, np.ndarray]:
    """
    Harmonic - Percussive separation using median filters.

    Parameters
    ----------
    x : np.ndarray [shape=(N, )] or [shape=(N, C)], real - valued
        Signal to be transformed

    ana_hop : int
        hop size used for both the STFT analysis and the ISTFT synthesis

    win_length : int
        length of analysis and synthesis window for STFT

    win_beta : int
        exponent of sin^beta window

    Fs : int
        Sampling rate of the input audio signal x

    zero_pad : int
        For FFT. Number of zeros padded to the window to increase the fft size

    fil_len_harm: int
        Length of the median filter in time direction. A shorter filter makes it more likely
        that the signal is interpreted as harmonic.

    fil_len_perc: int
        Length of the median filter in frequency direction. A shorter filter makes it more likely
        that the signal is interpreted as percussive.

    masking_mode : either "binary" or "relative"
        Selects Harmonic Percussive separation masking mode (soft or binary masking)

    Returns
    -------
    x_harm : np.ndarray [shape=(N, C)]
        Harmonic Component of input signal x

    x_perc : np.ndarray [shape=(N, C)]
        Percussive Component of input signal x

    Raises
    ------
    ValueError
        If masking_mode is neither "binary" nor "relative".
    """
    # Reject an unknown mode before any (expensive) spectrogram is computed
    if masking_mode not in ('binary', 'relative'):
        raise ValueError('masking mode must either be "binary" or "relative"!')

    if len(x.shape) == 1:
        x = x.reshape(-1, 1)

    num_of_chan = x.shape[1]

    x_harm = np.zeros(x.shape)  # Initialize output
    x_perc = np.zeros(x.shape)  # Initialize output

    for c in range(num_of_chan):  # loop over channels
        # stft
        spec, _, _ = stft(x[:, c], ana_hop=ana_hop, win_length=win_length, win_beta=win_beta, Fs=Fs,
                          num_of_frames=-1, fft_shift=False, zero_pad=zero_pad)
        mag_spec = np.abs(spec)

        # median filtering along frequency (axis 0) enhances percussive, along time (axis 1) harmonic content
        mag_spec_perc = median_filter(mag_spec, fil_len_perc, 0)
        mag_spec_harm = median_filter(mag_spec, fil_len_harm, 1)

        if masking_mode == 'binary':
            mask_harm = mag_spec_harm > mag_spec_perc
            mask_perc = mag_spec_harm <= mag_spec_perc
        else:  # 'relative'
            # eps keeps the denominator non-zero for silent bins
            total = mag_spec_harm + mag_spec_perc + np.finfo(float).eps
            mask_harm = mag_spec_harm / total
            mask_perc = mag_spec_perc / total

        # istft of the masked spectrograms
        x_harm_c = istft(mask_harm * spec, syn_hop=ana_hop, win_length=win_length, win_beta=win_beta, Fs=Fs,
                         zero_pad=zero_pad, num_of_iter=1, orig_sig_len=x.shape[0], restore_energy=False,
                         fft_shift=False)
        x_perc_c = istft(mask_perc * spec, syn_hop=ana_hop, win_length=win_length, win_beta=win_beta, Fs=Fs,
                         zero_pad=zero_pad, num_of_iter=1, orig_sig_len=x.shape[0], restore_energy=False,
                         fft_shift=False)

        x_harm[:, c] = x_harm_c[:, 0]
        x_perc[:, c] = x_perc_c[:, 0]

    return x_harm, x_perc
+
+
def median_filter(X, filt_len, dim) -> np.ndarray:
    """
    Median filter implementation.

    The input is zero-padded along `dim` with ``floor(filt_len / 2)`` entries before and ``ceil(filt_len / 2)``
    entries after the data; output entry i along `dim` is the median of the padded entries ``i .. i + filt_len - 1``.

    Parameters
    ----------
    X : np.ndarray [2-dimensional]
        Spectrogram
    filt_len : int
        Median filter length
    dim : int
        Dimension in which median filter should be applied (0 or 1)

    Returns
    -------
    Y : np.ndarray
        Median-filtered spectrogram, same shape as X

    Raises
    ------
    ValueError
        If dim is neither 0 nor 1.
    """
    pad_before = int(np.floor(filt_len / 2))
    pad_after = int(np.ceil(filt_len / 2))

    if dim == 0:
        axis = 0
        pad_width = ((pad_before, pad_after), (0, 0))
    elif dim == 1:
        axis = 1
        pad_width = ((0, 0), (pad_before, pad_after))
    else:
        raise ValueError("Invalid dim!")

    X_padded = np.pad(X, pad_width)  # constant zero padding
    Y = np.zeros(X.shape)

    # Views with the filtered axis in front, so that one loop serves both directions;
    # writing into `target` fills Y.
    source = np.moveaxis(X_padded, axis, 0)
    target = np.moveaxis(Y, axis, 0)

    for i in range(target.shape[0]):
        target[i] = np.median(source[i:i + filt_len], axis=0)

    return Y
+
+
def stft(x, ana_hop=2048, win_length=4096, win_beta=2, Fs=22050, num_of_frames=-1, fft_shift=False, zero_pad=0) -> \
        Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Computes the Short-Time Fourier Transform (STFT) of the input audio signal.

    Parameters
    ----------
    x : np.ndarray, real-valued
        Signal to be transformed

    ana_hop : int or np.ndarray
        A scalar is used as hop size of the analysis window; an array is used directly as the sequence of
        analysis window positions (in samples)

    win_length : int
        length of analysis window for STFT

    win_beta : int
        exponent of sin^beta window

    Fs : int
        Sampling rate of the input audio signal x

    num_of_frames : int
        Fixes the number of FFT frames to be computed. A negative value derives the number from the signal
        length (scalar ana_hop) or from the number of given positions (array ana_hop).

    fft_shift: bool
        For FFT. When True, applies a circular shift to each frame of half its length, prior to computing the FFT

    zero_pad : int
        For FFT. Number of zeros padded to the window to increase the fft size

    Returns
    -------
    X : np.ndarray [shape=(K, M + 1)], complex-valued
        The discrete short-time Fourier transform

    f : np.ndarray [shape=(K, )], real-valued
        Center frequencies of all Fourier bins given in Hertz

    t : np.ndarray [shape=(M+1, )], real-valued
        Time instances where the respective Fourier spectra were computed, given in seconds

    """
    # Window, zero-padded symmetrically
    side_zeros = np.zeros(int(np.floor(zero_pad / 2)))
    w = np.concatenate((side_zeros, win(win_length, win_beta), side_zeros))
    win_len = int(len(w))
    win_len_half = np.around(win_len / 2).astype(int)

    max_ana_hop = int(np.max(ana_hop))

    if len(x.shape) == 1:
        x = x.reshape(-1, 1)

    # Pad the audio to center the windows and to avoid problems at the end
    head = np.zeros((win_len_half, 1))
    tail = np.zeros((win_len + max_ana_hop + 1, 1))
    x_padded = np.vstack((head, x, tail))

    if np.isscalar(ana_hop):
        # evenly spaced window positions
        if num_of_frames < 0:
            num_of_frames = int(np.floor((len(x_padded) - win_len) / ana_hop + 1))
        win_pos = np.arange(num_of_frames).astype(int) * ana_hop
    else:
        # explicitly given window positions
        if num_of_frames < 0:
            num_of_frames = len(ana_hop)
        win_pos = ana_hop[0:num_of_frames].astype(int)

    # Spectrogram calculation: one windowed FFT per frame, keeping the first win_len_half + 1 bins
    w_column = w.reshape(-1, 1)
    num_of_bins = win_len_half + 1
    spec = np.zeros((num_of_bins, num_of_frames), dtype=complex)

    for i in range(num_of_frames):
        start = win_pos[i]
        frame = x_padded[start:start + win_len] * w_column

        if fft_shift == 1:
            frame = np.fft.fftshift(frame)

        spectrum = np.fft.fft(frame, axis=0)
        spec[:, i] = spectrum[0:num_of_bins, 0]

    # Axis calculation
    t = win_pos / Fs
    f = np.arange(0, num_of_bins) * Fs / win_len

    return spec, f, t
+
+
def istft(spec, syn_hop=2048, win_length=4096, win_beta=2, Fs=22050, zero_pad=0, num_of_iter=1, orig_sig_len=-1,
          restore_energy=False, fft_shift=False) -> np.ndarray:
    """
    Computes the 'inverse' Short Time Fourier Transform, according to the paper "Signal Estimation from Modified
    Short-Time Fourier Transform" by Griffin and Lim.

    The first estimate is obtained with lsee_mstft from the given spectrogram. Every further iteration keeps the
    magnitude of the given spectrogram, replaces its phase by the phase of the STFT of the current estimate and
    re-synthesizes.

    Parameters
    ----------
    spec : np.ndarray [shape=(K,M+1)] , complex-valued
        A complex spectrogram generated by STFT.

    syn_hop : int
        hop size of the synthesis window

    win_length : int
        length of synthesis window for ISTFT

    win_beta : int
        exponent of sin^beta window

    Fs : int
        sampling rate

    zero_pad : int
        For IFFT. Number of zeros padded to the window to increase the fft size

    num_of_iter : int
        number of iterations for synthesis

    orig_sig_len : int
        Original length of the audio signal such that the output can be trimmed accordingly, in samples.
        Values <= 0 disable the trimming.

    restore_energy : bool
        For IFFT. When True, rescales every windowed synthesis frame to compensate for synthesis energy leakage

    fft_shift : bool
        For IFFT. When True, applies a circular shift to each frame of half its length, prior to computing the FFT

    Returns
    -------
    y : np.ndarray [shape=(L,1)], real - valued
        The time-domain signal.
    """
    num_of_frames = spec.shape[1]

    # First iteration: synthesize directly from the given spectrogram
    y = lsee_mstft(spec, syn_hop=syn_hop, win_length=win_length, win_beta=win_beta, zero_pad=zero_pad,
                   restore_energy=restore_energy, fft_shift=fft_shift)

    if num_of_iter > 1:
        # The target magnitude does not change between iterations
        target_mag = np.abs(spec)

        for _ in range(1, num_of_iter):
            est_spec = stft(y, ana_hop=syn_hop, win_length=win_length, win_beta=win_beta, Fs=Fs,
                            num_of_frames=num_of_frames, fft_shift=fft_shift, zero_pad=zero_pad)[0]
            Y_i = target_mag * np.exp(1j * np.angle(est_spec))
            y = lsee_mstft(Y_i, syn_hop=syn_hop, win_length=win_length, win_beta=win_beta, zero_pad=zero_pad,
                           restore_energy=restore_energy, fft_shift=fft_shift)

    # If the original length of the signal is known, also remove the zero padding at the end
    if orig_sig_len > 0:
        y = y[:orig_sig_len]

    return y
+
+
def lsee_mstft(X, syn_hop=2048, win_length=4096, win_beta=2, zero_pad=0, restore_energy=0, fft_shift=0) -> np.ndarray:
    """
    Re-synthesize a time-domain signal from a spectrogram by windowed overlap-add (Griffin-Lim procedure).

    Every column of X is extended to a full conjugate-symmetric spectrum, inverse transformed, multiplied with
    the synthesis window and added into the output buffer. The result is normalized by the accumulated squared
    window.

    Parameters
    ----------
    X : np.ndarray [shape=(K,M+1)] , complex-valued
        A complex spectrogram generated by STFT.

    syn_hop : int
        hop size of the synthesis window

    win_length : int
        length of analysis and synthesis window for STFT

    win_beta : int
        exponent of sin^beta window

    zero_pad : int
        For IFFT. Number of zeros padded to the window to increase the fft size

    restore_energy : bool
        For IFFT. When True, rescales every windowed synthesis frame to compensate for synthesis energy leakage

    fft_shift : bool
        For IFFT. When True, each inverse-transformed frame is passed through np.fft.fftshift before windowing

    Returns
    -------
    x: np.ndarray [shape=(L,1)], real-valued
        The time-domain signal.
    """
    # Synthesis window, extended symmetrically with zeros
    window = win(win_length, win_beta)
    side_zeros = np.zeros(int(np.floor(zero_pad / 2)))
    w = np.concatenate((side_zeros, window, side_zeros))
    win_len = int(len(w))
    win_len_half = np.around(win_len / 2).astype(int)

    num_of_frames = X.shape[1]
    win_pos = np.arange(0, num_of_frames).astype(int) * syn_hop
    signal_length = win_pos[-1] + win_len

    x = np.zeros((signal_length, 1))   # overlap-added frames
    ow = np.zeros((signal_length, 1))  # overlap-added squared windows
    w_squared = w**2
    eps = np.finfo(float).eps

    for start, half_spec in zip(win_pos, X.T):
        stop = start + win_len

        # Mirror the interior bins to obtain the full conjugate-symmetric spectrum
        mirrored = np.conj(half_spec[-2:0:-1])
        full_spec = np.concatenate((half_spec, mirrored))
        frame = np.real(np.fft.ifft(full_spec, axis=0))

        if fft_shift == 1:
            frame = np.fft.fftshift(frame)

        windowed = frame * w

        if restore_energy == 1:
            energy_before = np.sum(np.abs(frame))
            energy_after = np.sum(np.abs(windowed))
            windowed = windowed * (energy_before / (energy_after + eps))

        x[start:stop, 0] += windowed
        ow[start:stop, 0] += w_squared

    # Positions with (almost) no window energy are left unscaled to avoid a division by zero
    ow[ow < 1e-3] = 1
    x = x / ow

    # Drop win_len_half samples on both sides. The padding added in front by the STFT is known; the amount added
    # at the end is not, so only win_len_half samples are removed there.
    return x[win_len_half:-win_len_half, :]
+
+
def find_peaks(X) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Finds peaks on spectrum X and the region of influence of every peak.

    An index in X is considered a peak if its magnitude is strictly larger than the magnitude of its four nearest
    neighbours (two on each side; positions outside of X count as zero). The region of influence of a peak is the
    half-open index range [start, end) reaching from the midpoint to the previous peak up to the midpoint to the
    next peak. The first region starts at index 0 and the last region ends at len(X).

    Parameters
    ----------
    X : np.ndarray [shape=(K, )] , complex-valued
        An FFT vector.

    Returns
    -------
    peaks : np.ndarray [shape=(P, )] , int
        Indices of the P peaks found
    infl_region_start : np.ndarray [shape=(P, )] , int
        First index of the region of influence of every peak
    infl_region_end : np.ndarray [shape=(P, )] , int
        End index (exclusive) of the region of influence of every peak
    """
    mag_spec = np.abs(X)
    padded = np.hstack((np.zeros(2), mag_spec, np.zeros(2)))
    centre = padded[2:-2]
    is_peak = ((padded[4:] < centre) &
               (padded[3:-1] < centre) &
               (padded[1:-3] < centre) &
               (padded[0:-4] < centre))
    peaks = np.where(is_peak)[0]

    infl_region_start = np.zeros(peaks.shape, dtype=int)
    infl_region_end = np.zeros(peaks.shape, dtype=int)

    if peaks.size == 0:
        return peaks, infl_region_start, infl_region_end

    # Region borders lie halfway between neighbouring peaks
    infl_region_start[1:] = np.ceil((peaks[1:] + peaks[0:-1]) / 2)
    infl_region_end[0:-1] = infl_region_start[1:]
    # The last region extends to the end of the spectrum. (Using the number of peaks here, as before, left the
    # last region empty or misplaced.)
    infl_region_end[-1] = len(mag_spec)

    return peaks, infl_region_start, infl_region_end
+
+
def cross_corr(x, y, win_len) -> np.ndarray:
    """
    Computes cross correlation between signals x and y over a window of size win_len.

    The full correlation is computed as the convolution of the reversed x with y; win_len - 1 values are then
    removed from both ends of the result.

    Parameters
    ----------
    x : np.ndarray [shape=(N, )], real - valued
        Signal to be cross-correlated

    y : np.ndarray [shape=(M, )], real - valued
        Signal to be cross-correlated

    win_len : int
        Cross correlation window, in samples

    Returns
    -------
    cc : np.ndarray [shape=(N + M - 1 - 2 * (win_len - 1), )]
        Crosscorrelated signal
    """
    # Cross correlation is essentially the same as convolution with the first signal being reversed. In principle,
    # we also need to take the complex conjugate of the reversed x, but since audio signals are real valued, we can
    # skip this operation.
    cc = np.convolve(np.flip(x), y)

    # Restrict the cross correlation result to just the relevant values. Values outside of this range are related
    # to deltas bigger or smaller than our tolerance values.
    # The end index is written explicitly: a stop of -(win_len - 1) would be -0 == 0 for win_len == 1 and yield an
    # empty result instead of the full one.
    trim = win_len - 1
    return cc[trim:len(cc) - trim]
+
+
def normalize_length(xf, length) -> np.ndarray:
    """
    Adjusts the length of signal xf (along its first axis) to variable "length".

    A signal that is too short is extended with zeros at the end, a signal that is too long is cut off. Both
    one-dimensional signals and signals with additional (e.g. channel) dimensions are supported.

    Parameters
    ----------
    xf : np.ndarray [shape=(N, ) or (N, C)], real or complex - valued
        Signal to be processed

    length : int
        Target length of the output signal, in samples

    Returns
    -------
    y : np.ndarray [shape=(length, ) or (length, C)], real or complex - valued
        Signal with modified length. When no padding is needed, this is a slice (view) of xf.
    """
    cur_len = xf.shape[0]

    if cur_len < length:
        # Pad with zeros of matching trailing shape so that 1-D and multi-channel inputs both work
        pad_shape = (length - cur_len,) + xf.shape[1:]
        return np.concatenate((xf, np.zeros(pad_shape)), axis=0)

    return xf[0:length]
+
+
def modify_spectral_envelope(x, y, ana_hop=64, win_length=1024, win_beta=1, Fs=22050, filt_len=24) -> np.ndarray:
    """
    Impose the spectral envelope (formants) of the original signal on a pitch-shifted signal.

    For every channel, the STFT of the pitch-shifted signal is divided by its own spectral envelope, multiplied
    with the envelope of the original signal and transformed back to the time domain.

    Parameters
    ----------
    x : np.ndarray [shape=(N, )], real - valued
        Original input signal

    y : np.ndarray [shape=(N, )], real - valued
        Pitch-shifted signal

    ana_hop : int
        hop size of the STFT analysis and synthesis window

    win_length : int
        length of the analysis and synthesis window for STFT

    win_beta : int
        exponent of sin^beta window

    Fs : int
        Sampling rate of audio signals x and y

    filt_len : int
        number of samples for envelope modifying function

    Returns
    -------
    y_spec_env_X : np.ndarray, real - valued
        Pitch-shifted signal with modified spectral envelope. It has the shape of y, where a one-dimensional y
        counts as a single column.
    """
    # Treat mono signals as one-column matrices
    if x.ndim == 1:
        x = x.reshape(-1, 1)
    if y.ndim == 1:
        y = y.reshape(-1, 1)

    analysis_args = dict(ana_hop=ana_hop, win_length=win_length, win_beta=win_beta, Fs=Fs, num_of_frames=-1,
                         fft_shift=False, zero_pad=0)

    result = np.zeros(y.shape)

    for chan in range(x.shape[1]):
        # Spectrograms of the original and of the pitch-shifted channel
        spec_orig = stft(x[:, chan], **analysis_args)[0]
        spec_shifted = stft(y[:, chan], **analysis_args)[0]

        # Swap the envelope of the shifted signal for the one of the original
        env_orig = comp_env(spec_orig, filt_len)
        env_shifted = comp_env(spec_shifted, filt_len)
        spec_corrected = spec_shifted / env_shifted * env_orig

        # Back to the time domain, trimmed to the length of the original signal
        resynth = istft(spec_corrected, syn_hop=ana_hop, win_length=win_length, win_beta=win_beta, Fs=Fs,
                        zero_pad=0, num_of_iter=1, orig_sig_len=len(x), restore_energy=False, fft_shift=False)
        result[:, chan] = resynth[:, 0]

    return result
+
+
def comp_env(X, filt_len) -> np.ndarray:
    """
    Estimate the spectral envelope of every frame of a spectrogram.

    The magnitude spectrogram is smoothed along the frequency axis with a sin^2 window of length filt_len. Each
    frame of the smoothed result is then rescaled so that its maximum equals the maximum magnitude of that
    frame, and finally floored at 0.01.

    Parameters
    ----------
    X : np.ndarray [shape=(K,M+1)] , complex-valued
        A complex spectrogram

    filt_len : int
        Length of the convolution window

    Returns
    -------
    env : np.ndarray [shape=(K,M+1)] , real-valued
        Spectral Envelope
    """
    # Column-shaped sin^2 (Hann) kernel, so that the 2-D convolution only smooths along the frequency axis
    smoothing_kernel = win(filt_len, 2).reshape(-1, 1)
    magnitude = np.abs(X)

    # Note: mode='same' is not equivalent to Matlab's conv2()
    env = sc.signal.convolve2d(magnitude, smoothing_kernel, mode='same')

    # Normalize each frame to a maximum of one, then scale it to the peak magnitude of that frame
    env = env / (np.finfo(float).eps + env.max(axis=0, keepdims=True))
    env = env * magnitude.max(axis=0, keepdims=True)

    # Floor the envelope at 0.01
    env[env < 0.01] = 0.01

    return env
+
' + + '' + + _("Hide Search Matches") + + "
" + ) + ); + }, + + /** + * helper function to hide the search marks again + */ + hideSearchWords: () => { + document + .querySelectorAll("#searchbox .highlight-link") + .forEach((el) => el.remove()); + document + .querySelectorAll("span.highlighted") + .forEach((el) => el.classList.remove("highlighted")); + localStorage.removeItem("sphinx_highlight_terms") + }, + + initEscapeListener: () => { + // only install a listener if it is really needed + if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) return; + + document.addEventListener("keydown", (event) => { + // bail for input elements + if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return; + // bail with special keys + if (event.shiftKey || event.altKey || event.ctrlKey || event.metaKey) return; + if (DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS && (event.key === "Escape")) { + SphinxHighlight.hideSearchWords(); + event.preventDefault(); + } + }); + }, +}; + +_ready(SphinxHighlight.highlightSearchWords); +_ready(SphinxHighlight.initEscapeListener); diff --git a/docs/build/html/genindex.html b/docs/build/html/genindex.html new file mode 100644 index 0000000..e9c7d4f --- /dev/null +++ b/docs/build/html/genindex.html @@ -0,0 +1,278 @@ + + + + + ++ | + |
+ |
+ |
+ | + |
+ |
+ |
|
+
|
+
+ |
+ | + |
+ |
+ |
+ | + |
With Python >= 3.6, you can install libtsm
using the Python package manager pip:
pip install libtsm
+
For development, testing, or generating the API documentation, clone the git repository and install:
+git clone https://github.com/meinardmueller/libtsm.git
+cd libtsm
+pip install -e .[dev,tests,docs]
+
libtsm
is a Python toolbox for time-scale modification (TSM) and pitch-shifting.
Details and example application:
+https://www.audiolabs-erlangen.de/resources/MIR/2021-DAFX-AdaptivePitchShifting
+The toolbox is based on a re-implementation of the +Matlab TSM toolbox by Jonathan Driedger and Meinard Müller.
+If you use this toolbox, please consider the following references:
+ +Functions for Pitch-Shifting
+(Non-linear) pitch-shifting via time-scale modification and resampling.
+x (np.ndarray [shape=(N, )], real - valued) – Signal to be transformed
p (float or np.ndarray [shape=(M,)], real - valued) – Amount of pitch shifting to be applied, given in cents. Positive p indicates pitch rising, negative p a pitch +lowering.
t_p (np.ndarray [shape=(M,)], real - valued) – Array of time instances in seconds for adaptive pitch shifting, same length as p. If t_p is None, a fixed +pitch-shift is assumed.
Fs (int) – Sampling rate of the input audio signal x
order (Order of TSM and resampling, either "res-tsm" or "tsm-res".) –
**kwargs (Parameters for hps_tsm) –
y – The time-scale modified output signal
+np.ndarray [shape=(L,1)], real - valued
+Pitch modification algorithm via time-scale modification. The input signal is stretched via TSM and then resampled. +The code closely follows the Matlab implementation.
+x (np.ndarray [shape=(N, )], real - valued) – Signal to be transformed
n (int) – Amount of pitch shifting to be applied, given in cents. Positive n indicates pitch rising, negative n a pitch +lowering
Fs (int) – Sampling rate of the input audio signal x
y – The time-scale modified output signal
+np.ndarray [shape=(L,1)], real - valued
+Functions for Time-Scale Modification
+Time Scale Modification algorithm based on Harmonic - Percussive separation. After separation is +performed, the algorithm uses two phase vocoder TSM and WSOLA TSM algorithms for the Harmonic and percussive part +separately.
+x (np.ndarray [shape=(N, )], real - valued) – Signal to be transformed
alpha (float or np.ndarray [shape=(S, 2)]) –
Time stretch function, given by a constant (float) or a set of S anchor points (int). +A valid anchor point sequence
++++
+- +
contains only non-negative values,
- +
both sequences along the first axis are strictly increasing,
- +
starts with position (m, 0), where m is an integer >= 0.
These conditions will be checked and an error is thrown if they are not met. +See libtsm.ensure_validity for one way to ensure that condition (2) is fulfilled.
+Fs (int) – Sampling rate
hps_ana_hop (int) – hop size for HPS
hps_win_length (int) – window length for HPS
hps_win_beta (int) – exponent of sin^beta window
hps_zero_pad (int) – For FFT. Number of zeros padded to the window to increase the fft size
hps_fil_len_harm (int) – Length of the median filter in time direction.
hps_fil_len_perc (int) – Length of the median filter in frequency direction.
pv_syn_hop (int) – hop size for synthesize windows of phase vocoder
pv_win_length (int) – window length for phase vocoder
pv_win_beta (int) – exponent of sin^beta window
pv_zero_pad (int) – phase vocoder zero padding
pv_restore_energy (bool) – restore energy of signal in phase vocoder
pv_fft_shift (bool) – fft shift in phase vocoder
ola_syn_hop (int) – synthesis hop size of OLA
ola_win_length (int) – window length for OLA
ola_win_beta (int) – exponent of sin^beta window
y – The time-scale modified output signal
+np.ndarray [shape=(L,1)], real - valued
+Phase Vocoder Time scale modification algorithm, that rescales the time-axis of the input signal x +according to the time-stretch function s without altering the pitch of x. This algorithm is optimized for integer +values of the time stretching function.
+x (np.ndarray [shape=(N,)], real - valued) – Signal to be transformed
alpha (int or np.ndarray) – Time stretch factor
syn_hop (int) – hop size of the synthesis window
win_length (int) – length of analysis and synthesis window for STFT
win_beta (int) – exponent of sin^beta window
Fs (int) – Sampling rate of the input audio signal x
zero_pad (int) – For FFT. Number of zeros padded to the window to increase the fft size
restore_energy (bool) – For FFT. When True, rescales every windowed synthesis frame to compensate for synthesis energy leakage
fft_shift (bool) – For FFT. When True, applies a circular shift to each frame of half its length, prior to computing the FFT
y – The time-scale modified output signal
+np.ndarray [shape=(L,1)], real - valued
+Time scale modification based on a phase vocoder
+Rescales the time-axis of the input signal x according to the time-stretch function alpha +without altering the pitch of x.
+x (np.ndarray [shape=(N, C)], real-valued) – Signal to be transformed, second dimension is an optional channel dimension
alpha (float or np.ndarray [shape=(S, 2)]) –
Time stretch function, given by a constant (float) or a set of S anchor points (int). +A valid anchor point sequence
++++
+- +
contains only non-negative values,
- +
both sequences along the first axis are strictly increasing,
- +
starts with position (m, 0), where m is an integer >= 0.
These conditions will be checked and an error is thrown if they are not met. +See libtsm.ensure_validity for one way to ensure that condition (2) is fulfilled.
+syn_hop (int) – hop size of the synthesis window
win_length (int) – length of analysis and synthesis window for STFT
win_beta (int) – exponent of sin^beta window
Fs (int) – Sampling rate of the input audio signal x
zero_pad (int) – For FFT. Number of zeros padded to the window to increase the fft size
restore_energy (bool) – For FFT. When True, rescales every windowed synthesis frame to compensate for synthesis energy leakage
fft_shift (bool) – For FFT. When True, applies a circular shift to each frame of half its length, prior to computing the FFT
phase_locking (bool) – when True, Applies identity phase locking
y
+np.ndarray [shape=(L,1)], real - valued
+Time Scale Modification algorithm, where the signal is stretched by the integer and decimal part +of the time stretch function using two different algorithms.
+x (np.ndarray [shape=(N, )], real - valued) – Signal to be transformed
alpha (float) – Scalar time stretch factor
Fs (int) – Sampling rate of the input audio signal x
order ('exact-coarse' or 'coarse-exact') – Decides which of the two time stretching functions will be computed first, coarse corresponding to the integer +part
y – The time-scale modified output signal
+np.ndarray [shape=(L,num_of_chan)], real - valued
+Waveform Similarity Overlap and Add (WSOLA) algorithm that rescales the time-axis of the input signal x +according to the time-stretch function s without altering the pitch of x.
+x (np.ndarray [shape=(N,num_of_chan)], real - valued) – Signal to be transformed
alpha (float or np.ndarray [shape=(S, 2)]) –
Time stretch function, given by a constant (float) or a set of S anchor points (int). +A valid anchor point sequence
++++
+- +
contains only non-negative values,
- +
both sequences along the first axis are strictly increasing,
- +
starts with position (m, 0), where m is an integer >= 0.
These conditions will be checked and an error is thrown if they are not met. +See libtsm.ensure_validity for one way to ensure that condition (2) is fulfilled.
+syn_hop (int) – hop size of the synthesis window
win_length (int) – length of the analysis and synthesis window
win_beta (int) – exponent of sin^beta window
tol (int) – Amount of samples the window will be shifted to avoid phase discontinuities when overlap-adding to form the +output signal
y – The time-scale modified output signal
+np.ndarray [shape=(L,num_of_chan)], real - valued
+Utility Functions
+Computes the envelope of a given signal spectrum.
+X (np.ndarray [shape=(K,M+1)] , complex-valued) – A complex spectrogram
filt_len (int) – Length of the convolution window
env – Spectral Envelope
+np.ndarray [shape=(K,M+1)] , real-valued
+Computes cross correlation between signals x and y over a window of size win_len.
+x (np.ndarray [shape=(N, )], real or complex - valued) – Signal to be cross-correlated
y (np.ndarray [shape=(N, )], real or complex - valued) – Signal to be cross-correlated
win_len (int) – Cross correlation window, in samples
y – Crosscorrelated signal
+np.ndarray [shape=(2N+1,)], real - valued
+Remove points from a sequence of anchor points to ensure a valid TSM input
+This function removes any anchor points that lead to a non-monotonous increase of the analysis window and +returns a new sequence of anchor points that is a valid TSM input.
+Note that, depending on the data, this is not a sensible way to ensure a valid TSM input. Often, it may be +advisable to use coarser and semantically meaningful positions for the anchor points.
+alpha (float or np.ndarray [shape=(S, 2)]) – Time stretch function, given by a set of S anchor points (int).
syn_hop (int) – (smallest) hop size of the synthesis window (default: 128)
Finds peaks on spectrum X. An index in X is considered a peak if its value is the largest among its four nearest +neighbours.
+X (np.ndarray [shape=(K, )] , complex-valued) – An FFT vector.
+peaks – Vector with P peaks found
+np.ndarray [shape=(P, )] , real-valued
Harmonic - Percussive separation using median filters.
+x (np.ndarray [shape=(N, )], real - valued) – Signal to be transformed
ana_hop (int) – hop size of the synthesis window
win_length (int) – length of analysis and synthesis window for STFT
win_beta (int) – exponent of sin^beta window
Fs (int) – Sampling rate of the input audio signal x
zero_pad (int) – For FFT. Number of zeros padded to the window to increase the fft size
fil_len_harm (int) – Length of the median filter in time direction. A shorter filter makes it more likely +that the signal is interpreted as harmonic.
fil_len_perc (int) – Length of the median filter in frequency direction. A shorter filter makes it more likely +that the signal is interpreted as percussive.
masking_mode (either "binary" or "relative") – Selects Harmonic Percussive separation masking mode (soft or binary masking)
x_harm (np.ndarray) – Harmonic Component of input signal x
x_perc (np.ndarray) – Percussive Component of input signal x
Computes the ‘inverse’ Short Time Fourier Transform, according to the paper “Signal Estimation from Modified +Short-Time Fourier Transform” by Griffin and Lim.
+spec (np.ndarray [shape=(K,M+1)] , complex-valued) – A complex spectrogram generated by STFT.
syn_hop (int) – hop size of the synthesis window
win_length (int) – length of synthesis window for ISTFT
win_beta (int) – exponent of sin^beta window
Fs (int) – sampling rate
zero_pad (int) – For IFFT. Number of zeros padded to the window to increase the fft size
num_of_iter (int) – number of iterations for synthesis
orig_sig_len (int) – Original length of the audio signal such that the output can be trimmed accordingly, in samples
restore_energy (bool) – For IFFT. When True, rescales every windowed synthesis frame to compensate for synthesis energy leakage
fft_shift (bool) – For IFFT. When True, applies a circular shift to each frame of half its length, prior to computing the FFT
y – The time-domain signal.
+np.ndarray [shape=(L,1)], real - valued
+Computes the ‘inverse’ Short Time Fourier Transform (ISTFT) using the Griffin Lim procedure.
+X (np.ndarray [shape=(K,M+1)] , complex-valued) – A complex spectrogram generated by STFT.
syn_hop (int) – hop size of the synthesis window
win_length (int) – length of analysis and synthesis window for STFT
win_beta (int) – exponent of sin^beta window
zero_pad (int) – For IFFT. Number of zeros padded to the window to increase the fft size
restore_energy (bool) – For IFFT. When True, rescales every windowed synthesis frame to compensate for synthesis energy leakage
fft_shift (bool) – For IFFT. When True, applies a circular shift to each frame of half its length, prior to computing the FFT
x – The time-domain signal.
+np.ndarray [shape=(L,1)], real-valued
+Median filter implementation.
+X (np.ndarray) – Spectrogram
filt_len (int) – Median filter length
dim (int) – Dimension in which median filter should be applied
Y – Median-filtered spectrogram
+np.ndarray
+Complement to the pitch shifting algorithm, that modifies the formants of the +pitch-shifted signal to match them with those of the original signal.
+x (np.ndarray [shape=(N, )], real - valued) – Original input signal
y (np.ndarray [shape=(N, )], real - valued) – Pitch-shifted signal
ana_hop (int) – hop size of the STFT analysis and synthesis window
win_length (int) – length of the analysis and synthesis window for STFT
win_beta (int) – exponent of sin^beta window
Fs (int) – Sampling rate of audio signals x and y
filt_len (int) – number of samples for envelope modifying function
y_spec_env_X – Pitch-shifted signal with modified spectral envelope
+np.ndarray [shape=(N,)], real - valued
+Adjusts the length of signal xf to variable “length”.
+xf (np.ndarray [shape=(N, )], real or complex - valued) – Signal to be processed
length (int) – Target length of the output signal, in samples
y – Signal with modified length
+np.ndarray [shape=(length, )], real or complex - valued
+Computes the Short-Time Fourier Transform (STFT) of the input audio signal.
+x (np.ndarray, real-valued) – Signal to be transformed
ana_hop (int or np.ndarray) – hop size of the analysis window
win_length (int) – length of analysis window for STFT
win_beta (int) – exponent of sin^beta window
Fs (int) – Sampling rate of the input audio signal x
num_of_frames (int) – Fixes the number of FFT frames to be computed
fft_shift (bool) – For FFT. When True, applies a circular shift to each frame of half its length, prior to computing the FFT
zero_pad (int) – For FFT. Number of zeros padded to the window to increase the fft size
X (np.ndarray [shape=(K, M + 1)], complex-valued) – The discrete short-time Fourier transform
f (np.ndarray [shape=(K, )], real-valued) – Center frequencies of all Fourier bins given in Hertz
t (np.ndarray [shape=(M+1, )], real-valued) – Time instances where the respective Fourier spectra were computed, given in seconds
+ l | ||
+ |
+ libtsm | + |
+ |
+ libtsm.pitchshift | + |
+ |
+ libtsm.tsm | + |
+ |
+ libtsm.utils | + |