@article{van1988beamforming,
title={Beamforming: A versatile approach to spatial filtering},
author={Van Veen, Barry D. and Buckley, Kevin M.},
journal={IEEE ASSP Magazine},
volume={5},
number={2},
pages={4--24},
year={1988},
}
@inproceedings{erdogan2016improved,
title={Improved MVDR beamforming using single-channel mask prediction networks},
author={Erdogan, Hakan and Hershey, John R. and Watanabe, Shinji and Mandel, Michael and Le Roux, Jonathan},
booktitle={Interspeech},
year={2016}
}
@article{gannot2017consolidated,
title={A consolidated perspective on multimicrophone speech enhancement and source separation},
author={Gannot, Sharon and Vincent, Emmanuel and Markovich-Golan, Shmulik and Ozerov, Alexey},
journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
volume={25},
number={4},
pages={692--730},
year={2017},
publisher={IEEE}
}
@inproceedings{heymann2016neural,
title={Neural network based spectral mask estimation for acoustic beamforming},
author={Heymann, Jahn and Drude, Lukas and Haeb-Umbach, Reinhold},
booktitle={IEEE International Conference on Acoustics, Speech and Signal Processing},
year={2016}
}
@inproceedings{higuchi2017deep,
title={Deep clustering-based beamforming for separation with unknown number of sources},
author={Higuchi, Takuya and Kinoshita, Keisuke and Delcroix, Marc and Zmolkova, Katerina and Nakatani, Tomohiro},
booktitle={Interspeech},
year={2017}
}
@inproceedings{leglaive2019semi,
title={Semi-supervised multichannel speech enhancement with variational autoencoders and non-negative matrix factorization},
author={Leglaive, Simon and Girin, Laurent and Horaud, Radu},
booktitle={IEEE International Conference on Acoustics, Speech and Signal Processing},
pages={101--105},
year={2019}
}
@inproceedings{li2017acoustic,
title={Acoustic modeling for Google Home},
author={Li, Bo and Sainath, Tara and Narayanan, Arun and Caroselli, Joe and Bacchiani, Michiel and Misra, Ananya and Shafran, Izhak and Sak, Hasim and Pundak, Golan and Chin, Kean and others},
booktitle={Interspeech},
year={2017}
}
@inproceedings{li2016neural,
title={Neural network adaptive beamforming for robust multichannel speech recognition},
author={Li, Bo and Sainath, Tara N. and Weiss, Ron J. and Wilson, Kevin W. and Bacchiani, Michiel},
booktitle={Interspeech},
year={2016}
}
@inproceedings{meng2017deep,
title={Deep long short-term memory adaptive beamforming networks for multichannel robust speech recognition},
author={Meng, Zhong and Watanabe, Shinji and Hershey, John R. and Erdogan, Hakan},
booktitle={IEEE International Conference on Acoustics, Speech and Signal Processing},
year={2017}
}
@article{nugraha2016multichannel,
title={Multichannel audio source separation with deep neural networks},
author={Nugraha, Aditya A. and Liutkus, Antoine and Vincent, Emmanuel},
journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
volume={24},
number={9},
pages={1652--1664},
year={2016}
}
@inproceedings{perotin2018multichannel,
title={Multichannel speech separation with recurrent neural networks from high-order Ambisonics recordings},
author={Perotin, Laur{\'e}line and Serizel, Romain and Vincent, Emmanuel and Gu{\'e}rin, Alexandre},
booktitle={IEEE International Conference on Acoustics, Speech and Signal Processing},
year={2018}
}
@article{wang2017supervised,
title={Supervised speech separation based on deep learning: an overview},
author={Wang, DeLiang and Chen, Jitong},
journal={arXiv:1708.07524},
year={2017}
}
@article{duong2010un,
title={Under-determined reverberant audio source separation using a full-rank spatial covariance model},
author={Duong, Ngoc QK and Vincent, Emmanuel and Gribonval, R{\'e}mi},
journal={IEEE Transactions on Audio, Speech, and Language Processing},
volume={18},
number={7},
pages={1830--1840},
year={2010}
}
@inproceedings{heymann2017beamnet,
title={Beamnet: End-to-end training of a beamformer-supported multi-channel ASR system},
author={Heymann, Jahn and Drude, Lukas and Boeddeker, Christoph and Hanebrink, Patrick and Haeb-Umbach, Reinhold},
booktitle={IEEE International Conference on Acoustics, Speech and Signal Processing},
year={2017}
}
@inproceedings{pasha_towards_2017,
address = {San Francisco, CA},
title = {Towards real-time source counting by estimation of coherent-to-diffuse ratios from ad-hoc microphone array recordings},
isbn = {978-1-5090-5925-6},
url = {http://ieeexplore.ieee.org/document/7895582/},
doi = {10.1109/HSCMA.2017.7895582},
abstract = {Coherent-to-diffuse ratio (CDR) estimates over short time frames are utilised for source counting using ad-hoc microphone arrays to record speech from multiple participants in scenarios such as a meeting. It is shown that the CDR estimates obtained at ad-hoc dual (two channel) microphone nodes, located at unknown locations within an unknown reverberant room, can detect time frames with more than one active source and are informative for source counting applications. Results show that interfering sources can be detected with accuracies ranging from 69\% to 89\% for delays ranging from 20 ms to 300 ms, with source counting accuracies ranged from 61\% to 81\% for two sources and the same range of delays.},
language = {en},
urldate = {2019-01-10},
booktitle = {Hands-free {Speech} {Communications} and {Microphone} {Arrays}},
publisher = {IEEE},
author = {Pasha, Shahab and Donley, Jacob and Ritz, Christian and Zou, Yue Xian},
year = {2017},
keywords = {non-lu},
pages = {161--165},
file = {Pasha et al. - 2017 - Towards real-time source counting by estimation of.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\3NWMK2DK\\Pasha et al. - 2017 - Towards real-time source counting by estimation of.pdf:application/pdf},
}
@book{jacobsen_fundamentals_2013,
title = {Fundamentals of general linear acoustics},
language = {en},
publisher = {John Wiley \& Sons},
author = {Jacobsen, Finn and Juhl, Peter Moller},
month = jul,
year = {2013},
keywords = {partiel-lu},
file = {Jacobsen - Fundamentals of General Linear Acoustics.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\NP2YZPLB\\Jacobsen - Fundamentals of General Linear Acoustics.pdf:application/pdf},
}
@phdthesis{baque_analyse_2017,
address = {Le Mans},
title = {Analyse de scène sonore multi-capteurs : un front-end temps-réel pour la manipulation de scène},
abstract = {La thèse s’inscrit dans un contexte d’essor de l’audio spatialisé (5.1, Dolby Atmos...). Parmi les formats audio 3D existants, l’ambisonie permet une représentation spatiale homogène du champ sonore et se prête naturellement à des manipulations : rotations, distorsion du champ sonore. L’objectif de cette thèse est de fournir un outil d’analyse et de manipulation de contenus audio (essentiellement vocaux) au format ambisonique. Un fonctionnement temps-réel et en conditions acoustiques réelles sont les principales contraintes à respecter. L’algorithme mis au point est basé sur une analyse en composantes indépendantes (ACI) appliquée trame à trame qui permet de décomposer le champ acoustique en un ensemble de contributions, correspondant à des sources (champ direct) ou à de la réverbération. Une étape de classification bayésienne, appliquée aux composantes extraites, permet alors l’identification et le dénombrement des sources sonores contenues dans le mélange. Les sources identifiées sont localisées grâce à la matrice de mélange obtenue par ACI, pour fournir une cartographie de la scène sonore. Une étude exhaustive des performances est menée sur des contenus réels en fonction de plusieurs paramètres : nombre de sources, environnement acoustique, longueur des trames, ou ordre ambisonique utilisé. Des résultats fiables en terme de localisation et de comptage de sources ont été obtenus pour des trames de quelques centaines de ms. L’algorithme, exploité comme prétraitement dans un prototype d’assistant vocal domestique, permet d’améliorer significativement les performances de reconnaissance, notamment en prise de son lointaine et en présence de sources interférentes.},
language = {fr},
urldate = {2018-10-18},
school = {Université du Maine},
author = {Baque, Mathieu},
year = {2017},
keywords = {partiel-lu},
file = {Baque - 2017 - Analyse de scène sonore multi-capteurs un front-.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\J9AR8IWT\\Baque - 2017 - Analyse de scène sonore multi-capteurs un front-.pdf:application/pdf;Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\Y4CMKVJG\\tel-01792433.html:text/html},
}
@article{williams_fourier_2000,
title = {Fourier acoustics: sound radiation and nearfield acoustical holography},
volume = {108},
issn = {0001-4966},
doi = {10.1121/1.1289662},
language = {en},
number = {4},
journal = {The Journal of the Acoustical Society of America},
author = {Williams, Earl G. and Mann, J. Adin},
month = oct,
year = {2000},
keywords = {partiel-lu},
pages = {1373--1373},
file = {Williams et Mann - 2000 - Fourier Acoustics Sound Radiation and Nearfield A.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\5QUVK8GF\\Williams et Mann - 2000 - Fourier Acoustics Sound Radiation and Nearfield A.pdf:application/pdf},
}
@article{rafaely_analysis_2005,
title = {Analysis and design of spherical microphone arrays},
volume = {13},
issn = {1063-6676},
doi = {10.1109/TSA.2004.839244},
abstract = {Spherical microphone arrays have been recently studied for sound-field recordings, beamforming, and sound-field analysis which use spherical harmonics in the design. Although the microphone arrays and the associated algorithms were presented, no comprehensive theoretical analysis of performance was provided. This paper presents a spherical-harmonics-based design and analysis framework for spherical microphone arrays. In particular, alternative spatial sampling schemes for the positioning of microphones on a sphere are presented, and the errors introduced by finite number of microphones, spatial aliasing, inaccuracies in microphone positioning, and measurement noise are investigated both theoretically and by using simulations. The analysis framework can also provide a useful guide for the design and analysis of more general spherical microphone arrays which do not use spherical harmonics explicitly.},
language = {en},
number = {1},
journal = {IEEE Transactions on Speech and Audio Processing},
author = {Rafaely, B.},
month = jan,
year = {2005},
keywords = {partiel-lu},
pages = {135--143},
file = {Rafaely - 2005 - Analysis and design of spherical microphone arrays.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\KIEPWA8Z\\Rafaely - 2005 - Analysis and design of spherical microphone arrays.pdf:application/pdf},
}
@book{vincent_audio_2018,
title = {Audio source separation and speech enhancement},
publisher = {John Wiley \& Sons},
author = {Vincent, Emmanuel and Virtanen, Tuomas and Gannot, Sharon},
year = {2018},
keywords = {non-lu},
file = {Vincent et al. - 2018 - Audio Source Separation and Speech Enhancement.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\9Y4MT6VY\\Vincent et al. - 2018 - Audio Source Separation and Speech Enhancement.pdf:application/pdf},
}
@phdthesis{merimaa_analysis_2006,
address = {Helsinki},
title = {Analysis, synthesis, and perception of spatial sound - binaural localization modeling and multichannel loudspeaker reproduction},
abstract = {In everyday audio environments, sound from several sources arrives at a listening position both directly from the sources and as reflections from the acoustical environment. This thesis deals, within some limitations, with analysis of the resulting spatial sound field, reproduction of perceptually relevant features of the sound as measured in a chosen listening position, as well as with modeling of the related auditory localization.},
language = {en},
school = {Helsinki University of Technology},
author = {Merimaa, Juha},
year = {2006},
keywords = {partiel-lu},
file = {Merimaa - 2006 - Analysis, Synthesis, and Perception of Spatial Sou.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\AMGGKJ2K\\Merimaa - 2006 - Analysis, Synthesis, and Perception of Spatial Sou.pdf:application/pdf},
}
@phdthesis{daniel_representation_2001,
address = {Paris},
title = {Représentation de champs acoustiques, application à la transmission et à la reproduction de scènes sonores complexes dans un contexte multimédia},
language = {French},
school = {Paris VI},
author = {Daniel, Jérôme},
year = {2001},
keywords = {partiel-lu},
file = {Daniel - 2001 - Représentation de champs acoustiques, application .pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\WT68JQ6H\\Daniel - 2001 - Représentation de champs acoustiques, application .pdf:application/pdf},
}
@article{kingma_adam:_2014,
title = {Adam: {A} {Method} for {Stochastic} {Optimization}},
shorttitle = {Adam},
abstract = {We introduce Adam, an algorithm for first-order gradient-based optimization of stochastic objective functions, based on adaptive estimates of lower-order moments. The method is straightforward to implement, is computationally efficient, has little memory requirements, is invariant to diagonal rescaling of the gradients, and is well suited for problems that are large in terms of data and/or parameters. The method is also appropriate for non-stationary objectives and problems with very noisy and/or sparse gradients. The hyper-parameters have intuitive interpretations and typically require little tuning. Some connections to related algorithms, on which Adam was inspired, are discussed. We also analyze the theoretical convergence properties of the algorithm and provide a regret bound on the convergence rate that is comparable to the best known results under the online convex optimization framework. Empirical results demonstrate that Adam works well in practice and compares favorably to other stochastic optimization methods. Finally, we discuss AdaMax, a variant of Adam based on the infinity norm.},
language = {en},
urldate = {2019-04-11},
journal = {arXiv:1412.6980},
author = {Kingma, Diederik P. and Ba, Jimmy},
year = {2014},
keywords = {Computer Science - Machine Learning},
file = {Kingma et Ba - 2014 - Adam A Method for Stochastic Optimization.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\AM7Q3XPX\\Kingma et Ba - 2014 - Adam A Method for Stochastic Optimization.pdf:application/pdf},
}
@inproceedings{vecchiotti_end--end_2019,
title = {End-to-end binaural sound localisation from the raw waveform},
abstract = {A novel end-to-end binaural sound localisation approach is proposed which estimates the azimuth of a sound source directly from the waveform. Instead of employing hand-crafted features commonly employed for binaural sound localisation, such as the interaural time and level difference, our end-to-end system approach uses a convolutional neural network (CNN) to extract specific features from the waveform that are suitable for localisation. Two systems are proposed which differ in the initial frequency analysis stage. The first system is auditory-inspired and makes use of a gammatone filtering layer, while the second system is fully data-driven and exploits a trainable convolutional layer to perform frequency analysis. In both systems, a set of dedicated convolutional kernels are then employed to search for specific localisation cues, which are coupled with a localisation stage using fully connected layers. Localisation experiments using binaural simulation in both anechoic and reverberant environments show that the proposed systems outperform a state-ofthe-art deep neural network system. Furthermore, our investigation of the frequency analysis stage in the second system suggests that the CNN is able to exploit different frequency bands for localisation according to the characteristics of the reverberant environment.},
language = {en},
booktitle = {{IEEE} {International} {Conference} on {Acoustics}, {Speech} and {Signal} {Processing}},
author = {Vecchiotti, Paolo and Ma, Ning and Squartini, Stefano and Brown, Guy J.},
year = {2019},
keywords = {Computer Science - Sound, Electrical Engineering and Systems Science - Audio and Speech Processing, lu},
pages = {451--455},
file = {Vecchiotti et al. - 2019 - End-to-end Binaural Sound Localisation from the Ra.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\YF5WBJEN\\Vecchiotti et al. - 2019 - End-to-end Binaural Sound Localisation from the Ra.pdf:application/pdf},
}
@article{stoter_countnet:_2019,
title = {{CountNet}: estimating the number of concurrent speakers using supervised learning},
volume = {27},
issn = {2329-9290, 2329-9304},
shorttitle = {{CountNet}},
doi = {10.1109/TASLP.2018.2877892},
abstract = {Estimating the maximum number of concurrent speakers from single-channel mixtures is a challenging problem and an essential first step to address various audio-based tasks such as blind source separation, speaker diarization, and audio surveillance. We propose a unifying probabilistic paradigm, where deep neural network architectures are used to infer output posterior distributions. These probabilities are in turn processed to yield discrete point estimates. Designing such architectures often involves two important and complementary aspects that we investigate and discuss. First, we study how recent advances in deep architectures may be exploited for the task of speaker count estimation. In particular, we show that convolutional recurrent neural networks outperform recurrent networks used in a previous study when adequate input features are used. Even for short segments of speech mixtures, we can estimate up to five speakers, with a significantly lower error than other methods. Second, through comprehensive evaluation, we compare the best-performing method to several baselines, as well as the influence of gain variations, different data sets, and reverberation. The output of our proposed method is compared to human performance. Finally, we give insights into the strategy used by our proposed method.},
language = {en},
number = {2},
urldate = {2019-01-15},
journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
author = {St\"{o}ter, Fabian-Robert and Chakrabarty, Soumitro and Edler, Bernd and Habets, Emanuel A. P.},
year = {2019},
keywords = {lu},
pages = {268--282},
file = {Stoter et al. - 2019 - CountNet Estimating the Number of Concurrent Spea.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\AZLNM27I\\Stoter et al. - 2019 - CountNet Estimating the Number of Concurrent Spea.pdf:application/pdf},
}
@inproceedings{comminiello_quaternion_2019,
title = {Quaternion convolutional neural networks for detection and localization of {3D} sound events},
abstract = {Learning from data in the quaternion domain enables us to exploit internal dependencies of 4D signals and treating them as a single entity. One of the models that perfectly suits with quaternion-valued data processing is represented by 3D acoustic signals in their spherical harmonics decomposition. In this paper, we address the problem of localizing and detecting sound events in the spatial sound field by using quaternion-valued data processing. In particular, we consider the spherical harmonic components of the signals captured by a first-order ambisonic microphone and process them by using a quaternion convolutional neural network. Experimental results show that the proposed approach exploits the correlated nature of the ambisonic signals, thus improving accuracy results in 3D sound event detection and localization.},
language = {en},
urldate = {2018-12-18},
booktitle = {{IEEE} {International} {Conference} on {Acoustics}, {Speech} and {Signal} {Processing}},
author = {Comminiello, Danilo and Lella, Marco and Scardapane, Simone and Uncini, Aurelio},
year = {2019},
keywords = {lu},
file = {Comminiello et al. - 2018 - Quaternion Convolutional Neural Networks for Detec.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\LQEXE6IA\\Comminiello et al. - 2018 - Quaternion Convolutional Neural Networks for Detec.pdf:application/pdf},
}
@inproceedings{yang_multiple_2017,
title = {Multiple sound source counting and localization based on spatial principal eigenvector},
doi = {10.21437/Interspeech.2017-940},
abstract = {Multiple sound source localization remains a challenging issue due to the interaction between sources. Although traditional approaches can locate multiple sources effectively, most of them require the number of sound sources as a priori knowledge. However, the number of sound sources is generally unknown in practical applications. To overcome this problem, a spatial principal eigenvector based approach is proposed to estimate the number and the direction of arrivals (DOAs) of multiple speech sources. Firstly, a time-frequency (TF) bin weighting scheme is utilized to select the TF bins dominated by single source. Then, for these selected bins, the spatial principal eigenvectors are extracted to construct a contribution function which is used to simultaneously estimate the number of sources and corresponding coarse DOAs. Finally, the coarse DOA estimations are refined by iteratively optimizing the assignment of selected TF bins to each source. Experimental results validate that the proposed approach yields favorable performance for multiple sound source counting and localization in the environment with different levels of noise and reverberation.},
language = {en},
urldate = {2019-01-10},
booktitle = {Interspeech},
publisher = {ISCA},
author = {Yang, Bing and Liu, Hong and Pang, Cheng},
year = {2017},
keywords = {lu},
pages = {1924--1928},
file = {Yang et al. - 2017 - Multiple Sound Source Counting and Localization Ba.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\A5KNKDQW\\Yang et al. - 2017 - Multiple Sound Source Counting and Localization Ba.pdf:application/pdf},
}
@inproceedings{arai_estimating_2003,
title = {Estimating number of speakers by the modulation characteristics of speech},
volume = {2},
isbn = {978-0-7803-7663-2},
doi = {10.1109/ICASSP.2003.1202328},
abstract = {A method for estimating number of speakers of mixed speech signals was proposed. The algorithm was based on the modulation characteristics of speech, specifically that a single speech utterance typically has a distinct modulation pattern with a peak around 4-5 Hz. Having observed that the modulation peak decreases as number of speakers increases, our estimation algorithm used the region of the modulation frequency between 2 and 8 Hz. We obtained a novel parameter we called “equivalent number of speakers” to estimate the number of simultaneous speakers when speech signals contain multiple speakers.},
language = {en},
booktitle = {{IEEE} {International} {Conference} on {Acoustics}, {Speech}, and {Signal} {Processing}},
author = {Arai, Takayuki},
year = {2003},
keywords = {lu},
pages = {197--200},
file = {Arai - 2003 - Estimating number of speakers by the modulation ch.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\85VA3DGU\\Arai - 2003 - Estimating number of speakers by the modulation ch.pdf:application/pdf},
}
@techreport{habets_room_2006,
title = {Room impulse response generator},
institution = {Technische Universiteit Eindhoven},
author = {Habets, Emanuel A. P.},
year = {2006},
file = {rir_generator.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\U6W3GTW5\\rir_generator.pdf:application/pdf},
}
@article{schmidt_multiple_1986,
title = {Multiple emitter location and signal parameter estimation},
volume = {34},
issn = {0018-926X},
doi = {10.1109/TAP.1986.1143830},
abstract = {Processing the signals received on an array of sensors for the location of the emitter is of great enough interest to have been treated under many special case assumptions. The general problem considers sensors with arbitrary locations and arbitrary directional characteristics (gain/phase/polarization) in a noise/interference environment of arbitrary covariance matrix. This report is concerned first with the multiple emitter aspect of this problem and second with the generality of solution. A description is given of the multiple signal classification (MUSIC) algorithm, which provides asymptotically unbiased estimates of 1) number of incident wavefronts present; 2) directions of arrival (DOA) (or emitter locations); 3) strengths and cross correlations among the incident waveforms; 4) noise/interference strength. Examples and comparisons with methods based on maximum likelihood (ML) and maximum entropy (ME), as well as conventional beamforming are included. An example of its use as a multiple frequency estimator operating on time series is included.},
number = {3},
journal = {IEEE Transactions on Antennas and Propagation},
author = {Schmidt, Ralph},
month = mar,
year = {1986},
keywords = {Adaptive arrays, Direction of arrival estimation, Direction-of-arrival estimation, Frequency estimation, Interference, Multiple signal classification, non-lu, Parameter estimation, Polarization, Sensor arrays, Sensor phenomena and characterization, Signal processing, Signal processing antennas, Working environment noise},
pages = {276--280},
file = {IEEE Xplore Abstract Record:C\:\\Users\\RQML4978\\Zotero\\storage\\K2W5KTQI\\1143830.html:text/html;IEEE Xplore Full Text PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\YZJPU4U2\\Schmidt - 1986 - Multiple emitter location and signal parameter est.pdf:application/pdf},
}
@article{hou_squared_2016,
title = {Squared earth mover's distance-based loss for training deep neural networks},
abstract = {In the context of single-label classification, despite the huge success of deep learning, the commonly used crossentropy loss function ignores the intricate inter-class relationships that often exist in real-life tasks such as age classification. In this work, we propose to leverage these relationships between classes by training deep nets with the exact squared Earth Mover’s Distance (also known as Wasserstein distance) for single-label classification. The EMD2 loss uses the predicted probabilities of all classes and penalizes the miss-predictions according to a ground distance matrix that quantifies the dissimilarities between classes. We demonstrate that on datasets with strong inter-class relationships such as an ordering between classes, our exact EMD2 losses yield new state-of-the-art results. Furthermore, we propose a method to automatically learn this matrix using the CNN’s own features during training. We show that our method can learn a ground distance matrix efficiently with no inter-class relationship priors and yield the same performance gain. Finally, we show that our method can be generalized to applications that lack strong interclass relationships and still maintain state-of-the-art performance. Therefore, with limited computational overhead, one can always deploy the proposed loss function on any dataset over the conventional cross-entropy.},
language = {en},
journal = {arXiv:1611.05916},
author = {Hou, Le and Yu, Chen-Ping and Samaras, Dimitris},
month = nov,
year = {2016},
keywords = {lu},
file = {Hou et al. - 2016 - Squared Earth Mover's Distance-based Loss for Trai.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\93ZHVN7F\\Hou et al. - 2016 - Squared Earth Mover's Distance-based Loss for Trai.pdf:application/pdf},
}
@article{knapp_generalized_1976,
title = {The generalized correlation method for estimation of time delay},
volume = {24},
issn = {0096-3518},
doi = {10.1109/TASSP.1976.1162830},
abstract = {A maximum likelihood (ML) estimator is developed for determining time delay between signals received at two spatially separated sensors in the presence of uncorrelated noise. This ML estimator can be realized as a pair of receiver prefilters followed by a cross correlator. The time argument at which the correlator achieves a maximum is the delay estimate. The ML estimator is compared with several other proposed processors of similar form. Under certain conditions the ML estimator is shown to be identical to one proposed by Hannan and Thomson [10] and MacDonald and Schultheiss [21]. Qualitatively, the role of the prefilters is to accentuate the signal passed to the correlator at frequencies for which the signal-to-noise (S/N) ratio is highest and, simultaneously, to suppress the noise power. The same type of prefiltering is provided by the generalized Eckart filter, which maximizes the S/N ratio of the correlator output. For low S/N ratio, the ML estimator is shown to be equivalent to Eckart prefiltering.},
number = {4},
journal = {IEEE Transactions on Acoustics, Speech, and Signal Processing},
author = {Knapp, Charles and Carter, G. Clifford},
month = aug,
year = {1976},
keywords = {non-lu},
pages = {320--327},
file = {IEEE Xplore Abstract Record:C\:\\Users\\RQML4978\\Zotero\\storage\\XWS8HP6Q\\1162830.html:text/html;IEEE Xplore Full Text PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\RFTA2NH4\\Knapp and Carter - 1976 - The generalized correlation method for estimation .pdf:application/pdf},
}
@article{jacobsen_note_1991,
title = {A note on instantaneous and time-averaged active and reactive sound intensity},
volume = {147},
issn = {0022460X},
doi = {10.1016/0022-460X(91)90496-7},
language = {en},
number = {3},
urldate = {2018-10-09},
journal = {Journal of Sound and Vibration},
author = {Jacobsen, Finn},
month = jun,
year = {1991},
keywords = {lu},
pages = {489--496},
file = {Jacobsen - 1991 - A note on instantaneous and time-averaged active a.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\J4YL3SVR\\Jacobsen - 1991 - A note on instantaneous and time-averaged active a.pdf:application/pdf},
}
@inproceedings{he_joint_2018,
title = {Joint localization and classification of multiple sound sources using a multi-task neural network},
abstract = {We propose a novel multi-task neural network-based approach for joint sound source localization and speech/non-speech classification in noisy environments. The network takes raw short time Fourier transform as input and outputs the likelihood values for the two tasks, which are used for the simultaneous detection, localization and classification of an unknown number of overlapping sound sources, Tested with real recorded data, our method achieves significantly better performance in terms of speech/non-speech classification and localization of speech sources, compared to method that performs localization and classification separately. In addition, we demonstrate that incorporating the temporal context can further improve the performance.},
language = {en},
urldate = {2018-11-22},
booktitle = {Interspeech},
author = {He, Weipeng and Motlicek, Petr and Odobez, Jean-Marc},
year = {2018},
keywords = {lu},
pages = {312--316},
file = {He et al. - 2018 - Joint Localization and Classification of Multiple .pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\Z8KXJUSB\\He et al. - 2018 - Joint Localization and Classification of Multiple .pdf:application/pdf;He et al. - 2018 - Joint Localization and Classification of Multiple .pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\SYBZYA4I\\He et al. - 2018 - Joint Localization and Classification of Multiple .pdf:application/pdf},
}
@phdthesis{moreau_etude_2006,
address = {Le Mans},
title = {Étude et réalisation d’outils avancés d’encodage spatial pour la technique de spatialisation sonore {Higher} {Order} {Ambisonics} : microphone {3D} et contrôle de distance},
language = {fr},
school = {Université du Maine},
author = {Moreau, Sébastien},
year = {2006},
keywords = {partiel-lu},
file = {Library Catalog Entry Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\UVS59NXC\\SRCH.html:text/html;Moreau - Étude et réalisation d’outils avancés d’encodage s.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\KDITCCBI\\Moreau - Étude et réalisation d’outils avancés d’encodage s.pdf:application/pdf},
}
@inproceedings{li_online_2018,
title = {Online direction of arrival estimation based on deep learning},
isbn = {978-1-5386-4658-8},
doi = {10.1109/ICASSP.2018.8461386},
abstract = {Direction of arrival (DOA) estimation is an important topic in microphone array processing. Conventional methods work well in relatively clean conditions but suffer from noise and reverberation distortions. Recently, deep learning-based methods show the robustness to noise and reverberation. However, the performance is degraded rapidly or even model cannot work when microphone array structure changes. So it has to retrain the model with new data, which is a huge work. In this paper, we propose a supervised learning algorithm for DOA estimation combining convolutional neural network (CNN) and long short term memory (LSTM). Experimental results show that the proposed method can improve the accuracy significantly. In addition, due to an input feature design, the proposed method can adapt to a new microphone array conveniently only use a very small amount of data.},
language = {en},
booktitle = {{IEEE} {International} {Conference} on {Acoustics}, {Speech} and {Signal} {Processing}},
author = {Li, Qinglong and Zhang, Xueliang and Li, Hao},
month = apr,
year = {2018},
keywords = {lu},
pages = {2616--2620},
file = {Li et al. - 2018 - Online Direction of Arrival Estimation Based on De.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\E6GCSTTT\\Li et al. - 2018 - Online Direction of Arrival Estimation Based on De.pdf:application/pdf},
}
@inproceedings{xiao_learning-based_2015,
title = {A learning-based approach to direction of arrival estimation in noisy and reverberant environments},
doi = {10.1109/ICASSP.2015.7178484},
abstract = {This paper presents a learning-based approach to the task of direction of arrival estimation (DOA) from microphone array input. Traditional signal processing methods such as the classic least square (LS) method rely on strong assumptions on signal models and accurate estimations of time delay of arrival (TDOA) . They only work well in relatively clean conditions, but suffer from noise and reverberation distortions. In this paper, we propose a learning-based approach that can learn from a large amount of simulated noisy and reverberant microphone array inputs for robust DOA estimation. Specifically, we extract features from the generalised cross correlation (GCC) vectors and use a multilayer perceptron neural network to learn the nonlinear mapping from such features to the DOA. One advantage of the learning based method is that as more and more training data becomes available, the DOA estimation will become more and more accurate. Experimental results on simulated data show that the proposed learning based method produces much better results than the state-of-the-art LS method. The testing results on real data recorded in meeting rooms show improved root-mean-square error (RMSE) compared to the LS method.},
booktitle = {{IEEE} {International} {Conference} on {Acoustics}, {Speech} and {Signal} {Processing}},
author = {Xiao, Xiong and Zhao, Shengkui and Zhong, Xionghu and Jones, Douglas L. and Chng, Eng S. and Li, Haizhou},
year = {2015},
keywords = {lu},
pages = {2814--2818},
file = {IEEE Xplore Abstract Record:C\:\\Users\\RQML4978\\Zotero\\storage\\BP2U7Q7A\\7178484.html:text/html;IEEE Xplore Full Text PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\DMFDRYEJ\\Xiao et al. - 2015 - A learning-based approach to direction of arrival .pdf:application/pdf},
}
@inproceedings{nicol_sound_2010,
title = {Sound spatialization by higher order {Ambisonics}: encoding and decoding a sound scene in practice from a theoretical point of view},
abstract = {An overview of HOA technology is presented. First, HOA defines a format of spatial audio which has many attractive properties, such as scalability and flexibility. Besides, this format is independent of the encoding (i.e. microphone signals) and decoding (i.e. loudspeaker signals) formats. Second, HOA provides tools to record, or create, and render a spatial sound scene. These tools, which rely on a specific encoding and decoding of spatial information, will be analysed and discussed from a both theoretical and practical point of view. Third, the final issue is the assessment of the virtual sound scene that is (re)created by HOA. The toolkit of available methodologies and criteria is examined.},
language = {en},
booktitle = {International {Symposium} on {Ambisonics} and {Spherical} {Acoustics}},
author = {Nicol, Rozenn},
year = {2010},
keywords = {non-lu},
pages = {9},
file = {Nicol - 2010 - Orange Labs TECHOPERATPS 2 Avenue Pierre Marzin,.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\EHFBR5TG\\Nicol - 2010 - Orange Labs TECHOPERATPS 2 Avenue Pierre Marzin,.pdf:application/pdf},
}
@inproceedings{kitic_tramp:_2018,
title = {{TRAMP}: {TRacking} by a realtime {AMbisonic}-based {Particle} filter},
abstract = {This article presents a multiple sound source localization and tracking system, fed by the Eigenmike array. The First Order Ambisonics (FOA) format is used to build a pseudointensity-based spherical histogram, from which the source position estimates are deduced. These instantaneous estimates are processed by a well-known tracking system relying on a set of particle filters. While the novelty within localization and tracking is incremental, the fully-functional, complete and real-time running system based on these algorithms is proposed for the first time. As such, it could serve as an additional baseline method of the LOCATA challenge.},
language = {en},
booktitle = {{LOCATA} {Challenge} {Workshop}},
author = {Kitic, Srdan and Guérin, Alexandre},
year = {2018},
keywords = {lu},
file = {Guérin - 2018 - Orange Labs 4 Rue du Clos Courtel 35510 Cesson-Sév.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\V85SDXXK\\Guérin - 2018 - Orange Labs 4 Rue du Clos Courtel 35510 Cesson-Sév.pdf:application/pdf},
}
@inproceedings{vesperini_neural_2016,
title = {A neural network based algorithm for speaker localization in a multi-room environment},
doi = {10.1109/MLSP.2016.7738817},
abstract = {A Speaker Localization algorithm based on Neural Networks for multi-room domestic scenarios is proposed in this paper. The approach is fully data-driven and employs a Neural Network fed by GCC-PHAT (Generalized Cross Correlation Phase Transform) Patterns, calculated by means of the microphone signals, to determine the speaker position in the room under analysis. In particular, we deal with a multi-room case study, in which the acoustic scene of each room is influenced by sounds emitted in the other rooms. The algorithm is tested against the home recorded DIRHA dataset, characterized by multiple wall and ceiling microphone signals for each room. In particular, we focused on the speaker localization problem in two distinct neighbouring rooms. We assumed the presence of an Oracle multi-room Voice Activity Detector (VAD) in our experiments. A three-stage optimization procedure has been adopted to find the best network configuration and GCC-PHAT Patterns combination. Moreover, an algorithm based on Time Difference of Arrival (TDOA), recently proposed in literature for the addressed applicative context, has been considered as term of comparison. As result, the proposed algorithm outperforms the reference one, providing an average localization error, expressed in terms of RMSE, equal to 525 mm against 1465 mm. Concluding, we also assessed the algorithm performance when a real VAD, recently proposed by some of the authors, is used. Even though a degradation of localization capability is registered (an average RMSE equal to 770 mm), still a remarkable improvement with respect to the state of the art performance is obtained.},
booktitle = {{IEEE} {International} {Workshop} on {Machine} {Learning} for {Signal} {Processing}},
author = {Vesperini, Fabio and Vecchiotti, Paolo and Principi, Emanuele and Squartini, Stefano and Piazza, Francesco},
year = {2016},
keywords = {lu},
pages = {1--6},
file = {IEEE Xplore Abstract Record:C\:\\Users\\RQML4978\\Zotero\\storage\\WWKEZENN\\7738817.html:text/html;IEEE Xplore Full Text PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\MJVSURQC\\Vesperini et al. - 2016 - A neural network based algorithm for speaker local.pdf:application/pdf},
}
@inproceedings{xu_crowd++:_2013,
title = {Crowd++: unsupervised speaker count with smartphones},
abstract = {Smartphones are excellent mobile sensing platforms, with the microphone in particular being exercised in several audio inference applications. We take smartphone audio inference a step further and demonstrate for the first time that it’s possible to accurately estimate the number of people talking in a certain place – with an average error distance of 1.5 speakers – through unsupervised machine learning analysis on audio segments captured by the smartphones. Inference occurs transparently to the user and no human intervention is needed to derive the classification model. Our results are based on the design, implementation, and evaluation of a system called Crowd++, involving 120 participants in 10 very different environments. We show that no dedicated external hardware or cumbersome supervised learning approaches are needed but only off-the-shelf smartphones used in a transparent manner. We believe our findings have profound implications in many research fields, including social sensing and personal wellbeing assessment.},
language = {en},
booktitle = {{ACM} {International} {Joint} {Conference} on {Pervasive} and {Ubiquitous} {Computing}},
author = {Xu, Chenren and Li, Sugang and Liu, Gang and Zhang, Yanyong and Miluzzo, Emiliano and Chen, Yih-Farn and Li, Jun and Firner, Bernhard},
year = {2013},
keywords = {lu},
pages = {43--52},
file = {Xu et al. - Crowd++ Unsupervised Speaker Count with Smartphon.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\H3LEJAQU\\Xu et al. - Crowd++ Unsupervised Speaker Count with Smartphon.pdf:application/pdf},
}
@inproceedings{larnel_bref_1991,
title = {{BREF}, a large vocabulary spoken corpus for {French}},
abstract = {This paper presents some of the design considerations of BREF, a large read-speech corpus for French. BREF was designed to provide continuous speech data for the development of dictation machines, for the evaluation of continuous speech recognition systems (both speaker-dependent and speakerindependent), and for the study of phonological variations. The texts to be read were selected from 5 million words of the French newspaper, Le Monde. In total, 11,000 texts were selected, with selection criteria that emphasisized maximizing the number of distinct triphones. Separate text materials were selected for training and test corpora. Ninety speakers have been recorded, each providing between 5,000 and 10,000 words (approximately 40-70 min.) of speech.},
booktitle = {Eurospeech},
author = {Lamel, Lori F. and Gauvain, Jean-Luc and Eskénazi, Maxine},
year = {1991},
keywords = {Expect, Selection (user interface), Speech corpus, Speech recognition, Text corpus, Time-compressed speech, Triphone, Vocabulary},
file = {Full Text PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\2NKUZWBU\\Larnel et al. - 1991 - BREF, a large vocabulary spoken corpus for French.pdf:application/pdf},
}
@inproceedings{wei_determining_2018,
title = {Determining number of speakers from single microphone speech signals by multi-label convolutional neural network},
doi = {10.1109/IECON.2018.8592773},
abstract = {This paper presents a multi-label convolutional neural network approach to determine the number of speakers when using a single microphone which is more challenging than when using multiple microphones. Spectrograms of windowed noisy speech signals for 1talker, 2talkers and 3+talkers are used as inputs to a multi-label convolutional neural network. The architecture of the developed multi-label convolutional neural network is discussed and it is shown that this network with median filtering can achieve an overall accuracy of about 81\% for the noisy speech dataset examined.},
booktitle = {Annual {Conference} of the {IEEE} {Industrial} {Electronics} {Society}},
author = {Wei, Haoran and Kehtarnavaz, Nasser},
month = oct,
year = {2018},
keywords = {Acoustics, Conferences, convolutional neural nets, Convolutional neural networks, determining number of speakers, Filtering, lu, median filtering, median filters, microphones, Microphones, multi-label convolutional neural network, multilabel convolutional neural network approach, noisy speech signals, single microphone speech signals, Spectrogram, speech enhancement, speech processing, Speech processing},
pages = {2706--2710},
file = {[email protected]:C\:\\Users\\RQML4978\\Zotero\\storage\\7E2J7JRY\\[email protected]:application/pdf},
}
@inproceedings{stoter_classification_2018,
title = {Classification vs. regression in supervised learning for single channel speaker count estimation},
doi = {10.1109/ICASSP.2018.8462159},
abstract = {The task of estimating the maximum number of concurrent speakers from single channel mixtures is important for various audio-based applications, such as blind source separation, speaker diarisation, audio surveillance or auditory scene classification. Building upon powerful machine learning methodology, we develop a Deep Neural Network (DNN) that estimates a speaker count. While DNNs efficiently map input representations to output targets, it remains unclear how to best handle the network output to infer integer source count estimates, as a discrete count estimate can either be tackled as a regression or a classification problem. In this paper, we investigate this important design decision and also address complementary parameter choices such as the input representation. We evaluate a state-of-the-art DNN audio model based on a Bi-directional Long Short-Term Memory network architecture for speaker count estimations. Through experimental evaluations aimed at identifying the best overall strategy for the task and show results for five seconds speech segments in mixtures of up to ten speakers.},
booktitle = {{IEEE} {International} {Conference} on {Acoustics}, {Speech} and {Signal} {Processing}},
author = {St\"{o}ter, Fabian-Robert and Chakrabarty, Soumitro and Edler, Bernd and Habets, Emanuel A. P.},
year = {2018},
keywords = {acoustic signal processing, audio based applications, audio signal processing, audio surveillance, auditory scene classification, bidirectional long short term memory network architecture, blind source separation, channel estimation, Channel estimation, classification problem, cocktail-party, Computer architecture, concurrent speakers, deep neural network, design decision, discrete count estimate, DNN audio model, Estimation, integer source count estimates, learning (artificial intelligence), lu, Machine learning, map input representations, maximum number, network output, neural nets, Neural networks, number of concurrent speakers, output targets, overlapped speech, pattern classification, powerful machine learning methodology, regression analysis, regression problem, single channel mixtures, single channel speaker count estimation, speaker count estimation, speaker diarisation, speaker recognition, speech segments, supervised learning, Task analysis, Training},
pages = {436--440},
file = {arXiv Fulltext PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\4BL2RGGJ\\Stöter et al. - 2018 - Classification vs. Regression in Supervised Learni.pdf:application/pdf;arXiv.org Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\E2NLA3V7\\1712.html:text/html},
}
@article{sayoud_proposal_2010,
title = {Proposal of a new confidence parameter estimating the number of speakers - an experimental investigation},
volume = {1},
abstract = {Abstract. Is it possible to know how many speakers are speaking simultaneously in case of speech overlap? If the human brain, creation not yet mastered, manages to do it and even to understand the mixed speech meaning, it is not yet the case for the existing systems of automatic speaker recognition. In practice, these systems present a strong degradation in such situations. For this task, we propose a new method able to estimate the number of speakers in a mixture of speech signals. The algorithm developed here is based on the computation of the statistical characteristic of the 7th Mel coefficient extracted by spectral analysis from the speech signal. This algorithm using a confidence parameter, which we called PENS, is tested on seven different sets of the ORATOR database, where each set contains seven multi-speaker files. Results show that the PENS parameter permits us to make a good discrimination, without any ambiguity, between a mono-speaker signal (only one speaker is speaking) and a mixed-speakers signal (several speakers are speaking simultaneously). Moreover, it permits us to estimate, in case of mixed speech signals, the number of speakers with a good precision, especially when the number of speakers is less than four.},
number = {2},
journal = {Journal of Information Hiding and Multimedia Signal Processing},
author = {Sayoud, Halim and Ouamour, Siham},
month = apr,
year = {2010},
keywords = {lu},
file = {Citeseer - Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\9BSGC2K7\\summary.html:text/html;Sayoud and Ouamour - 2010 - Proposal of a New Confidence Parameter Estimating .pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\DG3RFZBH\\Sayoud and Ouamour - 2010 - Proposal of a New Confidence Parameter Estimating .pdf:application/pdf},
}
@article{arberet_robust_2010,
title = {A robust method to count and locate audio sources in a multichannel underdetermined mixture},
volume = {58},
issn = {1053-587X},
doi = {10.1109/TSP.2009.2030854},
abstract = {We propose a method to count and estimate the mixing directions in an underdetermined multichannel mixture. The approach is based on the hypothesis that in the neighborhood of \textit{some} time-frequency points, only one source essentially contributes to the mixture: such time-frequency points can provide robust local estimates of the corresponding source direction. At the core of our contribution is a statistical model to exploit a local confidence measure, which detects the time-frequency regions where such robust information is available. A clustering algorithm called DEMIX is proposed to merge the information from all time-frequency regions according to their confidence level. So as to estimate the delays of anechoic mixtures and overcome the intrinsic ambiguities of phase unwrapping as met with DUET, we propose a technique similar to GCC-PHAT that is able to estimate delays that can largely exceed one sample. We propose an extensive experimental study that shows the resulting method is more robust in conditions where all DUET-like comparable methods fail, that is, in particular, a) when time-delays largely exceed one sample and b) when the source directions are very close.},
number = {1},
journal = {IEEE Transactions on Signal Processing},
author = {Arberet, Simon and Gribonval, Rémi and Bimbot, Frédéric},
year = {2010},
keywords = {Audio recording, audio signals, audio sources, Biomedical imaging, blind source separation, Blind source separation, Clustering algorithms, delay estimation, Delay estimation, DEMIX, direction of arrival, Direction of arrival estimation, direction-of-arrival estimation, lu, mixing directions, multichannel audio, multichannel underdetermined mixture, Phase estimation, Robustness, signal sources, Source separation, sparse component analysis, sparse matrices, Speech processing, Time frequency analysis, time-delays},
pages = {121--133},
file = {RR-6593.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\ZM7N943E\\RR-6593.pdf:application/pdf},
}
@article{anguera_speaker_2012,
title = {Speaker diarization: a review of recent research},
volume = {20},
issn = {1558-7916},
shorttitle = {Speaker {Diarization}},
doi = {10.1109/TASL.2011.2125954},
abstract = {Speaker diarization is the task of determining “who spoke when?” in an audio or video recording that contains an unknown amount of speech and also an unknown number of speakers. Initially, it was proposed as a research topic related to automatic speech recognition, where speaker diarization serves as an upstream processing step. Over recent years, however, speaker diarization has become an important key technology for many tasks, such as navigation, retrieval, or higher level inference on audio data. Accordingly, many important improvements in accuracy and robustness have been reported in journals and conferences in the area. The application domains, from broadcast news, to lectures and meetings, vary greatly and pose different problems, such as having access to multiple microphones and multimodal information or overlapping speech. The most recent review of existing technology dates back to 2006 and focuses on the broadcast news domain. In this paper, we review the current state-of-the-art, focusing on research developed since 2006 that relates predominantly to speaker diarization for conference meetings. Finally, we present an analysis of speaker diarization performance as reported through the NIST Rich Transcription evaluations on meeting data and identify important areas for future research.},
number = {2},
journal = {IEEE Transactions on Audio, Speech, and Language Processing},
author = {Anguera, Xavier and Bozonnet, Simon and Evans, Nicholas and Fredouille, Corinne and Friedland, Gerald and Vinyals, Oriol},
month = feb,
year = {2012},
keywords = {Acoustics, Adaptation models, audio data, audio recording, audio signal processing, automatic speech recognition, broadcast news, conference meetings, Data models, information resources, Meetings, Microphones, multimodal information, NIST, NIST Rich Transcription evaluations, non-lu, rich transcription, speaker diarization, speaker recognition, Speech, speech overlapping, Speech recognition, teleconferencing, television broadcasting, upstream processing, video recording},
pages = {356--370},
file = {IEEE Xplore Abstract Record:C\:\\Users\\RQML4978\\Zotero\\storage\\LY5XV48K\\6135543.html:text/html;IEEE Xplore Full Text PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\55LN3XWQ\\Anguera et al. - 2012 - Speaker Diarization A Review of Recent Research.pdf:application/pdf},
}
@article{adavanne_localization_2019,
title = {Localization, detection and tracking of multiple moving sound sources with a convolutional recurrent neural network},
	abstract = {This paper investigates the joint localization, detection, and tracking of sound events using a convolutional recurrent neural network (CRNN). We use a CRNN previously proposed for the localization and detection of stationary sources, and show that the recurrent layers enable the spatial tracking of moving sources when trained with dynamic scenes. The tracking performance of the CRNN is compared with a stand-alone tracking method that combines a multi-source direction-of-arrival (DOA) estimator and a particle filter. Their respective performance is evaluated in various acoustic conditions such as anechoic and reverberant scenarios, stationary and moving sources at several angular velocities, and with a varying number of overlapping sources. The results show that the CRNN manages to track multiple sources more consistently than the parametric method across acoustic scenarios, but at the cost of higher localization error.},
language = {en},
urldate = {2019-04-30},
journal = {arXiv:1904.12769},
author = {Adavanne, Sharath and Politis, Archontis and Virtanen, Tuomas},
month = apr,
year = {2019},
keywords = {Computer Science - Machine Learning, Computer Science - Sound, Electrical Engineering and Systems Science - Audio and Speech Processing, lu},
file = {Adavanne et al. - 2019 - Localization, Detection and Tracking of Multiple M.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\APNFVL6N\\Adavanne et al. - 2019 - Localization, Detection and Tracking of Multiple M.pdf:application/pdf},
}
@article{purwins_deep_2019,
title = {Deep learning for audio signal processing},
volume = {13},
issn = {1932-4553, 1941-0484},
doi = {10.1109/JSTSP.2019.2908700},
abstract = {Given the recent surge in developments of deep learning, this article provides a review of the state-of-the-art deep learning techniques for audio signal processing. Speech, music, and environmental sound processing are considered side-by-side, in order to point out similarities and differences between the domains, highlighting general methods, problems, key references, and potential for cross-fertilization between areas. The dominant feature representations (in particular, log-mel spectra and raw waveform) and deep learning models are reviewed, including convolutional neural networks, variants of the long short-term memory architecture, as well as more audio-specific neural network models. Subsequently, prominent deep learning application areas are covered, i.e. audio recognition (automatic speech recognition, music information retrieval, environmental sound detection, localization and tracking) and synthesis and transformation (source separation, audio enhancement, generative models for speech, sound, and music synthesis). Finally, key issues and future questions regarding deep learning applied to audio signal processing are identified.},
number = {2},
journal = {IEEE Journal of Selected Topics in Signal Processing},
author = {Purwins, Hendrik and Li, Bo and Virtanen, Tuomas and Schlüter, Jan and Chang, Shuo-yiin and Sainath, Tara},
month = apr,
year = {2019},
keywords = {lu},
pages = {206--219},
file = {arXiv\:1905.00078 PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\KHUSBDWQ\\Purwins et al. - 2019 - Deep Learning for Audio Signal Processing.pdf:application/pdf;arXiv.org Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\YDMQEZL4\\1905.html:text/html},
}
@inproceedings{von_neumann_all-neural_2019,
title = {All-neural online source separation, counting, and diarization for meeting analysis},
doi = {10.1109/ICASSP.2019.8682572},
abstract = {Automatic meeting analysis comprises the tasks of speaker counting, speaker diarization, and the separation of overlapped speech, followed by automatic speech recognition. This all has to be carried out on arbitrarily long sessions and, ideally, in an online or block-online manner. While significant progress has been made on individual tasks, this paper presents for the first time an all-neural approach to simultaneous speaker counting, diarization and source separation. The NN-based estimator operates in a block-online fashion and tracks speakers even if they remain silent for a number of time blocks, thus learning a stable output order for the separated sources. The neural network is recurrent over time as well as over the number of sources. The simulation experiments show that state of the art separation performance is achieved, while at the same time delivering good diarization and source counting results. It even generalizes well to an unseen large number of blocks.},
booktitle = {{IEEE} {International} {Conference} on {Acoustics}, {Speech} and {Signal} {Processing}},
author = {von Neumann, Thilo and Kinoshita, Keisuke and Delcroix, Marc and Araki, Shoko and Nakatani, Tomohiro and Haeb-Umbach, Reinhold},
month = may,
year = {2019},
keywords = {Artificial neural networks, Blind source separation, Estimation, Indexes, meeting diarization, neural network, non-lu, online processing, source counting, Source separation, Speech recognition, Task analysis},
pages = {91--95},
file = {Neumann et al. - 2019 - All-neural Online Source Separation, Counting, and.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\Z79VEYD8\\Neumann et al. - 2019 - All-neural Online Source Separation, Counting, and.pdf:application/pdf},
}
@article{oord_wavenet:_2016,
title = {{WaveNet}: a generative model for raw audio},
	abstract = {This paper introduces WaveNet, a deep neural network for generating raw audio waveforms. The model is fully probabilistic and autoregressive, with the predictive distribution for each audio sample conditioned on all previous ones; nonetheless we show that it can be efficiently trained on data with tens of thousands of samples per second of audio. When applied to text-to-speech, it yields state-of-the-art performance, with human listeners rating it as significantly more natural sounding than the best parametric and concatenative systems for both English and Mandarin. A single WaveNet can capture the characteristics of many different speakers with equal fidelity, and can switch between them by conditioning on the speaker identity. When trained to model music, we find that it generates novel and often highly realistic musical fragments. We also show that it can be employed as a discriminative model, returning promising results for phoneme recognition.},
language = {en},
journal = {arXiv:1609.03499},
author = {Oord, Aaron van den and Dieleman, Sander and Zen, Heiga and Simonyan, Karen and Vinyals, Oriol and Graves, Alex and Kalchbrenner, Nal and Senior, Andrew and Kavukcuoglu, Koray},
month = sep,
year = {2016},
keywords = {Computer Science - Machine Learning, Computer Science - Sound, lu},
file = {Oord et al. - 2016 - WaveNet A Generative Model for Raw Audio.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\Z8FP47L6\\Oord et al. - 2016 - WaveNet A Generative Model for Raw Audio.pdf:application/pdf},
}
@book{chollet_deep_2017,
title = {Deep learning with {Python}},
isbn = {978-1-61729-443-3},
language = {en},
publisher = {Simon and Schuster},
author = {Chollet, François},
year = {2017},
keywords = {lu, Machine learning, Neural networks (Computer science), Python (Computer program language)},
file = {Chollet - 2018 - Deep learning with Python.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\S8KTC9RZ\\Chollet - 2018 - Deep learning with Python.pdf:application/pdf},
}
@article{silver_mastering_2017,
title = {Mastering chess and shogi by self-play with a general reinforcement learning algorithm},
abstract = {The game of chess is the most widely-studied domain in the history of artificial intelligence. The strongest programs are based on a combination of sophisticated search techniques, domain-specific adaptations, and handcrafted evaluation functions that have been refined by human experts over several decades. In contrast, the AlphaGo Zero program recently achieved superhuman performance in the game of Go, by tabula rasa reinforcement learning from games of self-play. In this paper, we generalise this approach into a single AlphaZero algorithm that can achieve, tabula rasa, superhuman performance in many challenging domains. Starting from random play, and given no domain knowledge except the game rules, AlphaZero achieved within 24 hours a superhuman level of play in the games of chess and shogi (Japanese chess) as well as Go, and convincingly defeated a world-champion program in each case.},
journal = {arXiv:1712.01815},
author = {Silver, David and Hubert, Thomas and Schrittwieser, Julian and Antonoglou, Ioannis and Lai, Matthew and Guez, Arthur and Lanctot, Marc and Sifre, Laurent and Kumaran, Dharshan and Graepel, Thore and Lillicrap, Timothy and Simonyan, Karen and Hassabis, Demis},
month = dec,
year = {2017},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Machine Learning, lu},
file = {arXiv\:1712.01815 PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\U2H9ZHWU\\Silver et al. - 2017 - Mastering Chess and Shogi by Self-Play with a Gene.pdf:application/pdf;arXiv.org Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\L98AT8LM\\1712.html:text/html},
}
@inproceedings{perotin_regression_2019,
title = {Regression versus classification for neural network based audio source localization},
abstract = {We compare the performance of regression and classification neural networks for single-source direction-of-arrival estimation. Since the output space is continuous and structured, regression seems more appropriate. However, classification on a discrete spherical grid is widely believed to perform better and is predominantly used in the literature. For regression, we propose two ways to account for the spherical geometry of the output space based either on the angular distance between spherical coordinates or on the mean squared error between Cartesian coordinates. For classification, we propose two alternatives to the classical one-hot encoding framework: we derive a Gibbs distribution from the squared angular distance between grid points and use the corresponding probabilities either as soft targets or as cross-entropy weights that retain a clear probabilistic interpretation. We show that regression on Cartesian coordinates is generally more accurate, except when localized interference is present, in which case classification appears to be more robust.},
language = {en},
booktitle = {{IEEE} {Workshop} on {Applications} of {Signal} {Processing} to {Audio} and {Acoustics}},
author = {Perotin, Lauréline and Défossez, Alexandre and Vincent, Emmanuel and Serizel, Romain and Guérin, Alexandre},
year = {2019},
keywords = {lu},
file = {waspaa_perotin_camera_ready.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\VLDB6UZK\\waspaa_perotin_camera_ready.pdf:application/pdf},
}
@phdthesis{perotin_localisation_2019,
title = {Localisation et rehaussement de sources de parole au format {Ambisonique}},
	language = {fr},
school = {Université de Lorraine},
author = {Perotin, Lauréline},
month = oct,
year = {2019},
keywords = {partiel-lu},
file = {Perotin - 2019 - Localisation et rehaussement de sources de parole .pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\DH3UF3GL\\Perotin - 2019 - Localisation et rehaussement de sources de parole .pdf:application/pdf},
}
@inproceedings{roden_sound_2015,
title = {On sound source localization of speech signals using deep neural networks},
copyright = {http://rightsstatements.org/vocab/InC/1.0/},
isbn = {978-3-939296-08-9},
	abstract = {In recent years, artificial neural networks have been successfully applied, especially in the context of automatic speech recognition. As information processing systems, neural networks are trained by, e.g., backpropagation or restricted Boltzmann machines to classify patterns at the input of the system. The current work presents the implementation of a deep neural network (DNN) architecture for acoustic source localization.},
language = {en},
booktitle = {Deutsche {Jahrestagung} für {Akustik}},
author = {Roden, Reinhild and Moritz, Niko and Gerlach, Stephan and Weinzierl, Stefan and Goetze, Stefan},
year = {2015},
keywords = {lu},
file = {Full Text PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\N69VE3R2\\Roden et al. - 2015 - On sound source localization of speech signals usi.pdf:application/pdf;Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\5EMA4FWE\\9746.html:text/html},
}
@inproceedings{zermini_deep_2016,
title = {Deep neural network based audio source separation},
	abstract = {Audio source separation aims to extract individual sources from mixtures of multiple sound sources. Many techniques have been developed such as independent component analysis, computational auditory scene analysis, and non-negative matrix factorisation. A method based on Deep Neural Networks (DNNs) and time-frequency (T-F) masking has been recently developed for binaural audio source separation. In this method, the DNNs are used to predict the Direction Of Arrival (DOA) of the audio sources with respect to the listener which is then used to generate soft T-F masks for the recovery/estimation of the individual audio sources.},
booktitle = {{IMA} {International} {Conference} on {Mathematics} in {Signal} {Processing}},
author = {Zermini, Alfredo and Yu, Yingfu and Xu, Yong and Wang, Wenwu and Plumbley, Mark D.},
year = {2016},
keywords = {lu, non-direct},
file = {Zermini et al. - 2016 - Deep neural network based audio source separation.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\R5V9PPQL\\Zermini et al. - 2016 - Deep neural network based audio source separation.pdf:application/pdf},
}
@article{yalta_sound_2017,
title = {Sound source localization using deep learning models},
volume = {29},
doi = {10.20965/jrm.2017.p0037},
number = {1},
urldate = {2020-03-19},
journal = {Journal of Robotics and Mechatronics},
author = {Yalta, Nelson and Nakadai, Kazuhiro and Ogata, Tetsuya},
year = {2017},
keywords = {lu},
pages = {37--48},
file = {Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\QUWEG52T\\robot002900010037.html:text/html;Yalta et al. - 2017 - Sound Source Localization Using Deep Learning Mode.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\C9HEVQIM\\Yalta et al. - 2017 - Sound Source Localization Using Deep Learning Mode.pdf:application/pdf},
}
@article{suvorov_deep_2018,
title = {Deep residual network for sound source localization in the time domain},
	abstract = {This study presents a system for sound source localization in the time domain using a deep residual neural network. Data from the linear 8-channel microphone array with 3 cm spacing is used by the network for direction estimation. We propose to use the deep residual network for sound source localization, considering the localization task as a classification task. This study describes the gathered dataset and the developed architecture of the neural network. We show the training process and its results. The developed system was tested on the validation part of the dataset and on new data captured in real time. The classification accuracy for 30 ms sound frames is 99.2\%. The standard deviation of sound source localization is 4°. The proposed method of sound source localization was tested inside a speech recognition pipeline. Its usage decreased word error rate by 1.14\% in comparison with a similar speech recognition pipeline using GCC-PHAT sound source localization.},
journal = {arXiv:1808.06429},
author = {Suvorov, Dmitry and Dong, Ge and Zhukov, Roman},
month = aug,
year = {2018},
keywords = {lu},
file = {arXiv Fulltext PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\HWHPTCSD\\Suvorov et al. - 2018 - Deep Residual Network for Sound Source Localizatio.pdf:application/pdf;arXiv.org Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\MJVFTJ5E\\1808.html:text/html},
}
@article{he_deep_2018,
title = {Deep neural networks for multiple speaker detection and localization},
doi = {10.1109/ICRA.2018.8461267},
abstract = {We propose to use neural networks for simultaneous detection and localization of multiple sound sources in human-robot interaction. In contrast to conventional signal processing techniques, neural network-based sound source localization methods require fewer strong assumptions about the environment. Previous neural network-based methods have been focusing on localizing a single sound source, which do not extend to multiple sources in terms of detection and localization. In this paper, we thus propose a likelihood-based encoding of the network output, which naturally allows the detection of an arbitrary number of sources. In addition, we investigate the use of sub-band cross-correlation information as features for better localization in sound mixtures, as well as three different network architectures based on different motivations. Experiments on real data recorded from a robot show that our proposed methods significantly outperform the popular spatial spectrum-based approaches.},
journal = {IEEE International Conference on Robotics and Automation},
author = {He, Weipeng and Motlicek, Petr and Odobez, Jean-Marc},
year = {2018},
keywords = {lu},
pages = {74--79},
file = {arXiv Fulltext PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\K35TSYM4\\He et al. - 2018 - Deep Neural Networks for Multiple Speaker Detectio.pdf:application/pdf;arXiv.org Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\D3P98NL3\\1711.html:text/html},
}
@inproceedings{hirvonen_classication_2015,
title = {Classification of spatial audio location and content using convolutional neural networks},
abstract = {This paper investigates the use of Convolutional Neural Networks for spatial audio classification. In contrast to traditional methods that use hand-engineered features and algorithms, we show that a Convolutional Network in combination with generic preprocessing can give good results, and allows for specialization to challenging conditions. The method can adapt to e.g. different source distances and microphone arrays, as well as estimate both spatial location and audio content type jointly. For example, with typical single-source material in a simulated reverberant room, we can achieve cross-validation accuracy of 94.3\% for 40-ms frames across 16 classes (eight spatial directions, content type speech vs. music).},
language = {en},
booktitle = {Audio {Engineering} {Society} {Convention}},
author = {Hirvonen, Toni},
year = {2015},
keywords = {lu},
file = {Hirvonen - 2015 - Classification of Spatial Audio Location and Conten.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\IWKKP9KV\\Hirvonen - 2015 - Classification of Spatial Audio Location and Conten.pdf:application/pdf;Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\MJYRHEX9\\browse.html:text/html},
}
@inproceedings{takeda_discriminative_2016,
title = {Discriminative multiple sound source localization based on deep neural networks using independent location model},
doi = {10.1109/SLT.2016.7846325},
	abstract = {We propose a training method for multiple sound source localization (SSL) based on deep neural networks (DNNs). Such networks function as posterior probability estimator of sound location in terms of position labels and achieve high localization correctness. Since the previous DNNs' configuration for SSL handles one-sound-source cases, it should be extended to multiple-sound-source cases to apply it to real environments. However, a naïve design causes 1) an increase in the number of labels and training data patterns and 2) a lack of label consistency across different numbers of sound sources, such as one and two-or-more-sound cases. These two problems were solved using our proposed method, which involves an independent location model for the former and a block-wise consistent labeling with ordering for the latter. Our experiments indicated that the SSL based on DNNs trained by our proposed training method outperformed a conventional SSL method by a maximum of 18 points in terms of block-level correctness.},
booktitle = {{IEEE} {Spoken} {Language} {Technology} {Workshop}},
author = {Takeda, Ryu and Komatani, Kazunori},
year = {2016},
keywords = {lu},
pages = {603--609},
file = {IEEE Xplore Abstract Record:C\:\\Users\\RQML4978\\Zotero\\storage\\7E7ZDJ5Z\\7846325.html:text/html;IEEE Xplore Full Text PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\RZNETCRE\\Takeda et Komatani - 2016 - Discriminative multiple sound source localization .pdf:application/pdf},
}
@article{vera-diaz_towards_2018,
title = {Towards end-to-end acoustic localization using deep learning: from audio signal to source position coordinates},
volume = {18},
issn = {1424-8220},
shorttitle = {Towards {End}-to-{End} {Acoustic} {Localization} using {Deep} {Learning}},
doi = {10.3390/s18103418},
	abstract = {This paper presents a novel approach for indoor acoustic source localization using microphone arrays and based on a Convolutional Neural Network (CNN). The proposed solution is, to the best of our knowledge, the first published work in which the CNN is designed to directly estimate the three dimensional position of an acoustic source, using the raw audio signal as the input information avoiding the use of hand crafted audio features. Given the limited amount of available localization data, we propose in this paper a training strategy based on two steps. We first train our network using semi-synthetic data, generated from close talk speech recordings, and where we simulate the time delays and distortion suffered in the signal that propagates from the source to the array of microphones. We then fine tune this network using a small amount of real data. Our experimental results show that this strategy is able to produce networks that significantly improve existing localization methods based on SRP-PHAT strategies. In addition, our experiments show that our CNN method exhibits better resistance against varying gender of the speaker and different window sizes compared with the other methods.},
number = {10},
journal = {Sensors},
author = {Vera-Diaz, Juan Manuel and Pizarro, Daniel and Macias-Guarasa, Javier},
year = {2018},
keywords = {lu},
pages = {3418},
file = {arXiv Fulltext PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\LUC4GIH3\\Vera-Diaz et al. - 2018 - Towards End-to-End Acoustic Localization using Dee.pdf:application/pdf;arXiv.org Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\Y2WKDTWD\\1807.html:text/html},
}
@article{salvati_exploiting_2018,
title = {Exploiting {CNNs} for improving acoustic source localization in noisy and reverberant conditions},
volume = {2},
issn = {2471-285X},
doi = {10.1109/TETCI.2017.2775237},
	abstract = {This paper discusses the application of convolutional neural networks (CNNs) to minimum variance distortionless response localization schemes. We investigate the direction of arrival estimation problems in noisy and reverberant conditions using a uniform linear array (ULA). CNNs are used to process the multichannel data from the ULA and to improve the data fusion scheme, which is performed in the steered response power computation. CNNs improve the incoherent frequency fusion of the narrowband response power by weighting the components, reducing the deleterious effects of those components affected by artifacts due to noise and reverberation. The use of CNNs avoids the necessity of previously encoding the multichannel data into selected acoustic cues with the advantage to exploit its ability in recognizing geometrical pattern similarity. Experiments with both simulated and real acoustic data demonstrate the superior localization performance of the proposed SRP beamformer with respect to other state-of-the-art techniques.},
number = {2},
journal = {IEEE Transactions on Emerging Topics in Computational Intelligence},
author = {Salvati, Daniele and Drioli, Carlo and Foresti, Gian Luca},
month = apr,
year = {2018},
keywords = {lu},
pages = {103--116},
file = {IEEE Xplore Abstract Record:C\:\\Users\\RQML4978\\Zotero\\storage\\DR4S4Y4X\\8323305.html:text/html;IEEE Xplore Full Text PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\TWMD5XDG\\Salvati et al. - 2018 - Exploiting CNNs for Improving Acoustic Source Loca.pdf:application/pdf},
}
@inproceedings{thuillier_spatial_2018,
title = {Spatial audio feature discovery with convolutional neural networks},
isbn = {978-1-5386-4658-8},
doi = {10.1109/ICASSP.2018.8462315},
abstract = {The advent of mixed reality consumer products brings about a pressing need to develop and improve spatial sound rendering techniques for a broad user base. Despite a large body of prior work, the precise nature and importance of various sound localization cues and how they should be personalized for an individual user to improve localization performance is still an open research problem. Here we propose training a convolutional neural network (CNN) to classify the elevation angle of spatially rendered sounds and employing Layerwise Relevance Propagation (LRP) on the trained CNN model. LRP provides saliency maps that can be used to identify spectral features used by the network for classification. These maps, in addition to the convolution filters learned by the CNN, are discussed in the context of listening tests reported in the literature. The proposed approach could potentially provide an avenue for future studies on modeling and personalization of head-related transfer functions (HRTFs).},
language = {en},
urldate = {2020-03-26},
booktitle = {{IEEE} {International} {Conference} on {Acoustics}, {Speech} and {Signal} {Processing}},
author = {Thuillier, Etienne and Gamper, Hannes and Tashev, Ivan J.},
year = {2018},
keywords = {lu},
pages = {6797--6801},
file = {Thuillier et al. - 2018 - Spatial Audio Feature Discovery with Convolutional.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\ILSIKGZ5\\Thuillier et al. - 2018 - Spatial Audio Feature Discovery with Convolutional.pdf:application/pdf},
}
@article{ma_phased_2018,
title = {Phased microphone array for sound source localization with deep learning},
volume = {2},
	abstract = {For phased microphone arrays used in sound source localization, an algorithm with both high computational efficiency and high precision is a persistent pursuit. In this paper, a convolutional neural network (CNN), a kind of deep learning, is preliminarily applied as a new algorithm. At high frequencies, the CNN can reconstruct sound source localizations with a spatial resolution as good as DAMAS, within a time as short as conventional beamforming. This exciting result means that the CNN finds the source distribution directly from the cross-spectral matrix without a given propagation function in advance, and thus the CNN deserves to be further explored as a new algorithm.},
number = {2},
journal = {Aerospace Systems},
author = {Ma, Wei and Liu, Xun},
year = {2018},
keywords = {lu},
pages = {71--81},
file = {arXiv Fulltext PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\AMYA67JH\\Ma et Liu - 2018 - Phased Microphone Array for Sound Source Localizat.pdf:application/pdf;arXiv.org Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\XIIWQVEX\\1802.html:text/html},
}
@inproceedings{pertila_robust_2017,
title = {Robust direction estimation with convolutional neural networks based steered response power},
doi = {10.1109/ICASSP.2017.7953333},
abstract = {The steered response power (SRP) methods can be used to build a map of sound direction likelihood. In the presence of interference and reverberation, the map will exhibit multiple peaks with heights related to the corresponding sound's spectral content. Often in realistic use cases, the target of interest (such as speech) can exhibit a lower peak compared to an interference source. This will corrupt any direction dependent method, such as beamforming. Regression has been used to predict time-frequency (TF) regions corrupted by reverberation, and static broadband noise can be efficiently estimated for TF points. TF regions dominated by noise or reverberation can then be de-emphasized to obtain more reliable source direction estimates. In this work, we propose the use of convolutional neural networks (CNNs) for the prediction of a TF mask for emphasizing the direct path speech signal in time-varying interference. SRP with phase transform (SRP-PHAT) combined with the CNN-based masking is shown to be capable of reducing the impact of time-varying interference for speaker direction estimation using real speech sources in reverberation.},
booktitle = {{IEEE} {International} {Conference} on {Acoustics}, {Speech} and {Signal} {Processing}},
author = {Pertilä, Pasi and Cakir, Emre},
month = mar,
year = {2017},
keywords = {lu, non-direct},
pages = {6125--6129},
file = {IEEE Xplore Abstract Record:C\:\\Users\\RQML4978\\Zotero\\storage\\ZC44VLDM\\7953333.html:text/html;IEEE Xplore Full Text PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\S5NCSRWV\\Pertilä et Cakir - 2017 - Robust direction estimation with convolutional neu.pdf:application/pdf},
}
@inproceedings{perotin_crnn-based_2018,
title = {{CRNN}-based joint azimuth and elevation localization with the {Ambisonics} intensity vector},
doi = {10.1109/IWAENC.2018.8521403},
abstract = {We present a source localization system for first-order Ambisonics (FOA) contents based on a stacked convolutional and recurrent neural network (CRNN). We propose to use as input to the CRNN the FOA acoustic intensity vector, which is easy to compute and closely linked to the sound direction of arrival (DoA). The system estimates the DoA of a point source in both azimuth and elevation. We conduct an experimental evaluation in configurations including reverberation, noise, and various speaker w. r. t. microphone orientations. The results show that the proposed architecture and input allow the network to return accurate location estimates in realistic conditions compared to another recent CRNN-based system.},
booktitle = {International {Workshop} on {Acoustic} {Signal} {Enhancement}},
author = {Perotin, Lauréline and Serizel, Romain and Vincent, Emmanuel and Guérin, Alexandre},
month = sep,
year = {2018},
keywords = {lu},
pages = {241--245},
file = {IEEE Xplore Abstract Record:C\:\\Users\\RQML4978\\Zotero\\storage\\24EP5U5G\\8521403.html:text/html;IEEE Xplore Full Text PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\JBSR8ZXL\\Perotin et al. - 2018 - CRNN-based Joint Azimuth and Elevation Localizatio.pdf:application/pdf},
}
@article{xiao_improved_2020,
title = {Improved source counting and separation for monaural mixture},
	abstract = {Single-channel speech separation in the time domain and frequency domain has been widely studied for voice-driven applications over the past few years. Most previous works assume a known number of speakers in advance, which, however, is not easily accessible from a monaural mixture in practice. In this paper, we propose a novel model of single-channel multi-speaker separation by jointly learning the time-frequency feature and the unknown number of speakers. Specifically, our model integrates the time-domain convolution encoded feature map and the frequency-domain spectrogram by an attention mechanism, and the integrated features are projected into high-dimensional embedding vectors which are then clustered with a deep attractor network to modify the encoded feature. Meanwhile, the number of speakers is counted by computing the Gerschgorin disks of the embedding vectors, which are orthogonal for different speakers. Finally, the modified encoded feature is inverted to the sound waveform using a linear decoder. Experimental evaluation on the GRID dataset shows that the proposed method with a single model can accurately estimate the number of speakers with 96.7\% probability of success, while achieving state-of-the-art separation results on multi-speaker mixtures in terms of scale-invariant signal-to-noise ratio improvement (SI-SNRi) and signal-to-distortion ratio improvement (SDRi).},
journal = {arXiv:2004.00175},
author = {Xiao, Yiming and Zhang, Haijian},
month = mar,
year = {2020},
keywords = {non-lu},
file = {arXiv Fulltext PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\T3I9ICN3\\Xiao et Zhang - 2020 - Improved Source Counting and Separation for Monaur.pdf:application/pdf;arXiv.org Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\G7A2QCMV\\2004.html:text/html},
}
@article{cohen_relative_2004,
title = {Relative transfer function identification using speech signals},
volume = {12},
issn = {1558-2353},
doi = {10.1109/TSA.2004.832975},
	abstract = {An important component of a multichannel hands-free communication system is the identification of the relative transfer function between sensors in response to a desired source signal. In this paper, a robust system identification approach adapted to speech signals is proposed. A weighted least-squares optimization criterion is introduced, which considers the uncertainty of the desired signal presence in the observed signals. An asymptotically unbiased estimate for the system's transfer function is derived, and a corresponding recursive online implementation is presented. We show that compared to a competing nonstationarity-based method, a smaller error variance is achieved and generally shorter observation intervals are required. Furthermore, in the case of a time-varying system, faster convergence and higher reliability of the system identification are obtained by using the proposed method than by using the nonstationarity-based method. Evaluation of the proposed system identification approach is performed under various noise conditions, including simulated stationary and nonstationary white Gaussian noise, and car interior noise in real pseudo-stationary and nonstationary environments. The experimental results confirm the advantages of the proposed approach.},
number = {5},
journal = {IEEE Transactions on Speech and Audio Processing},
author = {Cohen, Israel},
month = sep,
year = {2004},
keywords = {lu},
pages = {451--459},
file = {Cohen - 2004 - Relative transfer function identification using sp.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\TA6NM6ZQ\\Cohen - 2004 - Relative transfer function identification using sp.pdf:application/pdf},
}
@article{shalvi_system_1996,
title = {System identification using nonstationary signals},
volume = {44},
issn = {1941-0476},
doi = {10.1109/78.533725},
abstract = {The conventional method for identifying the transfer function of an unknown linear system consists of a least squares fit of its input to its output. It is equivalent to identifying the frequency response of the system by calculating the empirical cross-spectrum between the system's input and output, divided by the empirical auto-spectrum of the input process. However, if the additive noise at the system's output is correlated with the input process, e.g., in case of environmental noise that affects both system's input and output, the method may suffer from a severe bias effect. We present a modification of the cross-spectral method that exploits nonstationary features in the data in order to circumvent bias effects caused by correlated stationary noise. The proposed method is particularly attractive to problems of multichannel signal enhancement and noise cancellation, when the desired signal is nonstationary in nature, e.g., speech or image.},
number = {8},
journal = {IEEE Transactions on Signal Processing},
author = {Shalvi, Ofir and Weinstein, Ehud},
month = aug,
year = {1996},
pages = {2055--2063},
file = {Shalvi et Weinstein - 1996 - System identification using nonstationary signals.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\IKYG2JS6\\Shalvi et Weinstein - 1996 - System identification using nonstationary signals.pdf:application/pdf},
}
@inproceedings{kinoshita_tackling_2020,
title = {Tackling real noisy reverberant meetings with all-neural source separation, counting, and diarization system},
doi = {10.1109/ICASSP40776.2020.9054577},
	abstract = {Automatic meeting analysis is an essential fundamental technology required to let, e.g., smart devices follow and respond to our conversations. To achieve an optimal automatic meeting analysis, we previously proposed an all-neural approach that jointly solves source separation, speaker diarization and source counting problems in an optimal way (in the sense that all three tasks can be jointly optimized through error back-propagation). It was shown that the method could well handle simulated clean (noiseless and anechoic) dialog-like data, and achieved very good performance in comparison with several conventional methods. However, it was not clear whether such an all-neural approach would successfully generalize to more complicated real meeting data containing more spontaneously-speaking speakers, severe noise and reverberation, and how it performs in comparison with the state-of-the-art systems in such scenarios. In this paper, we first consider practical issues required for improving the robustness of the all-neural approach, and then experimentally show that, even in real meeting scenarios, the all-neural approach can perform effective speech enhancement, and simultaneously outperform state-of-the-art systems.},
booktitle = {{IEEE} {International} {Conference} on {Acoustics}, {Speech} and {Signal} {Processing}},
author = {Kinoshita, Keisuke and Delcroix, Marc and Araki, Shoko and Nakatani, Tomohiro},
month = may,
year = {2020},
keywords = {non-lu},
pages = {381--385},
file = {IEEE Xplore Abstract Record:C\:\\Users\\RQML4978\\Zotero\\storage\\MUAYGZ7T\\9054577.html:text/html;IEEE Xplore Full Text PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\2LVWW8LX\\Kinoshita et al. - 2020 - Tackling Real Noisy Reverberant Meetings with All-.pdf:application/pdf},
}
@article{chakrabarty_multi-speaker_2017,
title = {Multi-speaker localization using convolutional neural network trained with noise},
abstract = {The problem of multi-speaker localization is formulated as a multi-class multi-label classification problem, which is solved using a convolutional neural network (CNN) based source localization method. Utilizing the common assumption of disjoint speaker activities, we propose a novel method to train the CNN using synthesized noise signals. The proposed localization method is evaluated for two speakers and compared to a well-known steered response power method.},
journal = {arXiv:1712.04276},
author = {Chakrabarty, Soumitro and Habets, Emanuël A. P.},
year = {2017},
keywords = {lu},
file = {arXiv Fulltext PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\LYVS7JDU\\Chakrabarty et Habets - 2017 - Multi-Speaker Localization Using Convolutional Neu.pdf:application/pdf;arXiv.org Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\8KWB9XAU\\1712.html:text/html;Chakrabarty and Habets - Multi-Speaker Localization Using Convolutional Neu.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\B6R2UM6T\\Chakrabarty and Habets - Multi-Speaker Localization Using Convolutional Neu.pdf:application/pdf},
}
@book{zotter_ambisonics_2019,
title = {Ambisonics: a practical {3D} audio theory for recording, studio production, sound reinforcement, and virtual reality},
isbn = {978-3-030-17206-0 978-3-030-17207-7},
shorttitle = {Ambisonics},
language = {en},
urldate = {2020-05-02},
publisher = {Springer Nature},
author = {Zotter, Franz and Frank, Matthias},
year = {2019},
keywords = {non-lu},
file = {Zotter et Frank - 2019 - Ambisonics A Practical 3D Audio Theory for Record.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\8EBYJ9NF\\Zotter et Frank - 2019 - Ambisonics A Practical 3D Audio Theory for Record.pdf:application/pdf},
}
@inproceedings{sundar_raw_2020,
title = {Raw waveform based end-to-end deep convolutional network for spatial localization of multiple acoustic sources},
doi = {10.1109/ICASSP40776.2020.9054090},
abstract = {In this paper, we present an end-to-end deep convolutional neural network operating on multi-channel raw audio data to localize multiple simultaneously active acoustic sources in space. Previously reported deep learning based approaches work well in localizing a single source directly from multi-channel raw-audio, but are not easily extendable to localize multiple sources due to the well known permutation problem. We propose a novel encoding scheme to represent the spatial coordinates of multiple sources, which facilitates 2D localization of multiple sources in an end-to-end fashion, avoiding the permutation problem and achieving arbitrary spatial resolution. Experiments on a simulated data set and real recordings from the AV16.3 Corpus demonstrate that the proposed method generalizes well to unseen test conditions, and outperforms a recent time difference of arrival (TDOA) based multiple source localization approach reported in the literature.},
booktitle = {{IEEE} {International} {Conference} on {Acoustics}, {Speech} and {Signal} {Processing}},
author = {Sundar, Harshavardhan and Wang, Weiran and Sun, Ming and Wang, Chao},
month = may,
year = {2020},
keywords = {lu},
pages = {4642--4646},
file = {IEEE Xplore Abstract Record:C\:\\Users\\RQML4978\\Zotero\\storage\\H8RDS9U8\\9054090.html:text/html;IEEE Xplore Full Text PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\N5QJ3UVC\\Sundar et al. - 2020 - Raw Waveform Based End-to-end Deep Convolutional N.pdf:application/pdf},
}
@book{jarrett_theory_2017,
series = {Springer {Topics} in {Signal} {Processing}},
title = {Theory and applications of spherical microphone array processing},
volume = {9},
isbn = {978-3-319-42209-1},
abstract = {This book presents the signal processing algorithms that have been developed to process the signals acquired by a spherical microphone array. Spherical microphone arrays can be used to capture the sound field in three dimensions and have received significant interest from researchers and audio engineers. Algorithms for spherical array processing are different to corresponding algorithms already known in the literature of linear and planar arrays because the spherical geometry can be exploited to great beneficial effect. The authors aim to advance the field of spherical array processing by helping those new to the field to study it efficiently and from a single source, as well as by offering a way for more experienced researchers and engineers to consolidate their understanding, adding either or both of breadth and depth. The level of the presentation corresponds to graduate studies at MSc and PhD level. This book begins with a presentation of some of the essential mathematical and physical theory relevant to spherical microphone arrays, and of an acoustic impulse response simulation method, which can be used to comprehensively evaluate spherical array processing algorithms in reverberant environments. The chapter on acoustic parameter estimation describes the way in which useful descriptions of acoustic scenes can be parameterized, and the signal processing algorithms that can be used to estimate the parameter values using spherical microphone arrays. Subsequent chapters exploit these parameters including in particular measures of direction-of-arrival and of diffuseness of a sound field. The array processing algorithms are then classified into two main classes, each described in a separate chapter. These are signal-dependent and signal-independent beamforming algorithms. Although signal-dependent beamforming algorithms are in theory able to provide better performance compared to the signal-independent algorithms, they are currently rarely used in practice. The main reason for this is that the statistical information required by these algorithms is difficult to estimate. In a subsequent chapter it is shown how the estimated acoustic parameters can be used in the design of signal-dependent beamforming algorithms. This final step closes, at least in part, the gap between theory and practice.},
language = {en},
publisher = {Springer},
author = {Jarrett, Daniel P. and Habets, Emanuël A. P. and Naylor, Patrick A.},
year = {2017},
file = {Jarrett et al. - 2017 - Theory and Applications of Spherical Microphone Ar.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\WYMT73ZW\\Jarrett et al. - 2017 - Theory and Applications of Spherical Microphone Ar.pdf:application/pdf;Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\5FVYRLW2\\9783319422091.html:text/html},
}
@inproceedings{nguyen_autonomous_2018,
title = {Autonomous sensorimotor learning for sound source localization by a humanoid robot},
abstract = {We consider the problem of learning to localize a speech source using a humanoid robot equipped with a binaural hearing system. We aim to map binaural audio features into the relative angle between the robot’s head direction and the target source direction based on a sensorimotor training framework. To this end, we make the following contributions: (i) a procedure to automatically collect and label audio and motor data for sensorimotor training; (ii) the use of a convolutional neural network (CNN) trained with white noise signal and ground truth relative source direction. Experimental evaluation with speech signals shows that the CNN can localize the speech source even without an explicit algorithm for dealing with missing spectral features.},
language = {en},
booktitle = {Workshop on {Crossmodal} {Learning} for {Intelligent} {Robotics} in conjunction with {IEEE}/{RSJ} {IROS}},
author = {Nguyen, Quan and Girin, Laurent and Bailly, Gérard and Elisei, Frédéric and Nguyen, Duc-Canh},
year = {2018},
keywords = {lu},
file = {Nguyen et al. - Autonomous Sensorimotor Learning for Sound Source .pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\434BHW29\\Nguyen et al. - Autonomous Sensorimotor Learning for Sound Source .pdf:application/pdf},
}
@inproceedings{daniel_time_2020,
title = {Time domain velocity vector for retracing the multipath propagation},
isbn = {978-1-5090-6631-5},
doi = {10.1109/ICASSP40776.2020.9054561},
	abstract = {We propose a conceptually and computationally simple form of sound velocity that offers a readable view of the interference between direct and indirect sound waves. Unlike most approaches in the literature, it jointly exploits both active and reactive sound intensity measurements, as typically derived from a first order ambisonics recording. This representation has a potential both as a valuable tool for directly analyzing sound multipath propagation, as well as being a new spatial feature format for machine learning algorithms in audio and acoustics. As a showcase, we demonstrate that the Direction-of-Arrival and the range of a sound source can be estimated as a development of this approach. To the best knowledge of the authors, this is the first time that range is estimated from an ambisonics recording.},
language = {en},
urldate = {2020-05-25},
booktitle = {{IEEE} {International} {Conference} on {Acoustics}, {Speech} and {Signal} {Processing}},
author = {Daniel, Jerome and Kitic, Srdan},
year = {2020},
keywords = {lu},
pages = {421--425},
file = {Daniel et Kitic - 2020 - Time Domain Velocity Vector for Retracing the Mult.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\7RWFEP3N\\Daniel et Kitic - 2020 - Time Domain Velocity Vector for Retracing the Mult.pdf:application/pdf},
}
@article{gannot_signal_2001,
title = {Signal enhancement using beamforming and nonstationarity with applications to speech},
volume = {49},
issn = {1941-0476},
doi = {10.1109/78.934132},
abstract = {We consider a sensor array located in an enclosure, where arbitrary transfer functions (TFs) relate the source signal and the sensors. The array is used for enhancing a signal contaminated by interference. Constrained minimum power adaptive beamforming, which has been suggested by Frost (1972) and, in particular, the generalized sidelobe canceler (GSC) version, which has been developed by Griffiths and Jim (1982), are the most widely used beamforming techniques. These methods rely on the assumption that the received signals are simple delayed versions of the source signal. The good interference suppression attained under this assumption is severely impaired in complicated acoustic environments, where arbitrary TFs may be encountered. In this paper, we consider the arbitrary TF case. We propose a GSC solution, which is adapted to the general TF case. We derive a suboptimal algorithm that can be implemented by estimating the TFs ratios, instead of estimating the TFs. The TF ratios are estimated by exploiting the nonstationarity characteristics of the desired signal. The algorithm is applied to the problem of speech enhancement in a reverberating room. The discussion is supported by an experimental study using speech and noise signals recorded in an actual room acoustics environment.},
number = {8},
journal = {IEEE Transactions on Signal Processing},
author = {Gannot, Sharon and Burshtein, David and Weinstein, Ehud},
month = aug,
year = {2001},
pages = {1614--1626},
file = {gannot2001.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\QDZ29X4E\\gannot2001.pdf:application/pdf;IEEE Xplore Abstract Record:C\:\\Users\\RQML4978\\Zotero\\storage\\E7MV6WSX\\references.html:text/html;Version soumise:C\:\\Users\\RQML4978\\Zotero\\storage\\8S8AXFEY\\Gannot et al. - 2001 - Signal enhancement using beamforming and nonstatio.pdf:application/pdf},
}
@inproceedings{opochinsky_deep_2019,
title = {Deep ranking-based sound source localization},
isbn = {978-1-72811-123-0},
doi = {10.1109/WASPAA.2019.8937159},
abstract = {Sound source localization is a cumbersome task in challenging reverberation conditions. Recently, there is a growing interest in developing learning-based localization methods. In this approach, acoustic features are extracted from the measured signals and then given as input to a model that maps them to the corresponding source positions. Typically, a massive dataset of labeled samples from known positions is required to train such models.},
language = {en},
urldate = {2020-07-02},
booktitle = {{IEEE} {Workshop} on {Applications} of {Signal} {Processing} to {Audio} and {Acoustics}},
author = {Opochinsky, Renana and Laufer-Goldshtein, Bracha and Gannot, Sharon and Chechik, Gal},
year = {2019},
keywords = {lu},
pages = {283--287},
file = {Opochinsky et al. - 2019 - Deep Ranking-Based Sound Source Localization.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\K8NJHK4T\\Opochinsky et al. - 2019 - Deep Ranking-Based Sound Source Localization.pdf:application/pdf},
}
@inproceedings{wang_speaker_2020,
title = {Speaker counting model based on transfer learning from {SincNet} bottleneck layer},
doi = {10.1109/PerCom45495.2020.9127390},
	abstract = {People counting techniques have been widely researched recently and many different types of sensors can be used in this context. In this paper, we propose a system based on a deep-learning model able to identify the number of people in crowded scenarios through speech sound. In a nutshell, the system relies on two components: counting concurrent speakers in overlapping talking sound directly, and clustering single-speaker sound by speaker identity over time. Compared to previously proposed speaker-counting systems that only cluster single-speaker sound, this system is more accurate and less vulnerable to overlapping sound in a crowded environment. In addition, counting speakers in overlapping sound also gives the minimal number of speakers, so that it also improves the counting accuracy in a quiet environment. Our methodology is inspired by the newly proposed SincNet deep neural network framework, which proves to be outstanding and highly efficient in sound processing with raw signals. By transferring the bottleneck layer of the SincNet model as features fed to our speaker clustering model, we reached a noticeably better performance than previous models that rely on the use of MFCC and other engineered features.},
booktitle = {{IEEE} {International} {Conference} on {Pervasive} {Computing} and {Communications}},
author = {Wang, Wei and Seraj, Fatjon and Meratnia, Nirvana and Havinga, Paul J.M.},
month = mar,
year = {2020},
keywords = {non-lu},
pages = {1--8},
file = {IEEE Xplore Abstract Record:C\:\\Users\\RQML4978\\Zotero\\storage\\8UFGTFJC\\9127390.html:text/html;IEEE Xplore Full Text PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\NXDGL7KB\\Wang et al. - 2020 - Speaker Counting Model based on Transfer Learning .pdf:application/pdf},