@article{van1988beamforming,
title={Beamforming: A versatile approach to spatial filtering},
author={Van Veen, Barry D. and Buckley, Kevin M.},
journal={IEEE ASSP Magazine},
volume={5},
number={2},
pages={4--24},
year={1988},
}
@inproceedings{erdogan2016improved,
title={Improved MVDR beamforming using single-channel mask prediction networks},
author={Erdogan, Hakan and Hershey, John R. and Watanabe, Shinji and Mandel, Michael and Le Roux, Jonathan},
booktitle={Interspeech},
year={2016}
}
@article{gannot2017consolidated,
title={A consolidated perspective on multimicrophone speech enhancement and source separation},
author={Gannot, Sharon and Vincent, Emmanuel and Markovich-Golan, Shmulik and Ozerov, Alexey},
journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
volume={25},
number={4},
pages={692--730},
year={2017},
publisher={IEEE}
}
@inproceedings{heymann2016neural,
title={Neural network based spectral mask estimation for acoustic beamforming},
author={Heymann, Jahn and Drude, Lukas and Haeb-Umbach, Reinhold},
booktitle={IEEE International Conference on Acoustics, Speech and Signal Processing},
year={2016}
}
@inproceedings{higuchi2017deep,
title={Deep clustering-based beamforming for separation with unknown number of sources},
author={Higuchi, Takuya and Kinoshita, Keisuke and Delcroix, Marc and Zmolkova, Katerina and Nakatani, Tomohiro},
booktitle={Interspeech},
year={2017}
}
@inproceedings{leglaive2019semi,
title={Semi-supervised multichannel speech enhancement with variational autoencoders and non-negative matrix factorization},
author={Leglaive, Simon and Girin, Laurent and Horaud, Radu},
booktitle={IEEE International Conference on Acoustics, Speech and Signal Processing},
pages={101--105},
year={2019}
}
@inproceedings{li2017acoustic,
title={Acoustic modeling for Google Home},
author={Li, Bo and Sainath, Tara and Narayanan, Arun and Caroselli, Joe and Bacchiani, Michiel and Misra, Ananya and Shafran, Izhak and Sak, Hasim and Pundak, Golan and Chin, Kean and others},
booktitle={Interspeech},
year={2017}
}
@inproceedings{li2016neural,
title={Neural network adaptive beamforming for robust multichannel speech recognition},
author={Li, Bo and Sainath, Tara N. and Weiss, Ron J. and Wilson, Kevin W. and Bacchiani, Michiel},
booktitle={Interspeech},
year={2016}
}
@inproceedings{meng2017deep,
title={Deep long short-term memory adaptive beamforming networks for multichannel robust speech recognition},
author={Meng, Zhong and Watanabe, Shinji and Hershey, John R. and Erdogan, Hakan},
booktitle={IEEE International Conference on Acoustics, Speech and Signal Processing},
year={2017}
}
@article{nugraha2016multichannel,
title={Multichannel audio source separation with deep neural networks},
author={Nugraha, Aditya A. and Liutkus, Antoine and Vincent, Emmanuel},
journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
volume={24},
number={9},
pages={1652--1664},
year={2016}
}
@inproceedings{perotin2018multichannel,
title={Multichannel speech separation with recurrent neural networks from high-order Ambisonics recordings},
author={Perotin, Laur{\'e}line and Serizel, Romain and Vincent, Emmanuel and Gu{\'e}rin, Alexandre},
booktitle={IEEE International Conference on Acoustics, Speech and Signal Processing},
year={2018}
}
@article{wang2017supervised,
title={Supervised speech separation based on deep learning: an overview},
author={Wang, DeLiang and Chen, Jitong},
journal={arXiv:1708.07524},
year={2017}
}
@article{duong2010un,
title={Under-determined reverberant audio source separation using a full-rank spatial covariance model},
author={Duong, Ngoc QK and Vincent, Emmanuel and Gribonval, R{\'e}mi},
journal={IEEE Transactions on Audio, Speech, and Language Processing},
volume={18},
number={7},
pages={1830--1840},
year={2010}
}
@inproceedings{heymann2017beamnet,
title={Beamnet: End-to-end training of a beamformer-supported multi-channel ASR system},
author={Heymann, Jahn and Drude, Lukas and Boeddeker, Christoph and Hanebrink, Patrick and Haeb-Umbach, Reinhold},
booktitle={IEEE International Conference on Acoustics, Speech and Signal Processing},
year={2017}
}
@inproceedings{pasha_towards_2017,
address = {San Francisco, CA},
title = {Towards real-time source counting by estimation of coherent-to-diffuse ratios from ad-hoc microphone array recordings},
isbn = {978-1-5090-5925-6},
url = {http://ieeexplore.ieee.org/document/7895582/},
doi = {10.1109/HSCMA.2017.7895582},
abstract = {Coherent-to-diffuse ratio (CDR) estimates over short time frames are utilised for source counting using ad-hoc microphone arrays to record speech from multiple participants in scenarios such as a meeting. It is shown that the CDR estimates obtained at ad-hoc dual (two channel) microphone nodes, located at unknown locations within an unknown reverberant room, can detect time frames with more than one active source and are informative for source counting applications. Results show that interfering sources can be detected with accuracies ranging from 69\% to 89\% for delays ranging from 20 ms to 300 ms, with source counting accuracies ranged from 61\% to 81\% for two sources and the same range of delays.},
language = {en},
urldate = {2019-01-10},
booktitle = {Hands-free {Speech} {Communications} and {Microphone} {Arrays}},
publisher = {IEEE},
author = {Pasha, Shahab and Donley, Jacob and Ritz, Christian and Zou, Yue Xian},
year = {2017},
keywords = {non-lu},
pages = {161--165},
file = {Pasha et al. - 2017 - Towards real-time source counting by estimation of.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\3NWMK2DK\\Pasha et al. - 2017 - Towards real-time source counting by estimation of.pdf:application/pdf},
}
@book{jacobsen_fundamentals_2013,
title = {Fundamentals of general linear acoustics},
language = {en},
publisher = {John Wiley \& Sons},
author = {Jacobsen, Finn and Juhl, Peter Moller},
month = jul,
year = {2013},
keywords = {partiel-lu},
file = {Jacobsen - Fundamentals of General Linear Acoustics.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\NP2YZPLB\\Jacobsen - Fundamentals of General Linear Acoustics.pdf:application/pdf},
}
@phdthesis{baque_analyse_2017,
address = {Le Mans},
title = {Analyse de scène sonore multi-capteurs : un front-end temps-réel pour la manipulation de scène},
abstract = {La thèse s’inscrit dans un contexte d’essor de l’audio spatialisé (5.1, Dolby Atmos...). Parmi les formats audio 3D existants, l’ambisonie permet une représentation spatiale homogène du champ sonore et se prête naturellement à des manipulations : rotations, distorsion du champ sonore. L’objectif de cette thèse est de fournir un outil d’analyse et de manipulation de contenus audio (essentiellement vocaux) au format ambisonique. Un fonctionnement temps-réel et en conditions acoustiques réelles sont les principales contraintes à respecter. L’algorithme mis au point est basé sur une analyse en composantes indépendantes (ACI) appliquée trame à trame qui permet de décomposer le champ acoustique en un ensemble de contributions, correspondant à des sources (champ direct) ou à de la réverbération. Une étape de classification bayésienne, appliquée aux composantes extraites, permet alors l’identification et le dénombrement des sources sonores contenues dans le mélange. Les sources identifiées sont localisées grâce à la matrice de mélange obtenue par ACI, pour fournir une cartographie de la scène sonore. Une étude exhaustive des performances est menée sur des contenus réels en fonction de plusieurs paramètres : nombre de sources, environnement acoustique, longueur des trames, ou ordre ambisonique utilisé. Des résultats fiables en terme de localisation et de comptage de sources ont été obtenus pour des trames de quelques centaines de ms. L’algorithme, exploité comme prétraitement dans un prototype d’assistant vocal domestique, permet d’améliorer significativement les performances de reconnaissance, notamment en prise de son lointaine et en présence de sources interférentes.},
language = {fr},
urldate = {2018-10-18},
school = {Université du Maine},
author = {Baque, Mathieu},
year = {2017},
keywords = {partiel-lu},
file = {Baque - 2017 - Analyse de scène sonore multi-capteurs un front-.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\J9AR8IWT\\Baque - 2017 - Analyse de scène sonore multi-capteurs un front-.pdf:application/pdf;Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\Y4CMKVJG\\tel-01792433.html:text/html},
}
@article{williams_fourier_2000,
title = {Fourier acoustics: sound radiation and nearfield acoustical holography},
volume = {108},
issn = {0001-4966},
doi = {10.1121/1.1289662},
language = {en},
number = {4},
journal = {The Journal of the Acoustical Society of America},
author = {Williams, Earl G. and Mann, J. Adin},
month = oct,
year = {2000},
keywords = {partiel-lu},
pages = {1373--1373},
file = {Williams et Mann - 2000 - Fourier Acoustics Sound Radiation and Nearfield A.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\5QUVK8GF\\Williams et Mann - 2000 - Fourier Acoustics Sound Radiation and Nearfield A.pdf:application/pdf},
}
@article{rafaely_analysis_2005,
title = {Analysis and design of spherical microphone arrays},
volume = {13},
issn = {1063-6676},
doi = {10.1109/TSA.2004.839244},
abstract = {Spherical microphone arrays have been recently studied for sound-field recordings, beamforming, and sound-field analysis which use spherical harmonics in the design. Although the microphone arrays and the associated algorithms were presented, no comprehensive theoretical analysis of performance was provided. This paper presents a spherical-harmonics-based design and analysis framework for spherical microphone arrays. In particular, alternative spatial sampling schemes for the positioning of microphones on a sphere are presented, and the errors introduced by finite number of microphones, spatial aliasing, inaccuracies in microphone positioning, and measurement noise are investigated both theoretically and by using simulations. The analysis framework can also provide a useful guide for the design and analysis of more general spherical microphone arrays which do not use spherical harmonics explicitly.},
language = {en},
number = {1},
journal = {IEEE Transactions on Speech and Audio Processing},
author = {Rafaely, B.},
month = jan,
year = {2005},
keywords = {partiel-lu},
pages = {135--143},
file = {Rafaely - 2005 - Analysis and design of spherical microphone arrays.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\KIEPWA8Z\\Rafaely - 2005 - Analysis and design of spherical microphone arrays.pdf:application/pdf},
}
@book{vincent_audio_2018,
title = {Audio source separation and speech enhancement},
publisher = {John Wiley \& Sons},
author = {Vincent, Emmanuel and Virtanen, Tuomas and Gannot, Sharon},
year = {2018},
keywords = {non-lu},
file = {Vincent et al. - 2018 - Audio Source Separation and Speech Enhancement.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\9Y4MT6VY\\Vincent et al. - 2018 - Audio Source Separation and Speech Enhancement.pdf:application/pdf},
}
@phdthesis{merimaa_analysis_2006,
address = {Helsinki},
title = {Analysis, synthesis, and perception of spatial sound - binaural localization modeling and multichannel loudspeaker reproduction},
abstract = {In everyday audio environments, sound from several sources arrives at a listening position both directly from the sources and as reflections from the acoustical environment. This thesis deals, within some limitations, with analysis of the resulting spatial sound field, reproduction of perceptually relevant features of the sound as measured in a chosen listening position, as well as with modeling of the related auditory localization.},
language = {en},
school = {Helsinki University of Technology},
author = {Merimaa, Juha},
year = {2006},
keywords = {partiel-lu},
file = {Merimaa - 2006 - Analysis, Synthesis, and Perception of Spatial Sou.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\AMGGKJ2K\\Merimaa - 2006 - Analysis, Synthesis, and Perception of Spatial Sou.pdf:application/pdf},
}
@phdthesis{daniel_representation_2001,
address = {Paris},
title = {Représentation de champs acoustiques, application à la transmission et à la reproduction de scènes sonores complexes dans un contexte multimédia},
language = {French},
school = {Paris VI},
author = {Daniel, Jérôme},
year = {2001},
keywords = {partiel-lu},
file = {Daniel - 2001 - Représentation de champs acoustiques, application .pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\WT68JQ6H\\Daniel - 2001 - Représentation de champs acoustiques, application .pdf:application/pdf},
}
@article{kingma_adam:_2014,
title = {Adam: {A} {Method} for {Stochastic} {Optimization}},
shorttitle = {Adam},
abstract = {We introduce Adam, an algorithm for first-order gradient-based optimization of stochastic objective functions, based on adaptive estimates of lower-order moments. The method is straightforward to implement, is computationally efficient, has little memory requirements, is invariant to diagonal rescaling of the gradients, and is well suited for problems that are large in terms of data and/or parameters. The method is also appropriate for non-stationary objectives and problems with very noisy and/or sparse gradients. The hyper-parameters have intuitive interpretations and typically require little tuning. Some connections to related algorithms, on which Adam was inspired, are discussed. We also analyze the theoretical convergence properties of the algorithm and provide a regret bound on the convergence rate that is comparable to the best known results under the online convex optimization framework. Empirical results demonstrate that Adam works well in practice and compares favorably to other stochastic optimization methods. Finally, we discuss AdaMax, a variant of Adam based on the infinity norm.},
language = {en},
urldate = {2019-04-11},
journal = {arXiv:1412.6980},
author = {Kingma, Diederik P. and Ba, Jimmy},
year = {2014},
keywords = {Computer Science - Machine Learning},
file = {Kingma et Ba - 2014 - Adam A Method for Stochastic Optimization.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\AM7Q3XPX\\Kingma et Ba - 2014 - Adam A Method for Stochastic Optimization.pdf:application/pdf},
}
@inproceedings{vecchiotti_end--end_2019,
title = {End-to-end binaural sound localisation from the raw waveform},
abstract = {A novel end-to-end binaural sound localisation approach is proposed which estimates the azimuth of a sound source directly from the waveform. Instead of employing hand-crafted features commonly employed for binaural sound localisation, such as the interaural time and level difference, our end-to-end system approach uses a convolutional neural network (CNN) to extract specific features from the waveform that are suitable for localisation. Two systems are proposed which differ in the initial frequency analysis stage. The first system is auditory-inspired and makes use of a gammatone filtering layer, while the second system is fully data-driven and exploits a trainable convolutional layer to perform frequency analysis. In both systems, a set of dedicated convolutional kernels are then employed to search for specific localisation cues, which are coupled with a localisation stage using fully connected layers. Localisation experiments using binaural simulation in both anechoic and reverberant environments show that the proposed systems outperform a state-ofthe-art deep neural network system. Furthermore, our investigation of the frequency analysis stage in the second system suggests that the CNN is able to exploit different frequency bands for localisation according to the characteristics of the reverberant environment.},
language = {en},
booktitle = {{IEEE} {International} {Conference} on {Acoustics}, {Speech} and {Signal} {Processing}},
author = {Vecchiotti, Paolo and Ma, Ning and Squartini, Stefano and Brown, Guy J.},
year = {2019},
keywords = {Computer Science - Sound, Electrical Engineering and Systems Science - Audio and Speech Processing, lu},
pages = {451--455},
file = {Vecchiotti et al. - 2019 - End-to-end Binaural Sound Localisation from the Ra.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\YF5WBJEN\\Vecchiotti et al. - 2019 - End-to-end Binaural Sound Localisation from the Ra.pdf:application/pdf},
}
@article{stoter_countnet:_2019,
title = {{CountNet}: estimating the number of concurrent speakers using supervised learning},
volume = {27},
issn = {2329-9290, 2329-9304},
shorttitle = {{CountNet}},
doi = {10.1109/TASLP.2018.2877892},
abstract = {Estimating the maximum number of concurrent speakers from single-channel mixtures is a challenging problem and an essential first step to address various audio-based tasks such as blind source separation, speaker diarization, and audio surveillance. We propose a unifying probabilistic paradigm, where deep neural network architectures are used to infer output posterior distributions. These probabilities are in turn processed to yield discrete point estimates. Designing such architectures often involves two important and complementary aspects that we investigate and discuss. First, we study how recent advances in deep architectures may be exploited for the task of speaker count estimation. In particular, we show that convolutional recurrent neural networks outperform recurrent networks used in a previous study when adequate input features are used. Even for short segments of speech mixtures, we can estimate up to five speakers, with a significantly lower error than other methods. Second, through comprehensive evaluation, we compare the best-performing method to several baselines, as well as the influence of gain variations, different data sets, and reverberation. The output of our proposed method is compared to human performance. Finally, we give insights into the strategy used by our proposed method.},
language = {en},
number = {2},
urldate = {2019-01-15},
journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
author = {St\"{o}ter, Fabian-Robert and Chakrabarty, Soumitro and Edler, Bernd and Habets, Emanuel A. P.},
year = {2019},
keywords = {lu},
pages = {268--282},
file = {Stoter et al. - 2019 - CountNet Estimating the Number of Concurrent Spea.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\AZLNM27I\\Stoter et al. - 2019 - CountNet Estimating the Number of Concurrent Spea.pdf:application/pdf},
}
@inproceedings{comminiello_quaternion_2019,
title = {Quaternion convolutional neural networks for detection and localization of {3D} sound events},
abstract = {Learning from data in the quaternion domain enables us to exploit internal dependencies of 4D signals and treating them as a single entity. One of the models that perfectly suits with quaternion-valued data processing is represented by 3D acoustic signals in their spherical harmonics decomposition. In this paper, we address the problem of localizing and detecting sound events in the spatial sound field by using quaternion-valued data processing. In particular, we consider the spherical harmonic components of the signals captured by a first-order ambisonic microphone and process them by using a quaternion convolutional neural network. Experimental results show that the proposed approach exploits the correlated nature of the ambisonic signals, thus improving accuracy results in 3D sound event detection and localization.},
language = {en},
urldate = {2018-12-18},
booktitle = {{IEEE} {International} {Conference} on {Acoustics}, {Speech} and {Signal} {Processing}},
author = {Comminiello, Danilo and Lella, Marco and Scardapane, Simone and Uncini, Aurelio},
year = {2019},
keywords = {lu},
file = {Comminiello et al. - 2018 - Quaternion Convolutional Neural Networks for Detec.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\LQEXE6IA\\Comminiello et al. - 2018 - Quaternion Convolutional Neural Networks for Detec.pdf:application/pdf},
}
@inproceedings{yang_multiple_2017,
title = {Multiple sound source counting and localization based on spatial principal eigenvector},
doi = {10.21437/Interspeech.2017-940},
abstract = {Multiple sound source localization remains a challenging issue due to the interaction between sources. Although traditional approaches can locate multiple sources effectively, most of them require the number of sound sources as a priori knowledge. However, the number of sound sources is generally unknown in practical applications. To overcome this problem, a spatial principal eigenvector based approach is proposed to estimate the number and the direction of arrivals (DOAs) of multiple speech sources. Firstly, a time-frequency (TF) bin weighting scheme is utilized to select the TF bins dominated by single source. Then, for these selected bins, the spatial principal eigenvectors are extracted to construct a contribution function which is used to simultaneously estimate the number of sources and corresponding coarse DOAs. Finally, the coarse DOA estimations are refined by iteratively optimizing the assignment of selected TF bins to each source. Experimental results validate that the proposed approach yields favorable performance for multiple sound source counting and localization in the environment with different levels of noise and reverberation.},
language = {en},
urldate = {2019-01-10},
booktitle = {Interspeech},
publisher = {ISCA},
author = {Yang, Bing and Liu, Hong and Pang, Cheng},
year = {2017},
keywords = {lu},
pages = {1924--1928},
file = {Yang et al. - 2017 - Multiple Sound Source Counting and Localization Ba.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\A5KNKDQW\\Yang et al. - 2017 - Multiple Sound Source Counting and Localization Ba.pdf:application/pdf},
}
@inproceedings{arai_estimating_2003,
title = {Estimating number of speakers by the modulation characteristics of speech},
volume = {2},
isbn = {978-0-7803-7663-2},
doi = {10.1109/ICASSP.2003.1202328},
abstract = {A method for estimating number of speakers of mixed speech signals was proposed. The algorithm was based on the modulation characteristics of speech, specifically that a single speech utterance typically has a distinct modulation pattern with a peak around 4-5 Hz. Having observed that the modulation peak decreases as number of speakers increases, our estimation algorithm used the region of the modulation frequency between 2 and 8 Hz. We obtained a novel parameter we called “equivalent number of speakers” to estimate the number of simultaneous speakers when speech signals contain multiple speakers.},
language = {en},
booktitle = {{IEEE} {International} {Conference} on {Acoustics}, {Speech}, and {Signal} {Processing}},
author = {Arai, Takayuki},
year = {2003},
keywords = {lu},
pages = {197--200},
file = {Arai - 2003 - Estimating number of speakers by the modulation ch.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\85VA3DGU\\Arai - 2003 - Estimating number of speakers by the modulation ch.pdf:application/pdf},
}
@techreport{habets_room_2006,
title = {Room impulse response generator},
institution = {Technische Universiteit Eindhoven},
author = {Habets, Emanuel A. P.},
year = {2006},
file = {rir_generator.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\U6W3GTW5\\rir_generator.pdf:application/pdf},
}
@article{schmidt_multiple_1986,
title = {Multiple emitter location and signal parameter estimation},
volume = {34},
issn = {0018-926X},
doi = {10.1109/TAP.1986.1143830},
abstract = {Processing the signals received on an array of sensors for the location of the emitter is of great enough interest to have been treated under many special case assumptions. The general problem considers sensors with arbitrary locations and arbitrary directional characteristics (gain/phase/polarization) in a noise/interference environment of arbitrary covariance matrix. This report is concerned first with the multiple emitter aspect of this problem and second with the generality of solution. A description is given of the multiple signal classification (MUSIC) algorithm, which provides asymptotically unbiased estimates of 1) number of incident wavefronts present; 2) directions of arrival (DOA) (or emitter locations); 3) strengths and cross correlations among the incident waveforms; 4) noise/interference strength. Examples and comparisons with methods based on maximum likelihood (ML) and maximum entropy (ME), as well as conventional beamforming are included. An example of its use as a multiple frequency estimator operating on time series is included.},
number = {3},
journal = {IEEE Transactions on Antennas and Propagation},
author = {Schmidt, Ralph},
month = mar,
year = {1986},
keywords = {Adaptive arrays, Direction of arrival estimation, Direction-of-arrival estimation, Frequency estimation, Interference, Multiple signal classification, non-lu, Parameter estimation, Polarization, Sensor arrays, Sensor phenomena and characterization, Signal processing, Signal processing antennas, Working environment noise},
pages = {276--280},
file = {IEEE Xplore Abstract Record:C\:\\Users\\RQML4978\\Zotero\\storage\\K2W5KTQI\\1143830.html:text/html;IEEE Xplore Full Text PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\YZJPU4U2\\Schmidt - 1986 - Multiple emitter location and signal parameter est.pdf:application/pdf},
}
@article{hou_squared_2016,
title = {Squared earth mover's distance-based loss for training deep neural networks},
abstract = {In the context of single-label classification, despite the huge success of deep learning, the commonly used crossentropy loss function ignores the intricate inter-class relationships that often exist in real-life tasks such as age classification. In this work, we propose to leverage these relationships between classes by training deep nets with the exact squared Earth Mover’s Distance (also known as Wasserstein distance) for single-label classification. The EMD2 loss uses the predicted probabilities of all classes and penalizes the miss-predictions according to a ground distance matrix that quantifies the dissimilarities between classes. We demonstrate that on datasets with strong inter-class relationships such as an ordering between classes, our exact EMD2 losses yield new state-of-the-art results. Furthermore, we propose a method to automatically learn this matrix using the CNN’s own features during training. We show that our method can learn a ground distance matrix efficiently with no inter-class relationship priors and yield the same performance gain. Finally, we show that our method can be generalized to applications that lack strong interclass relationships and still maintain state-of-the-art performance. Therefore, with limited computational overhead, one can always deploy the proposed loss function on any dataset over the conventional cross-entropy.},
language = {en},
journal = {arXiv:1611.05916},
author = {Hou, Le and Yu, Chen-Ping and Samaras, Dimitris},
month = nov,
year = {2016},
keywords = {lu},
file = {Hou et al. - 2016 - Squared Earth Mover's Distance-based Loss for Trai.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\93ZHVN7F\\Hou et al. - 2016 - Squared Earth Mover's Distance-based Loss for Trai.pdf:application/pdf},
}
@article{knapp_generalized_1976,
title = {The generalized correlation method for estimation of time delay},
volume = {24},
issn = {0096-3518},
doi = {10.1109/TASSP.1976.1162830},
abstract = {A maximum likelihood (ML) estimator is developed for determining time delay between signals received at two spatially separated sensors in the presence of uncorrelated noise. This ML estimator can be realized as a pair of receiver prefilters followed by a cross correlator. The time argument at which the correlator achieves a maximum is the delay estimate. The ML estimator is compared with several other proposed processors of similar form. Under certain conditions the ML estimator is shown to be identical to one proposed by Hannan and Thomson [10] and MacDonald and Schultheiss [21]. Qualitatively, the role of the prefilters is to accentuate the signal passed to the correlator at frequencies for which the signal-to-noise (S/N) ratio is highest and, simultaneously, to suppress the noise power. The same type of prefiltering is provided by the generalized Eckart filter, which maximizes the S/N ratio of the correlator output. For low S/N ratio, the ML estimator is shown to be equivalent to Eckart prefiltering.},
number = {4},
journal = {IEEE Transactions on Acoustics, Speech, and Signal Processing},
author = {Knapp, Charles and Carter, G. Clifford},
month = aug,
year = {1976},
keywords = {non-lu},
pages = {320--327},
file = {IEEE Xplore Abstract Record:C\:\\Users\\RQML4978\\Zotero\\storage\\XWS8HP6Q\\1162830.html:text/html;IEEE Xplore Full Text PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\RFTA2NH4\\Knapp and Carter - 1976 - The generalized correlation method for estimation .pdf:application/pdf},
}
@article{jacobsen_note_1991,
title = {A note on instantaneous and time-averaged active and reactive sound intensity},
volume = {147},
issn = {0022460X},
doi = {10.1016/0022-460X(91)90496-7},
language = {en},
number = {3},
urldate = {2018-10-09},
journal = {Journal of Sound and Vibration},
author = {Jacobsen, Finn},
month = jun,
year = {1991},
keywords = {lu},
pages = {489--496},
file = {Jacobsen - 1991 - A note on instantaneous and time-averaged active a.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\J4YL3SVR\\Jacobsen - 1991 - A note on instantaneous and time-averaged active a.pdf:application/pdf},
}
@inproceedings{he_joint_2018,
title = {Joint localization and classification of multiple sound sources using a multi-task neural network},
abstract = {We propose a novel multi-task neural network-based approach for joint sound source localization and speech/non-speech classification in noisy environments. The network takes raw short time Fourier transform as input and outputs the likelihood values for the two tasks, which are used for the simultaneous detection, localization and classification of an unknown number of overlapping sound sources, Tested with real recorded data, our method achieves significantly better performance in terms of speech/non-speech classification and localization of speech sources, compared to method that performs localization and classification separately. In addition, we demonstrate that incorporating the temporal context can further improve the performance.},
language = {en},
urldate = {2018-11-22},
booktitle = {Interspeech},
author = {He, Weipeng and Motlicek, Petr and Odobez, Jean-Marc},
year = {2018},
keywords = {lu},
pages = {312--316},
file = {He et al. - 2018 - Joint Localization and Classification of Multiple .pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\Z8KXJUSB\\He et al. - 2018 - Joint Localization and Classification of Multiple .pdf:application/pdf;He et al. - 2018 - Joint Localization and Classification of Multiple .pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\SYBZYA4I\\He et al. - 2018 - Joint Localization and Classification of Multiple .pdf:application/pdf},
}
@phdthesis{moreau_etude_2006,
address = {Le Mans},
title = {Étude et réalisation d’outils avancés d’encodage spatial pour la technique de spatialisation sonore {Higher} {Order} {Ambisonics} : microphone {3D} et contrôle de distance},
language = {fr},
school = {Université du Maine},
author = {Moreau, Sébastien},
year = {2006},
keywords = {partiel-lu},
file = {Library Catalog Entry Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\UVS59NXC\\SRCH.html:text/html;Moreau - Étude et réalisation d’outils avancés d’encodage s.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\KDITCCBI\\Moreau - Étude et réalisation d’outils avancés d’encodage s.pdf:application/pdf},
}
@inproceedings{li_online_2018,
title = {Online direction of arrival estimation based on deep learning},
isbn = {978-1-5386-4658-8},
doi = {10.1109/ICASSP.2018.8461386},
abstract = {Direction of arrival (DOA) estimation is an important topic in microphone array processing. Conventional methods work well in relatively clean conditions but suffer from noise and reverberation distortions. Recently, deep learning-based methods show the robustness to noise and reverberation. However, the performance is degraded rapidly or even model cannot work when microphone array structure changes. So it has to retrain the model with new data, which is a huge work. In this paper, we propose a supervised learning algorithm for DOA estimation combining convolutional neural network (CNN) and long short term memory (LSTM). Experimental results show that the proposed method can improve the accuracy significantly. In addition, due to an input feature design, the proposed method can adapt to a new microphone array conveniently only use a very small amount of data.},
language = {en},
booktitle = {{IEEE} {International} {Conference} on {Acoustics}, {Speech} and {Signal} {Processing}},
author = {Li, Qinglong and Zhang, Xueliang and Li, Hao},
month = apr,
year = {2018},
keywords = {lu},
pages = {2616--2620},
file = {Li et al. - 2018 - Online Direction of Arrival Estimation Based on De.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\E6GCSTTT\\Li et al. - 2018 - Online Direction of Arrival Estimation Based on De.pdf:application/pdf},
}
@inproceedings{xiao_learning-based_2015,
title = {A learning-based approach to direction of arrival estimation in noisy and reverberant environments},
doi = {10.1109/ICASSP.2015.7178484},
abstract = {This paper presents a learning-based approach to the task of direction of arrival estimation (DOA) from microphone array input. Traditional signal processing methods such as the classic least square (LS) method rely on strong assumptions on signal models and accurate estimations of time delay of arrival (TDOA) . They only work well in relatively clean conditions, but suffer from noise and reverberation distortions. In this paper, we propose a learning-based approach that can learn from a large amount of simulated noisy and reverberant microphone array inputs for robust DOA estimation. Specifically, we extract features from the generalised cross correlation (GCC) vectors and use a multilayer perceptron neural network to learn the nonlinear mapping from such features to the DOA. One advantage of the learning based method is that as more and more training data becomes available, the DOA estimation will become more and more accurate. Experimental results on simulated data show that the proposed learning based method produces much better results than the state-of-the-art LS method. The testing results on real data recorded in meeting rooms show improved root-mean-square error (RMSE) compared to the LS method.},
booktitle = {{IEEE} {International} {Conference} on {Acoustics}, {Speech} and {Signal} {Processing}},
author = {Xiao, Xiong and Zhao, Shengkui and Zhong, Xionghu and Jones, Douglas L. and Chng, Eng S. and Li, Haizhou},
year = {2015},
keywords = {lu},
pages = {2814--2818},
file = {IEEE Xplore Abstract Record:C\:\\Users\\RQML4978\\Zotero\\storage\\BP2U7Q7A\\7178484.html:text/html;IEEE Xplore Full Text PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\DMFDRYEJ\\Xiao et al. - 2015 - A learning-based approach to direction of arrival .pdf:application/pdf},
}
@inproceedings{nicol_sound_2010,
title = {Sound spatialization by higher order {Ambisonics}: encoding and decoding a sound scene in practice from a theoretical point of view},
abstract = {An overview of HOA technology is presented. First, HOA defines a format of spatial audio which has many attractive properties, such as scalability and flexibility. Besides, this format is independent of the encoding (i.e. microphone signals) and decoding (i.e. loudspeaker signals) formats. Second, HOA provides tools to record, or create, and render a spatial sound scene. These tools, which rely on a specific encoding and decoding of spatial information, will be analysed and discussed from a both theoretical and practical point of view. Third, the final issue is the assessment of the virtual sound scene that is (re)created by HOA. The toolkit of available methodologies and criteria is examined.},
language = {en},
booktitle = {International {Symposium} on {Ambisonics} and {Spherical} {Acoustics}},
author = {Nicol, Rozenn},
year = {2010},
keywords = {non-lu},
pages = {9},
file = {Nicol - 2010 - Orange Labs TECHOPERATPS 2 Avenue Pierre Marzin,.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\EHFBR5TG\\Nicol - 2010 - Orange Labs TECHOPERATPS 2 Avenue Pierre Marzin,.pdf:application/pdf},
}
@inproceedings{kitic_tramp:_2018,
title = {{TRAMP}: {TRacking} by a realtime {AMbisonic}-based {Particle} filter},
abstract = {This article presents a multiple sound source localization and tracking system, fed by the Eigenmike array. The First Order Ambisonics (FOA) format is used to build a pseudointensity-based spherical histogram, from which the source position estimates are deduced. These instantaneous estimates are processed by a well-known tracking system relying on a set of particle filters. While the novelty within localization and tracking is incremental, the fully-functional, complete and real-time running system based on these algorithms is proposed for the first time. As such, it could serve as an additional baseline method of the LOCATA challenge.},
language = {en},
booktitle = {{LOCATA} {Challenge} {Workshop}},
author = {Kitic, Srdan and Guérin, Alexandre},
year = {2018},
keywords = {lu},
file = {Guérin - 2018 - Orange Labs 4 Rue du Clos Courtel 35510 Cesson-Sév.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\V85SDXXK\\Guérin - 2018 - Orange Labs 4 Rue du Clos Courtel 35510 Cesson-Sév.pdf:application/pdf},
}
@inproceedings{vesperini_neural_2016,
title = {A neural network based algorithm for speaker localization in a multi-room environment},
doi = {10.1109/MLSP.2016.7738817},
abstract = {A Speaker Localization algorithm based on Neural Networks for multi-room domestic scenarios is proposed in this paper. The approach is fully data-driven and employs a Neural Network fed by GCC-PHAT (Generalized Cross Correlation Phase Transform) Patterns, calculated by means of the microphone signals, to determine the speaker position in the room under analysis. In particular, we deal with a multi-room case study, in which the acoustic scene of each room is influenced by sounds emitted in the other rooms. The algorithm is tested against the home recorded DIRHA dataset, characterized by multiple wall and ceiling microphone signals for each room. In particular, we focused on the speaker localization problem in two distinct neighbouring rooms. We assumed the presence of an Oracle multi-room Voice Activity Detector (VAD) in our experiments. A three-stage optimization procedure has been adopted to find the best network configuration and GCC-PHAT Patterns combination. Moreover, an algorithm based on Time Difference of Arrival (TDOA), recently proposed in literature for the addressed applicative context, has been considered as term of comparison. As result, the proposed algorithm outperforms the reference one, providing an average localization error, expressed in terms of RMSE, equal to 525 mm against 1465 mm. Concluding, we also assessed the algorithm performance when a real VAD, recently proposed by some of the authors, is used. Even though a degradation of localization capability is registered (an average RMSE equal to 770 mm), still a remarkable improvement with respect to the state of the art performance is obtained.},
booktitle = {{IEEE} {International} {Workshop} on {Machine} {Learning} for {Signal} {Processing}},
author = {Vesperini, Fabio and Vecchiotti, Paolo and Principi, Emanuele and Squartini, Stefano and Piazza, Francesco},
year = {2016},
keywords = {lu},
pages = {1--6},
file = {IEEE Xplore Abstract Record:C\:\\Users\\RQML4978\\Zotero\\storage\\WWKEZENN\\7738817.html:text/html;IEEE Xplore Full Text PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\MJVSURQC\\Vesperini et al. - 2016 - A neural network based algorithm for speaker local.pdf:application/pdf},
}
@inproceedings{xu_crowd++:_2013,
title = {Crowd++: unsupervised speaker count with smartphones},
abstract = {Smartphones are excellent mobile sensing platforms, with the microphone in particular being exercised in several audio inference applications. We take smartphone audio inference a step further and demonstrate for the first time that it’s possible to accurately estimate the number of people talking in a certain place – with an average error distance of 1.5 speakers – through unsupervised machine learning analysis on audio segments captured by the smartphones. Inference occurs transparently to the user and no human intervention is needed to derive the classification model. Our results are based on the design, implementation, and evaluation of a system called Crowd++, involving 120 participants in 10 very different environments. We show that no dedicated external hardware or cumbersome supervised learning approaches are needed but only off-the-shelf smartphones used in a transparent manner. We believe our findings have profound implications in many research fields, including social sensing and personal wellbeing assessment.},
language = {en},
booktitle = {{ACM} {International} {Joint} {Conference} on {Pervasive} and {Ubiquitous} {Computing}},
author = {Xu, Chenren and Li, Sugang and Liu, Gang and Zhang, Yanyong and Miluzzo, Emiliano and Chen, Yih-Farn and Li, Jun and Firner, Bernhard},
year = {2013},
keywords = {lu},
pages = {43--52},
file = {Xu et al. - Crowd++ Unsupervised Speaker Count with Smartphon.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\H3LEJAQU\\Xu et al. - Crowd++ Unsupervised Speaker Count with Smartphon.pdf:application/pdf},
}
@inproceedings{larnel_bref_1991,
title = {{BREF}, a large vocabulary spoken corpus for {French}},
abstract = {This paper presents some of the design considerations of BREF, a large read-speech corpus for French. BREF was designed to provide continuous speech data for the development of dictation machines, for the evaluation of continuous speech recognition systems (both speaker-dependent and speakerindependent), and for the study of phonological variations. The texts to be read were selected from 5 million words of the French newspaper, Le Monde. In total, 11,000 texts were selected, with selection criteria that emphasisized maximizing the number of distinct triphones. Separate text materials were selected for training and test corpora. Ninety speakers have been recorded, each providing between 5,000 and 10,000 words (approximately 40-70 min.) of speech.},
booktitle = {Eurospeech},
author = {Lamel, Lori F. and Gauvain, Jean-Luc and Eskénazi, Maxine},
year = {1991},
keywords = {Expect, Selection (user interface), Speech corpus, Speech recognition, Text corpus, Time-compressed speech, Triphone, Vocabulary},
file = {Full Text PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\2NKUZWBU\\Larnel et al. - 1991 - BREF, a large vocabulary spoken corpus for French.pdf:application/pdf},
}
@inproceedings{wei_determining_2018,
title = {Determining number of speakers from single microphone speech signals by multi-label convolutional neural network},
doi = {10.1109/IECON.2018.8592773},
abstract = {This paper presents a multi-label convolutional neural network approach to determine the number of speakers when using a single microphone which is more challenging than when using multiple microphones. Spectrograms of windowed noisy speech signals for 1talker, 2talkers and 3+talkers are used as inputs to a multi-label convolutional neural network. The architecture of the developed multi-label convolutional neural network is discussed and it is shown that this network with median filtering can achieve an overall accuracy of about 81\% for the noisy speech dataset examined.},
booktitle = {Annual {Conference} of the {IEEE} {Industrial} {Electronics} {Society}},
author = {Wei, Haoran and Kehtarnavaz, Nasser},
month = oct,
year = {2018},
keywords = {Acoustics, Conferences, convolutional neural nets, Convolutional neural networks, determining number of speakers, Filtering, lu, median filtering, median filters, microphones, Microphones, multi-label convolutional neural network, multilabel convolutional neural network approach, noisy speech signals, single microphone speech signals, Spectrogram, speech enhancement, speech processing, Speech processing},
pages = {2706--2710},
file = {[email protected]:C\:\\Users\\RQML4978\\Zotero\\storage\\7E2J7JRY\\[email protected]:application/pdf},
}
@inproceedings{stoter_classification_2018,
title = {Classification vs. regression in supervised learning for single channel speaker count estimation},
doi = {10.1109/ICASSP.2018.8462159},
abstract = {The task of estimating the maximum number of concurrent speakers from single channel mixtures is important for various audio-based applications, such as blind source separation, speaker diarisation, audio surveillance or auditory scene classification. Building upon powerful machine learning methodology, we develop a Deep Neural Network (DNN) that estimates a speaker count. While DNNs efficiently map input representations to output targets, it remains unclear how to best handle the network output to infer integer source count estimates, as a discrete count estimate can either be tackled as a regression or a classification problem. In this paper, we investigate this important design decision and also address complementary parameter choices such as the input representation. We evaluate a state-of-the-art DNN audio model based on a Bi-directional Long Short-Term Memory network architecture for speaker count estimations. Through experimental evaluations aimed at identifying the best overall strategy for the task and show results for five seconds speech segments in mixtures of up to ten speakers.},
booktitle = {{IEEE} {International} {Conference} on {Acoustics}, {Speech} and {Signal} {Processing}},
author = {St\"{o}ter, Fabian-Robert and Chakrabarty, Soumitro and Edler, Bernd and Habets, Emanuel A. P.},
year = {2018},
keywords = {acoustic signal processing, audio based applications, audio signal processing, audio surveillance, auditory scene classification, bidirectional long short term memory network architecture, blind source separation, channel estimation, Channel estimation, classification problem, cocktail-party, Computer architecture, concurrent speakers, deep neural network, design decision, discrete count estimate, DNN audio model, Estimation, integer source count estimates, learning (artificial intelligence), lu, Machine learning, map input representations, maximum number, network output, neural nets, Neural networks, number of concurrent speakers, output targets, overlapped speech, pattern classification, powerful machine learning methodology, regression analysis, regression problem, single channel mixtures, single channel speaker count estimation, speaker count estimation, speaker diarisation, speaker recognition, speech segments, supervised learning, Task analysis, Training},
pages = {436--440},
file = {arXiv Fulltext PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\4BL2RGGJ\\Stöter et al. - 2018 - Classification vs. Regression in Supervised Learni.pdf:application/pdf;arXiv.org Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\E2NLA3V7\\1712.html:text/html},
}
@article{sayoud_proposal_2010,
title = {Proposal of a new confidence parameter estimating the number of speakers - an experimental investigation},
volume = {1},
abstract = {Abstract. Is it possible to know how many speakers are speaking simultaneously in case of speech overlap? If the human brain, creation not yet mastered, manages to do it and even to understand the mixed speech meaning, it is not yet the case for the existing systems of automatic speaker recognition. In practice, these systems present a strong degradation in such situations. For this task, we propose a new method able to estimate the number of speakers in a mixture of speech signals. The algorithm developed here is based on the computation of the statistical characteristic of the 7th Mel coefficient extracted by spectral analysis from the speech signal. This algorithm using a confidence parameter, which we called PENS, is tested on seven different sets of the ORATOR database, where each set contains seven multi-speaker files. Results show that the PENS parameter permits us to make a good discrimination, without any ambiguity, between a mono-speaker signal (only one speaker is speaking) and a mixed-speakers signal (several speakers are speaking simultaneously). Moreover, it permits us to estimate, in case of mixed speech signals, the number of speakers with a good precision, especially when the number of speakers is less than four.},
number = {2},
journal = {Journal of Information Hiding and Multimedia Signal Processing},
author = {Sayoud, Halim and Ouamour, Siham},
month = apr,
year = {2010},
keywords = {lu},
file = {Citeseer - Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\9BSGC2K7\\summary.html:text/html;Sayoud and Ouamour - 2010 - Proposal of a New Confidence Parameter Estimating .pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\DG3RFZBH\\Sayoud and Ouamour - 2010 - Proposal of a New Confidence Parameter Estimating .pdf:application/pdf},
}
@article{arberet_robust_2010,
title = {A robust method to count and locate audio sources in a multichannel underdetermined mixture},
volume = {58},
issn = {1053-587X},
doi = {10.1109/TSP.2009.2030854},
abstract = {We propose a method to count and estimate the mixing directions in an underdetermined multichannel mixture. The approach is based on the hypothesis that in the neighborhood of \textit{some} time-frequency points, only one source essentially contributes to the mixture: such time-frequency points can provide robust local estimates of the corresponding source direction. At the core of our contribution is a statistical model to exploit a local confidence measure, which detects the time-frequency regions where such robust information is available. A clustering algorithm called DEMIX is proposed to merge the information from all time-frequency regions according to their confidence level. So as to estimate the delays of anechoic mixtures and overcome the intrinsic ambiguities of phase unwrapping as met with DUET, we propose a technique similar to GCC-PHAT that is able to estimate delays that can largely exceed one sample. We propose an extensive experimental study that shows the resulting method is more robust in conditions where all DUET-like comparable methods fail, that is, in particular, a) when time-delays largely exceed one sample and b) when the source directions are very close.},
number = {1},
journal = {IEEE Transactions on Signal Processing},
author = {Arberet, Simon and Gribonval, Rémi and Bimbot, Frédéric},
year = {2010},
keywords = {Audio recording, audio signals, audio sources, Biomedical imaging, blind source separation, Blind source separation, Clustering algorithms, delay estimation, Delay estimation, DEMIX, direction of arrival, Direction of arrival estimation, direction-of-arrival estimation, lu, mixing directions, multichannel audio, multichannel underdetermined mixture, Phase estimation, Robustness, signal sources, Source separation, sparse component analysis, sparse matrices, Speech processing, Time frequency analysis, time-delays},
pages = {121--133},
file = {RR-6593.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\ZM7N943E\\RR-6593.pdf:application/pdf},
}
@article{anguera_speaker_2012,
title = {Speaker diarization: a review of recent research},
volume = {20},
issn = {1558-7916},
shorttitle = {Speaker {Diarization}},
doi = {10.1109/TASL.2011.2125954},
abstract = {Speaker diarization is the task of determining “who spoke when?” in an audio or video recording that contains an unknown amount of speech and also an unknown number of speakers. Initially, it was proposed as a research topic related to automatic speech recognition, where speaker diarization serves as an upstream processing step. Over recent years, however, speaker diarization has become an important key technology for many tasks, such as navigation, retrieval, or higher level inference on audio data. Accordingly, many important improvements in accuracy and robustness have been reported in journals and conferences in the area. The application domains, from broadcast news, to lectures and meetings, vary greatly and pose different problems, such as having access to multiple microphones and multimodal information or overlapping speech. The most recent review of existing technology dates back to 2006 and focuses on the broadcast news domain. In this paper, we review the current state-of-the-art, focusing on research developed since 2006 that relates predominantly to speaker diarization for conference meetings. Finally, we present an analysis of speaker diarization performance as reported through the NIST Rich Transcription evaluations on meeting data and identify important areas for future research.},
number = {2},
journal = {IEEE Transactions on Audio, Speech, and Language Processing},
author = {Anguera, Xavier and Bozonnet, Simon and Evans, Nicholas and Fredouille, Corinne and Friedland, Gerald and Vinyals, Oriol},
month = feb,
year = {2012},
keywords = {Acoustics, Adaptation models, audio data, audio recording, audio signal processing, automatic speech recognition, broadcast news, conference meetings, Data models, information resources, Meetings, Microphones, multimodal information, NIST, NIST Rich Transcription evaluations, non-lu, rich transcription, speaker diarization, speaker recognition, Speech, speech overlapping, Speech recognition, teleconferencing, television broadcasting, upstream processing, video recording},
pages = {356--370},
file = {IEEE Xplore Abstract Record:C\:\\Users\\RQML4978\\Zotero\\storage\\LY5XV48K\\6135543.html:text/html;IEEE Xplore Full Text PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\55LN3XWQ\\Anguera et al. - 2012 - Speaker Diarization A Review of Recent Research.pdf:application/pdf},
}
@article{adavanne_localization_2019,
title = {Localization, detection and tracking of multiple moving sound sources with a convolutional recurrent neural network},
	abstract = {This paper investigates the joint localization, detection, and tracking of sound events using a convolutional recurrent neural network (CRNN). We use a CRNN previously proposed for the localization and detection of stationary sources, and show that the recurrent layers enable the spatial tracking of moving sources when trained with dynamic scenes. The tracking performance of the CRNN is compared with a stand-alone tracking method that combines a multi-source direction-of-arrival (DOA) estimator and a particle filter. Their respective performance is evaluated in various acoustic conditions such as anechoic and reverberant scenarios, stationary and moving sources at several angular velocities, and with a varying number of overlapping sources. The results show that the CRNN manages to track multiple sources more consistently than the parametric method across acoustic scenarios, but at the cost of higher localization error.},
language = {en},
urldate = {2019-04-30},
journal = {arXiv:1904.12769},
author = {Adavanne, Sharath and Politis, Archontis and Virtanen, Tuomas},
month = apr,
year = {2019},
keywords = {Computer Science - Machine Learning, Computer Science - Sound, Electrical Engineering and Systems Science - Audio and Speech Processing, lu},
file = {Adavanne et al. - 2019 - Localization, Detection and Tracking of Multiple M.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\APNFVL6N\\Adavanne et al. - 2019 - Localization, Detection and Tracking of Multiple M.pdf:application/pdf},
}
@article{purwins_deep_2019,
title = {Deep learning for audio signal processing},
volume = {13},
issn = {1932-4553, 1941-0484},
doi = {10.1109/JSTSP.2019.2908700},
abstract = {Given the recent surge in developments of deep learning, this article provides a review of the state-of-the-art deep learning techniques for audio signal processing. Speech, music, and environmental sound processing are considered side-by-side, in order to point out similarities and differences between the domains, highlighting general methods, problems, key references, and potential for cross-fertilization between areas. The dominant feature representations (in particular, log-mel spectra and raw waveform) and deep learning models are reviewed, including convolutional neural networks, variants of the long short-term memory architecture, as well as more audio-specific neural network models. Subsequently, prominent deep learning application areas are covered, i.e. audio recognition (automatic speech recognition, music information retrieval, environmental sound detection, localization and tracking) and synthesis and transformation (source separation, audio enhancement, generative models for speech, sound, and music synthesis). Finally, key issues and future questions regarding deep learning applied to audio signal processing are identified.},
number = {2},
journal = {IEEE Journal of Selected Topics in Signal Processing},
author = {Purwins, Hendrik and Li, Bo and Virtanen, Tuomas and Schlüter, Jan and Chang, Shuo-yiin and Sainath, Tara},
month = apr,
year = {2019},
keywords = {lu},
pages = {206--219},
file = {arXiv\:1905.00078 PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\KHUSBDWQ\\Purwins et al. - 2019 - Deep Learning for Audio Signal Processing.pdf:application/pdf;arXiv.org Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\YDMQEZL4\\1905.html:text/html},
}
@inproceedings{von_neumann_all-neural_2019,
title = {All-neural online source separation, counting, and diarization for meeting analysis},
doi = {10.1109/ICASSP.2019.8682572},
abstract = {Automatic meeting analysis comprises the tasks of speaker counting, speaker diarization, and the separation of overlapped speech, followed by automatic speech recognition. This all has to be carried out on arbitrarily long sessions and, ideally, in an online or block-online manner. While significant progress has been made on individual tasks, this paper presents for the first time an all-neural approach to simultaneous speaker counting, diarization and source separation. The NN-based estimator operates in a block-online fashion and tracks speakers even if they remain silent for a number of time blocks, thus learning a stable output order for the separated sources. The neural network is recurrent over time as well as over the number of sources. The simulation experiments show that state of the art separation performance is achieved, while at the same time delivering good diarization and source counting results. It even generalizes well to an unseen large number of blocks.},
booktitle = {{IEEE} {International} {Conference} on {Acoustics}, {Speech} and {Signal} {Processing}},
author = {von Neumann, Thilo and Kinoshita, Keisuke and Delcroix, Marc and Araki, Shoko and Nakatani, Tomohiro and Haeb-Umbach, Reinhold},
month = may,
year = {2019},
keywords = {Artificial neural networks, Blind source separation, Estimation, Indexes, meeting diarization, neural network, non-lu, online processing, source counting, Source separation, Speech recognition, Task analysis},
pages = {91--95},
file = {Neumann et al. - 2019 - All-neural Online Source Separation, Counting, and.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\Z79VEYD8\\Neumann et al. - 2019 - All-neural Online Source Separation, Counting, and.pdf:application/pdf},
}
@article{oord_wavenet:_2016,
title = {{WaveNet}: a generative model for raw audio},
	abstract = {This paper introduces WaveNet, a deep neural network for generating raw audio waveforms. The model is fully probabilistic and autoregressive, with the predictive distribution for each audio sample conditioned on all previous ones; nonetheless we show that it can be efficiently trained on data with tens of thousands of samples per second of audio. When applied to text-to-speech, it yields state-of-the-art performance, with human listeners rating it as significantly more natural sounding than the best parametric and concatenative systems for both English and Mandarin. A single WaveNet can capture the characteristics of many different speakers with equal fidelity, and can switch between them by conditioning on the speaker identity. When trained to model music, we find that it generates novel and often highly realistic musical fragments. We also show that it can be employed as a discriminative model, returning promising results for phoneme recognition.},
language = {en},
journal = {arXiv:1609.03499},
author = {Oord, Aaron van den and Dieleman, Sander and Zen, Heiga and Simonyan, Karen and Vinyals, Oriol and Graves, Alex and Kalchbrenner, Nal and Senior, Andrew and Kavukcuoglu, Koray},
month = sep,
year = {2016},
keywords = {Computer Science - Machine Learning, Computer Science - Sound, lu},
file = {Oord et al. - 2016 - WaveNet A Generative Model for Raw Audio.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\Z8FP47L6\\Oord et al. - 2016 - WaveNet A Generative Model for Raw Audio.pdf:application/pdf},
}
@book{chollet_deep_2017,
title = {Deep learning with {Python}},
isbn = {978-1-61729-443-3},
language = {en},
publisher = {Simon and Schuster},
author = {Chollet, François},
year = {2017},
keywords = {lu, Machine learning, Neural networks (Computer science), Python (Computer program language)},
file = {Chollet - 2018 - Deep learning with Python.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\S8KTC9RZ\\Chollet - 2018 - Deep learning with Python.pdf:application/pdf},
}
@article{silver_mastering_2017,
title = {Mastering chess and shogi by self-play with a general reinforcement learning algorithm},
abstract = {The game of chess is the most widely-studied domain in the history of artificial intelligence. The strongest programs are based on a combination of sophisticated search techniques, domain-specific adaptations, and handcrafted evaluation functions that have been refined by human experts over several decades. In contrast, the AlphaGo Zero program recently achieved superhuman performance in the game of Go, by tabula rasa reinforcement learning from games of self-play. In this paper, we generalise this approach into a single AlphaZero algorithm that can achieve, tabula rasa, superhuman performance in many challenging domains. Starting from random play, and given no domain knowledge except the game rules, AlphaZero achieved within 24 hours a superhuman level of play in the games of chess and shogi (Japanese chess) as well as Go, and convincingly defeated a world-champion program in each case.},
journal = {arXiv:1712.01815},
author = {Silver, David and Hubert, Thomas and Schrittwieser, Julian and Antonoglou, Ioannis and Lai, Matthew and Guez, Arthur and Lanctot, Marc and Sifre, Laurent and Kumaran, Dharshan and Graepel, Thore and Lillicrap, Timothy and Simonyan, Karen and Hassabis, Demis},
month = dec,
year = {2017},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Machine Learning, lu},
file = {arXiv\:1712.01815 PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\U2H9ZHWU\\Silver et al. - 2017 - Mastering Chess and Shogi by Self-Play with a Gene.pdf:application/pdf;arXiv.org Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\L98AT8LM\\1712.html:text/html},
}
@inproceedings{perotin_regression_2019,
title = {Regression versus classification for neural network based audio source localization},
abstract = {We compare the performance of regression and classification neural networks for single-source direction-of-arrival estimation. Since the output space is continuous and structured, regression seems more appropriate. However, classification on a discrete spherical grid is widely believed to perform better and is predominantly used in the literature. For regression, we propose two ways to account for the spherical geometry of the output space based either on the angular distance between spherical coordinates or on the mean squared error between Cartesian coordinates. For classification, we propose two alternatives to the classical one-hot encoding framework: we derive a Gibbs distribution from the squared angular distance between grid points and use the corresponding probabilities either as soft targets or as cross-entropy weights that retain a clear probabilistic interpretation. We show that regression on Cartesian coordinates is generally more accurate, except when localized interference is present, in which case classification appears to be more robust.},
language = {en},
booktitle = {{IEEE} {Workshop} on {Applications} of {Signal} {Processing} to {Audio} and {Acoustics}},
author = {Perotin, Lauréline and Défossez, Alexandre and Vincent, Emmanuel and Serizel, Romain and Guérin, Alexandre},
year = {2019},
keywords = {lu},
file = {waspaa_perotin_camera_ready.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\VLDB6UZK\\waspaa_perotin_camera_ready.pdf:application/pdf},
}
@phdthesis{perotin_localisation_2019,
title = {Localisation et rehaussement de sources de parole au format {Ambisonique}},
	language = {fr},
school = {Université de Lorraine},
author = {Perotin, Lauréline},
month = oct,
year = {2019},
keywords = {partiel-lu},
file = {Perotin - 2019 - Localisation et rehaussement de sources de parole .pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\DH3UF3GL\\Perotin - 2019 - Localisation et rehaussement de sources de parole .pdf:application/pdf},
}
@inproceedings{roden_sound_2015,
title = {On sound source localization of speech signals using deep neural networks},
copyright = {http://rightsstatements.org/vocab/InC/1.0/},
isbn = {978-3-939296-08-9},
	abstract = {In recent years, artificial neural networks have been successfully applied, especially in the context of automatic speech recognition. As information processing systems, neural networks are trained by, e.g., backpropagation or restricted Boltzmann machines to classify patterns at the input of the system. The current work presents the implementation of a deep neural network (DNN) architecture for acoustic source localization.},
language = {en},
booktitle = {Deutsche {Jahrestagung} für {Akustik}},
author = {Roden, Reinhild and Moritz, Niko and Gerlach, Stephan and Weinzierl, Stefan and Goetze, Stefan},
year = {2015},
keywords = {lu},
file = {Full Text PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\N69VE3R2\\Roden et al. - 2015 - On sound source localization of speech signals usi.pdf:application/pdf;Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\5EMA4FWE\\9746.html:text/html},
}
@inproceedings{zermini_deep_2016,
title = {Deep neural network based audio source separation},
	abstract = {Audio source separation aims to extract individual sources from mixtures of multiple sound sources. Many techniques have been developed such as independent component analysis, computational auditory scene analysis, and non-negative matrix factorisation. A method based on Deep Neural Networks (DNNs) and time-frequency (T-F) masking has been recently developed for binaural audio source separation. In this method, the DNNs are used to predict the Direction Of Arrival (DOA) of the audio sources with respect to the listener which is then used to generate soft T-F masks for the recovery/estimation of the individual audio sources.},
booktitle = {{IMA} {International} {Conference} on {Mathematics} in {Signal} {Processing}},
author = {Zermini, Alfredo and Yu, Yingfu and Xu, Yong and Wang, Wenwu and Plumbley, Mark D.},
year = {2016},
keywords = {lu, non-direct},
file = {Zermini et al. - 2016 - Deep neural network based audio source separation.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\R5V9PPQL\\Zermini et al. - 2016 - Deep neural network based audio source separation.pdf:application/pdf},
}
@article{yalta_sound_2017,
title = {Sound source localization using deep learning models},
volume = {29},
doi = {10.20965/jrm.2017.p0037},
number = {1},
urldate = {2020-03-19},
journal = {Journal of Robotics and Mechatronics},
author = {Yalta, Nelson and Nakadai, Kazuhiro and Ogata, Tetsuya},
year = {2017},
keywords = {lu},
pages = {37--48},
file = {Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\QUWEG52T\\robot002900010037.html:text/html;Yalta et al. - 2017 - Sound Source Localization Using Deep Learning Mode.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\C9HEVQIM\\Yalta et al. - 2017 - Sound Source Localization Using Deep Learning Mode.pdf:application/pdf},
}
@article{suvorov_deep_2018,
title = {Deep residual network for sound source localization in the time domain},
	abstract = {This study presents a system for sound source localization in the time domain using a deep residual neural network. Data from the linear 8-channel microphone array with 3 cm spacing is used by the network for direction estimation. We propose to use the deep residual network for sound source localization, considering the localization task as a classification task. This study describes the gathered dataset and the developed architecture of the neural network. We show the training process and its results. The developed system was tested on the validation part of the dataset and on new data captured in real time. The classification accuracy for 30 ms sound frames is 99.2\%. The standard deviation of sound source localization is 4°. The proposed method of sound source localization was tested inside a speech recognition pipeline. Its usage decreased word error rate by 1.14\% in comparison with a similar speech recognition pipeline using GCC-PHAT sound source localization.},
journal = {arXiv:1808.06429},
author = {Suvorov, Dmitry and Dong, Ge and Zhukov, Roman},
month = aug,
year = {2018},
keywords = {lu},
file = {arXiv Fulltext PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\HWHPTCSD\\Suvorov et al. - 2018 - Deep Residual Network for Sound Source Localizatio.pdf:application/pdf;arXiv.org Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\MJVFTJ5E\\1808.html:text/html},
}
@article{he_deep_2018,
title = {Deep neural networks for multiple speaker detection and localization},
doi = {10.1109/ICRA.2018.8461267},
abstract = {We propose to use neural networks for simultaneous detection and localization of multiple sound sources in human-robot interaction. In contrast to conventional signal processing techniques, neural network-based sound source localization methods require fewer strong assumptions about the environment. Previous neural network-based methods have been focusing on localizing a single sound source, which do not extend to multiple sources in terms of detection and localization. In this paper, we thus propose a likelihood-based encoding of the network output, which naturally allows the detection of an arbitrary number of sources. In addition, we investigate the use of sub-band cross-correlation information as features for better localization in sound mixtures, as well as three different network architectures based on different motivations. Experiments on real data recorded from a robot show that our proposed methods significantly outperform the popular spatial spectrum-based approaches.},
journal = {IEEE International Conference on Robotics and Automation},
author = {He, Weipeng and Motlicek, Petr and Odobez, Jean-Marc},
year = {2018},
keywords = {lu},
pages = {74--79},
file = {arXiv Fulltext PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\K35TSYM4\\He et al. - 2018 - Deep Neural Networks for Multiple Speaker Detectio.pdf:application/pdf;arXiv.org Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\D3P98NL3\\1711.html:text/html},
}
@inproceedings{hirvonen_classication_2015,
title = {Classification of spatial audio location and content using convolutional neural networks},
abstract = {This paper investigates the use of Convolutional Neural Networks for spatial audio classification. In contrast to traditional methods that use hand-engineered features and algorithms, we show that a Convolutional Network in combination with generic preprocessing can give good results, and allows for specialization to challenging conditions. The method can adapt to e.g. different source distances and microphone arrays, as well as estimate both spatial location and audio content type jointly. For example, with typical single-source material in a simulated reverberant room, we can achieve cross-validation accuracy of 94.3\% for 40-ms frames across 16 classes (eight spatial directions, content type speech vs. music).},
language = {en},
booktitle = {Audio {Engineering} {Society} {Convention}},
author = {Hirvonen, Toni},
year = {2015},
keywords = {lu},
file = {Hirvonen - 2015 - Classification of Spatial Audio Location and Conten.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\IWKKP9KV\\Hirvonen - 2015 - Classification of Spatial Audio Location and Conten.pdf:application/pdf;Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\MJYRHEX9\\browse.html:text/html},
}
@inproceedings{takeda_discriminative_2016,
title = {Discriminative multiple sound source localization based on deep neural networks using independent location model},
doi = {10.1109/SLT.2016.7846325},
	abstract = {We propose a training method for multiple sound source localization (SSL) based on deep neural networks (DNNs). Such networks function as posterior probability estimator of sound location in terms of position labels and achieve high localization correctness. Since the previous DNNs' configuration for SSL handles one-sound-source cases, it should be extended to multiple-sound-source cases to apply it to real environments. However, a naïve design causes 1) an increase in the number of labels and training data patterns and 2) a lack of label consistency across different numbers of sound sources, such as one and two-or-more-sound cases. These two problems were solved using our proposed method, which involves an independent location model for the former and a block-wise consistent labeling with ordering for the latter. Our experiments indicated that the SSL based on DNNs trained by our proposed training method outperformed a conventional SSL method by a maximum of 18 points in terms of block-level correctness.},
booktitle = {{IEEE} {Spoken} {Language} {Technology} {Workshop}},
author = {Takeda, Ryu and Komatani, Kazunori},
year = {2016},
keywords = {lu},
pages = {603--609},
file = {IEEE Xplore Abstract Record:C\:\\Users\\RQML4978\\Zotero\\storage\\7E7ZDJ5Z\\7846325.html:text/html;IEEE Xplore Full Text PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\RZNETCRE\\Takeda et Komatani - 2016 - Discriminative multiple sound source localization .pdf:application/pdf},
}
@article{vera-diaz_towards_2018,
title = {Towards end-to-end acoustic localization using deep learning: from audio signal to source position coordinates},
volume = {18},
issn = {1424-8220},
shorttitle = {Towards {End}-to-{End} {Acoustic} {Localization} using {Deep} {Learning}},
doi = {10.3390/s18103418},
	abstract = {This paper presents a novel approach for indoor acoustic source localization using microphone arrays and based on a Convolutional Neural Network (CNN). The proposed solution is, to the best of our knowledge, the first published work in which the CNN is designed to directly estimate the three dimensional position of an acoustic source, using the raw audio signal as the input information avoiding the use of hand crafted audio features. Given the limited amount of available localization data, we propose in this paper a training strategy based on two steps. We first train our network using semi-synthetic data, generated from close talk speech recordings, and where we simulate the time delays and distortion suffered in the signal that propagates from the source to the array of microphones. We then fine tune this network using a small amount of real data. Our experimental results show that this strategy is able to produce networks that significantly improve existing localization methods based on SRP-PHAT strategies. In addition, our experiments show that our CNN method exhibits better resistance against varying gender of the speaker and different window sizes compared with the other methods.},
number = {10},
journal = {Sensors},
author = {Vera-Diaz, Juan Manuel and Pizarro, Daniel and Macias-Guarasa, Javier},
year = {2018},
keywords = {lu},
pages = {3418},
file = {arXiv Fulltext PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\LUC4GIH3\\Vera-Diaz et al. - 2018 - Towards End-to-End Acoustic Localization using Dee.pdf:application/pdf;arXiv.org Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\Y2WKDTWD\\1807.html:text/html},
}
@article{salvati_exploiting_2018,
title = {Exploiting {CNNs} for improving acoustic source localization in noisy and reverberant conditions},
volume = {2},
issn = {2471-285X},
doi = {10.1109/TETCI.2017.2775237},
	abstract = {This paper discusses the application of convolutional neural networks (CNNs) to minimum variance distortionless response localization schemes. We investigate the direction of arrival estimation problems in noisy and reverberant conditions using a uniform linear array (ULA). CNNs are used to process the multichannel data from the ULA and to improve the data fusion scheme, which is performed in the steered response power computation. CNNs improve the incoherent frequency fusion of the narrowband response power by weighting the components, reducing the deleterious effects of those components affected by artifacts due to noise and reverberation. The use of CNNs avoids the necessity of previously encoding the multichannel data into selected acoustic cues with the advantage to exploit its ability in recognizing geometrical pattern similarity. Experiments with both simulated and real acoustic data demonstrate the superior localization performance of the proposed SRP beamformer with respect to other state-of-the-art techniques.},
number = {2},
journal = {IEEE Transactions on Emerging Topics in Computational Intelligence},
author = {Salvati, Daniele and Drioli, Carlo and Foresti, Gian Luca},
month = apr,
year = {2018},
keywords = {lu},
pages = {103--116},
file = {IEEE Xplore Abstract Record:C\:\\Users\\RQML4978\\Zotero\\storage\\DR4S4Y4X\\8323305.html:text/html;IEEE Xplore Full Text PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\TWMD5XDG\\Salvati et al. - 2018 - Exploiting CNNs for Improving Acoustic Source Loca.pdf:application/pdf},
}
@inproceedings{thuillier_spatial_2018,
title = {Spatial audio feature discovery with convolutional neural networks},
isbn = {978-1-5386-4658-8},
doi = {10.1109/ICASSP.2018.8462315},
abstract = {The advent of mixed reality consumer products brings about a pressing need to develop and improve spatial sound rendering techniques for a broad user base. Despite a large body of prior work, the precise nature and importance of various sound localization cues and how they should be personalized for an individual user to improve localization performance is still an open research problem. Here we propose training a convolutional neural network (CNN) to classify the elevation angle of spatially rendered sounds and employing Layerwise Relevance Propagation (LRP) on the trained CNN model. LRP provides saliency maps that can be used to identify spectral features used by the network for classification. These maps, in addition to the convolution filters learned by the CNN, are discussed in the context of listening tests reported in the literature. The proposed approach could potentially provide an avenue for future studies on modeling and personalization of head-related transfer functions (HRTFs).},
language = {en},
urldate = {2020-03-26},
booktitle = {{IEEE} {International} {Conference} on {Acoustics}, {Speech} and {Signal} {Processing}},
author = {Thuillier, Etienne and Gamper, Hannes and Tashev, Ivan J.},
year = {2018},
keywords = {lu},
pages = {6797--6801},
file = {Thuillier et al. - 2018 - Spatial Audio Feature Discovery with Convolutional.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\ILSIKGZ5\\Thuillier et al. - 2018 - Spatial Audio Feature Discovery with Convolutional.pdf:application/pdf},
}
@article{ma_phased_2018,
title = {Phased microphone array for sound source localization with deep learning},
volume = {2},
	abstract = {For phased microphone arrays used in sound source localization, an algorithm with both high computational efficiency and high precision is a persistent pursuit. In this paper, a convolutional neural network (CNN), a kind of deep learning, is preliminarily applied as a new algorithm. At high frequencies, the CNN can reconstruct sound source localizations with a spatial resolution as good as DAMAS, within a time as short as conventional beamforming. This exciting result means that the CNN finds the source distribution directly from the cross-spectral matrix without a given propagation function in advance, and thus the CNN deserves to be further explored as a new algorithm.},
number = {2},
journal = {Aerospace Systems},
author = {Ma, Wei and Liu, Xun},
year = {2018},
keywords = {lu},
pages = {71--81},
file = {arXiv Fulltext PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\AMYA67JH\\Ma et Liu - 2018 - Phased Microphone Array for Sound Source Localizat.pdf:application/pdf;arXiv.org Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\XIIWQVEX\\1802.html:text/html},
}
@inproceedings{pertila_robust_2017,
title = {Robust direction estimation with convolutional neural networks based steered response power},
doi = {10.1109/ICASSP.2017.7953333},
abstract = {The steered response power (SRP) methods can be used to build a map of sound direction likelihood. In the presence of interference and reverberation, the map will exhibit multiple peaks with heights related to the corresponding sound's spectral content. Often in realistic use cases, the target of interest (such as speech) can exhibit a lower peak compared to an interference source. This will corrupt any direction dependent method, such as beamforming. Regression has been used to predict time-frequency (TF) regions corrupted by reverberation, and static broadband noise can be efficiently estimated for TF points. TF regions dominated by noise or reverberation can then be de-emphasized to obtain more reliable source direction estimates. In this work, we propose the use of convolutional neural networks (CNNs) for the prediction of a TF mask for emphasizing the direct path speech signal in time-varying interference. SRP with phase transform (SRP-PHAT) combined with the CNN-based masking is shown to be capable of reducing the impact of time-varying interference for speaker direction estimation using real speech sources in reverberation.},
booktitle = {{IEEE} {International} {Conference} on {Acoustics}, {Speech} and {Signal} {Processing}},
author = {Pertilä, Pasi and Cakir, Emre},
month = mar,
year = {2017},
keywords = {lu, non-direct},
pages = {6125--6129},
file = {IEEE Xplore Abstract Record:C\:\\Users\\RQML4978\\Zotero\\storage\\ZC44VLDM\\7953333.html:text/html;IEEE Xplore Full Text PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\S5NCSRWV\\Pertilä et Cakir - 2017 - Robust direction estimation with convolutional neu.pdf:application/pdf},
}
@inproceedings{perotin_crnn-based_2018,
title = {{CRNN}-based joint azimuth and elevation localization with the {Ambisonics} intensity vector},
doi = {10.1109/IWAENC.2018.8521403},
abstract = {We present a source localization system for first-order Ambisonics (FOA) contents based on a stacked convolutional and recurrent neural network (CRNN). We propose to use as input to the CRNN the FOA acoustic intensity vector, which is easy to compute and closely linked to the sound direction of arrival (DoA). The system estimates the DoA of a point source in both azimuth and elevation. We conduct an experimental evaluation in configurations including reverberation, noise, and various speaker w. r. t. microphone orientations. The results show that the proposed architecture and input allow the network to return accurate location estimates in realistic conditions compared to another recent CRNN-based system.},
booktitle = {International {Workshop} on {Acoustic} {Signal} {Enhancement}},
author = {Perotin, Lauréline and Serizel, Romain and Vincent, Emmanuel and Guérin, Alexandre},
month = sep,
year = {2018},
keywords = {lu},
pages = {241--245},
file = {IEEE Xplore Abstract Record:C\:\\Users\\RQML4978\\Zotero\\storage\\24EP5U5G\\8521403.html:text/html;IEEE Xplore Full Text PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\JBSR8ZXL\\Perotin et al. - 2018 - CRNN-based Joint Azimuth and Elevation Localizatio.pdf:application/pdf},
}
@article{xiao_improved_2020,
title = {Improved source counting and separation for monaural mixture},
	abstract = {Single-channel speech separation in the time domain and frequency domain has been widely studied for voice-driven applications over the past few years. Most previous works assume a known number of speakers in advance, which, however, is not easily accessible from a monaural mixture in practice. In this paper, we propose a novel model of single-channel multi-speaker separation by jointly learning the time-frequency feature and the unknown number of speakers. Specifically, our model integrates the time-domain convolution encoded feature map and the frequency-domain spectrogram by an attention mechanism, and the integrated features are projected into high-dimensional embedding vectors which are then clustered with a deep attractor network to modify the encoded feature. Meanwhile, the number of speakers is counted by computing the Gerschgorin disks of the embedding vectors, which are orthogonal for different speakers. Finally, the modified encoded feature is inverted to the sound waveform using a linear decoder. Experimental evaluation on the GRID dataset shows that the proposed method with a single model can accurately estimate the number of speakers with 96.7\% probability of success, while achieving state-of-the-art separation results on multi-speaker mixtures in terms of scale-invariant signal-to-noise ratio improvement (SI-SNRi) and signal-to-distortion ratio improvement (SDRi).},
journal = {arXiv:2004.00175},
author = {Xiao, Yiming and Zhang, Haijian},
month = mar,
year = {2020},
keywords = {non-lu},
file = {arXiv Fulltext PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\T3I9ICN3\\Xiao et Zhang - 2020 - Improved Source Counting and Separation for Monaur.pdf:application/pdf;arXiv.org Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\G7A2QCMV\\2004.html:text/html},
}
@article{cohen_relative_2004,
title = {Relative transfer function identification using speech signals},
volume = {12},
issn = {1558-2353},
doi = {10.1109/TSA.2004.832975},
	abstract = {An important component of a multichannel hands-free communication system is the identification of the relative transfer function between sensors in response to a desired source signal. In this paper, a robust system identification approach adapted to speech signals is proposed. A weighted least-squares optimization criterion is introduced, which considers the uncertainty of the desired signal presence in the observed signals. An asymptotically unbiased estimate for the system's transfer function is derived, and a corresponding recursive online implementation is presented. We show that compared to a competing nonstationarity-based method, a smaller error variance is achieved and generally shorter observation intervals are required. Furthermore, in the case of a time-varying system, faster convergence and higher reliability of the system identification are obtained by using the proposed method than by using the nonstationarity-based method. Evaluation of the proposed system identification approach is performed under various noise conditions, including simulated stationary and nonstationary white Gaussian noise, and car interior noise in real pseudo-stationary and nonstationary environments. The experimental results confirm the advantages of the proposed approach.},
number = {5},
journal = {IEEE Transactions on Speech and Audio Processing},
author = {Cohen, Israel},
month = sep,
year = {2004},
keywords = {lu},
pages = {451--459},
file = {Cohen - 2004 - Relative transfer function identification using sp.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\TA6NM6ZQ\\Cohen - 2004 - Relative transfer function identification using sp.pdf:application/pdf},
}
@article{shalvi_system_1996,
title = {System identification using nonstationary signals},
volume = {44},
issn = {1941-0476},
doi = {10.1109/78.533725},
abstract = {The conventional method for identifying the transfer function of an unknown linear system consists of a least squares fit of its input to its output. It is equivalent to identifying the frequency response of the system by calculating the empirical cross-spectrum between the system's input and output, divided by the empirical auto-spectrum of the input process. However, if the additive noise at the system's output is correlated with the input process, e.g., in case of environmental noise that affects both system's input and output, the method may suffer from a severe bias effect. We present a modification of the cross-spectral method that exploits nonstationary features in the data in order to circumvent bias effects caused by correlated stationary noise. The proposed method is particularly attractive to problems of multichannel signal enhancement and noise cancellation, when the desired signal is nonstationary in nature, e.g., speech or image.},
number = {8},
journal = {IEEE Transactions on Signal Processing},
author = {Shalvi, Ofir and Weinstein, Ehud},
month = aug,
year = {1996},
pages = {2055--2063},
file = {Shalvi et Weinstein - 1996 - System identification using nonstationary signals.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\IKYG2JS6\\Shalvi et Weinstein - 1996 - System identification using nonstationary signals.pdf:application/pdf},
}
@inproceedings{kinoshita_tackling_2020,
title = {Tackling real noisy reverberant meetings with all-neural source separation, counting, and diarization system},
doi = {10.1109/ICASSP40776.2020.9054577},
	abstract = {Automatic meeting analysis is an essential fundamental technology required to let, e.g., smart devices follow and respond to our conversations. To achieve an optimal automatic meeting analysis, we previously proposed an all-neural approach that jointly solves source separation, speaker diarization and source counting problems in an optimal way (in the sense that all three tasks can be jointly optimized through error back-propagation). It was shown that the method could well handle simulated clean (noiseless and anechoic) dialog-like data, and achieved very good performance in comparison with several conventional methods. However, it was not clear whether such an all-neural approach would successfully generalize to more complicated real meeting data containing more spontaneously-speaking speakers, severe noise and reverberation, and how it performs in comparison with the state-of-the-art systems in such scenarios. In this paper, we first consider practical issues required for improving the robustness of the all-neural approach, and then experimentally show that, even in real meeting scenarios, the all-neural approach can perform effective speech enhancement, and simultaneously outperform state-of-the-art systems.},
booktitle = {{IEEE} {International} {Conference} on {Acoustics}, {Speech} and {Signal} {Processing}},
author = {Kinoshita, Keisuke and Delcroix, Marc and Araki, Shoko and Nakatani, Tomohiro},
month = may,
year = {2020},
keywords = {non-lu},
pages = {381--385},
file = {IEEE Xplore Abstract Record:C\:\\Users\\RQML4978\\Zotero\\storage\\MUAYGZ7T\\9054577.html:text/html;IEEE Xplore Full Text PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\2LVWW8LX\\Kinoshita et al. - 2020 - Tackling Real Noisy Reverberant Meetings with All-.pdf:application/pdf},
}
@article{chakrabarty_multi-speaker_2017,
title = {Multi-speaker localization using convolutional neural network trained with noise},
abstract = {The problem of multi-speaker localization is formulated as a multi-class multi-label classification problem, which is solved using a convolutional neural network (CNN) based source localization method. Utilizing the common assumption of disjoint speaker activities, we propose a novel method to train the CNN using synthesized noise signals. The proposed localization method is evaluated for two speakers and compared to a well-known steered response power method.},
journal = {arXiv:1712.04276},
author = {Chakrabarty, Soumitro and Habets, Emanuël A. P.},
year = {2017},
keywords = {lu},
file = {arXiv Fulltext PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\LYVS7JDU\\Chakrabarty et Habets - 2017 - Multi-Speaker Localization Using Convolutional Neu.pdf:application/pdf;arXiv.org Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\8KWB9XAU\\1712.html:text/html;Chakrabarty and Habets - Multi-Speaker Localization Using Convolutional Neu.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\B6R2UM6T\\Chakrabarty and Habets - Multi-Speaker Localization Using Convolutional Neu.pdf:application/pdf},
}
@book{zotter_ambisonics_2019,
title = {Ambisonics: a practical {3D} audio theory for recording, studio production, sound reinforcement, and virtual reality},
isbn = {978-3-030-17206-0 978-3-030-17207-7},
shorttitle = {Ambisonics},
language = {en},
urldate = {2020-05-02},
publisher = {Springer Nature},
author = {Zotter, Franz and Frank, Matthias},
year = {2019},
keywords = {non-lu},
file = {Zotter et Frank - 2019 - Ambisonics A Practical 3D Audio Theory for Record.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\8EBYJ9NF\\Zotter et Frank - 2019 - Ambisonics A Practical 3D Audio Theory for Record.pdf:application/pdf},
}
@inproceedings{sundar_raw_2020,
title = {Raw waveform based end-to-end deep convolutional network for spatial localization of multiple acoustic sources},
doi = {10.1109/ICASSP40776.2020.9054090},
abstract = {In this paper, we present an end-to-end deep convolutional neural network operating on multi-channel raw audio data to localize multiple simultaneously active acoustic sources in space. Previously reported deep learning based approaches work well in localizing a single source directly from multi-channel raw-audio, but are not easily extendable to localize multiple sources due to the well known permutation problem. We propose a novel encoding scheme to represent the spatial coordinates of multiple sources, which facilitates 2D localization of multiple sources in an end-to-end fashion, avoiding the permutation problem and achieving arbitrary spatial resolution. Experiments on a simulated data set and real recordings from the AV16.3 Corpus demonstrate that the proposed method generalizes well to unseen test conditions, and outperforms a recent time difference of arrival (TDOA) based multiple source localization approach reported in the literature.},
booktitle = {{IEEE} {International} {Conference} on {Acoustics}, {Speech} and {Signal} {Processing}},
author = {Sundar, Harshavardhan and Wang, Weiran and Sun, Ming and Wang, Chao},
month = may,
year = {2020},
keywords = {lu},
pages = {4642--4646},
file = {IEEE Xplore Abstract Record:C\:\\Users\\RQML4978\\Zotero\\storage\\H8RDS9U8\\9054090.html:text/html;IEEE Xplore Full Text PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\N5QJ3UVC\\Sundar et al. - 2020 - Raw Waveform Based End-to-end Deep Convolutional N.pdf:application/pdf},
}
@book{jarrett_theory_2017,
series = {Springer {Topics} in {Signal} {Processing}},
title = {Theory and applications of spherical microphone array processing},
volume = {9},
isbn = {978-3-319-42209-1},
abstract = {This book presents the signal processing algorithms that have been developed to process the signals acquired by a spherical microphone array. Spherical microphone arrays can be used to capture the sound field in three dimensions and have received significant interest from researchers and audio engineers. Algorithms for spherical array processing are different to corresponding algorithms already known in the literature of linear and planar arrays because the spherical geometry can be exploited to great beneficial effect. The authors aim to advance the field of spherical array processing by helping those new to the field to study it efficiently and from a single source, as well as by offering a way for more experienced researchers and engineers to consolidate their understanding, adding either or both of breadth and depth. The level of the presentation corresponds to graduate studies at MSc and PhD level. This book begins with a presentation of some of the essential mathematical and physical theory relevant to spherical microphone arrays, and of an acoustic impulse response simulation method, which can be used to comprehensively evaluate spherical array processing algorithms in reverberant environments. The chapter on acoustic parameter estimation describes the way in which useful descriptions of acoustic scenes can be parameterized, and the signal processing algorithms that can be used to estimate the parameter values using spherical microphone arrays. Subsequent chapters exploit these parameters including in particular measures of direction-of-arrival and of diffuseness of a sound field. The array processing algorithms are then classified into two main classes, each described in a separate chapter. These are signal-dependent and signal-independent beamforming algorithms. Although signal-dependent beamforming algorithms are in theory able to provide better performance compared to the signal-independent algorithms, they are currently rarely used in practice. The main reason for this is that the statistical information required by these algorithms is difficult to estimate. In a subsequent chapter it is shown how the estimated acoustic parameters can be used in the design of signal-dependent beamforming algorithms. This final step closes, at least in part, the gap between theory and practice.},
language = {en},
publisher = {Springer},
author = {Jarrett, Daniel P. and Habets, Emanuël A. P. and Naylor, Patrick A.},
year = {2017},
file = {Jarrett et al. - 2017 - Theory and Applications of Spherical Microphone Ar.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\WYMT73ZW\\Jarrett et al. - 2017 - Theory and Applications of Spherical Microphone Ar.pdf:application/pdf;Snapshot:C\:\\Users\\RQML4978\\Zotero\\storage\\5FVYRLW2\\9783319422091.html:text/html},
}
@inproceedings{nguyen_autonomous_2018,
title = {Autonomous sensorimotor learning for sound source localization by a humanoid robot},
abstract = {We consider the problem of learning to localize a speech source using a humanoid robot equipped with a binaural hearing system. We aim to map binaural audio features into the relative angle between the robot’s head direction and the target source direction based on a sensorimotor training framework. To this end, we make the following contributions: (i) a procedure to automatically collect and label audio and motor data for sensorimotor training; (ii) the use of a convolutional neural network (CNN) trained with white noise signal and ground truth relative source direction. Experimental evaluation with speech signals shows that the CNN can localize the speech source even without an explicit algorithm for dealing with missing spectral features.},
language = {en},
booktitle = {Workshop on {Crossmodal} {Learning} for {Intelligent} {Robotics} in conjunction with {IEEE}/{RSJ} {IROS}},
author = {Nguyen, Quan and Girin, Laurent and Bailly, Gérard and Elisei, Frédéric and Nguyen, Duc-Canh},
year = {2018},
keywords = {lu},
file = {Nguyen et al. - Autonomous Sensorimotor Learning for Sound Source .pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\434BHW29\\Nguyen et al. - Autonomous Sensorimotor Learning for Sound Source .pdf:application/pdf},
}
@inproceedings{daniel_time_2020,
title = {Time domain velocity vector for retracing the multipath propagation},
isbn = {978-1-5090-6631-5},
doi = {10.1109/ICASSP40776.2020.9054561},
	abstract = {We propose a conceptually and computationally simple form of sound velocity that offers a readable view of the interference between direct and indirect sound waves. Unlike most approaches in the literature, it jointly exploits both active and reactive sound intensity measurements, as typically derived from a first order ambisonics recording. This representation has a potential both as a valuable tool for directly analyzing sound multipath propagation, as well as being a new spatial feature format for machine learning algorithms in audio and acoustics. As a showcase, we demonstrate that the Direction-of-Arrival and the range of a sound source can be estimated as a development of this approach. To the best knowledge of the authors, this is the first time that range is estimated from an ambisonics recording.},
language = {en},
urldate = {2020-05-25},
booktitle = {{IEEE} {International} {Conference} on {Acoustics}, {Speech} and {Signal} {Processing}},
author = {Daniel, Jerome and Kitic, Srdan},
year = {2020},
keywords = {lu},
pages = {421--425},
file = {Daniel et Kitic - 2020 - Time Domain Velocity Vector for Retracing the Mult.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\7RWFEP3N\\Daniel et Kitic - 2020 - Time Domain Velocity Vector for Retracing the Mult.pdf:application/pdf},
}
@article{gannot_signal_2001,
title = {Signal enhancement using beamforming and nonstationarity with applications to speech},
volume = {49},
issn = {1941-0476},
doi = {10.1109/78.934132},
abstract = {We consider a sensor array located in an enclosure, where arbitrary transfer functions (TFs) relate the source signal and the sensors. The array is used for enhancing a signal contaminated by interference. Constrained minimum power adaptive beamforming, which has been suggested by Frost (1972) and, in particular, the generalized sidelobe canceler (GSC) version, which has been developed by Griffiths and Jim (1982), are the most widely used beamforming techniques. These methods rely on the assumption that the received signals are simple delayed versions of the source signal. The good interference suppression attained under this assumption is severely impaired in complicated acoustic environments, where arbitrary TFs may be encountered. In this paper, we consider the arbitrary TF case. We propose a GSC solution, which is adapted to the general TF case. We derive a suboptimal algorithm that can be implemented by estimating the TFs ratios, instead of estimating the TFs. The TF ratios are estimated by exploiting the nonstationarity characteristics of the desired signal. The algorithm is applied to the problem of speech enhancement in a reverberating room. The discussion is supported by an experimental study using speech and noise signals recorded in an actual room acoustics environment.},
number = {8},
journal = {IEEE Transactions on Signal Processing},
author = {Gannot, Sharon and Burshtein, David and Weinstein, Ehud},
month = aug,
year = {2001},
pages = {1614--1626},
file = {gannot2001.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\QDZ29X4E\\gannot2001.pdf:application/pdf;IEEE Xplore Abstract Record:C\:\\Users\\RQML4978\\Zotero\\storage\\E7MV6WSX\\references.html:text/html;Version soumise:C\:\\Users\\RQML4978\\Zotero\\storage\\8S8AXFEY\\Gannot et al. - 2001 - Signal enhancement using beamforming and nonstatio.pdf:application/pdf},
}
@inproceedings{opochinsky_deep_2019,
title = {Deep ranking-based sound source localization},
isbn = {978-1-72811-123-0},
doi = {10.1109/WASPAA.2019.8937159},
abstract = {Sound source localization is a cumbersome task in challenging reverberation conditions. Recently, there is a growing interest in developing learning-based localization methods. In this approach, acoustic features are extracted from the measured signals and then given as input to a model that maps them to the corresponding source positions. Typically, a massive dataset of labeled samples from known positions is required to train such models.},
language = {en},
urldate = {2020-07-02},
booktitle = {{IEEE} {Workshop} on {Applications} of {Signal} {Processing} to {Audio} and {Acoustics}},
author = {Opochinsky, Renana and Laufer-Goldshtein, Bracha and Gannot, Sharon and Chechik, Gal},
year = {2019},
keywords = {lu},
pages = {283--287},
file = {Opochinsky et al. - 2019 - Deep Ranking-Based Sound Source Localization.pdf:C\:\\Users\\RQML4978\\Zotero\\storage\\K8NJHK4T\\Opochinsky et al. - 2019 - Deep Ranking-Based Sound Source Localization.pdf:application/pdf},
}
@inproceedings{wang_speaker_2020,
title = {Speaker counting model based on transfer learning from {SincNet} bottleneck layer},
doi = {10.1109/PerCom45495.2020.9127390},
	abstract = {People counting techniques have been widely researched recently and many different types of sensors can be used in this context. In this paper, we propose a system based on a deep-learning model able to identify the number of people in crowded scenarios through speech sound. In a nutshell, the system relies on two components: counting concurrent speakers in overlapping talking sound directly, and clustering single-speaker sound by speaker identity over time. Compared to previously proposed speaker-counting systems that only cluster single-speaker sound, this system is more accurate and less vulnerable to overlapping sound in a crowded environment. In addition, counting speakers in overlapping sound also gives the minimal number of speakers, so that it also improves the counting accuracy in a quiet environment. Our methodology is inspired by the newly proposed SincNet deep neural network framework, which proves to be outstanding and highly efficient in sound processing with raw signals. By transferring the bottleneck layer of the SincNet model as features fed to our speaker clustering model, we reached a noticeably better performance than previous models that rely on the use of MFCC and other engineered features.},
booktitle = {{IEEE} {International} {Conference} on {Pervasive} {Computing} and {Communications}},
author = {Wang, Wei and Seraj, Fatjon and Meratnia, Nirvana and Havinga, Paul J.M.},
month = mar,
year = {2020},
keywords = {non-lu},
pages = {1--8},
file = {IEEE Xplore Abstract Record:C\:\\Users\\RQML4978\\Zotero\\storage\\8UFGTFJC\\9127390.html:text/html;IEEE Xplore Full Text PDF:C\:\\Users\\RQML4978\\Zotero\\storage\\NXDGL7KB\\Wang et al. - 2020 - Speaker Counting Model based on Transfer Learning .pdf:application/pdf},