-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsources.bib
777 lines (706 loc) · 36 KB
/
sources.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
@inproceedings{Li2021AASR,
  author = {Li, Bo and Gulati, Anmol and Yu, Jiahui and Sainath, Tara N. and Chiu, Chung-Cheng and Narayanan, Arun and Chang, Shuo-Yiin and Pang, Ruoming and He, Yanzhang and Qin, James and Han, Wei and Liang, Qiao and Zhang, Yu and Strohman, Trevor and Wu, Yonghui},
  title = {A Better and Faster End-to-End Model for Streaming {ASR}},
  booktitle = {{IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  year = {2021},
  eprint = {2011.10798},
  archiveprefix = {arXiv},
  keywords = {Conformer, RNN-T, cascaded encoders, latency},
}
@article{Valiant1990AComputation,
  author = {Valiant, Leslie G.},
  title = {A Bridging Model for Parallel Computation},
  journal = {Communications of the {ACM}},
  year = {1990},
  month = aug,
  volume = {33},
  number = {8},
  pages = {103--111},
  doi = {10.1145/79173.79181},
  keywords = {Bulksynchronous, Design, parallel model},
}
@inproceedings{Prabhavalkar2017ARecognition,
  author = {Prabhavalkar, Rohit and Rao, Kanishka and Sainath, Tara N. and Li, Bo and Johnson, Leif and Jaitly, Navdeep},
  title = {A Comparison of Sequence-to-Sequence Models for Speech Recognition},
  booktitle = {Proceedings of the Annual Conference of the International Speech Communication Association, {INTERSPEECH}},
  year = {2017},
  pages = {939--943},
  doi = {10.21437/Interspeech.2017-233},
}
@article{Hammer2003AMachines,
  author = {Hammer, Barbara and Gersmann, Kai},
  title = {A Note on the Universal Approximation Capability of Support Vector Machines},
  journal = {Neural Processing Letters},
  year = {2003},
  month = feb,
  volume = {17},
  number = {1},
  pages = {43--53},
  publisher = {Springer},
  doi = {10.1023/A:1022936519097},
  issn = {1573-773X},
  keywords = {Artificial Intelligence, Complex Systems, Computational Intelligence},
}
@article{Robbins1951AMethod,
  author = {Robbins, Herbert and Monro, Sutton},
  title = {A Stochastic Approximation Method},
  journal = {The Annals of Mathematical Statistics},
  year = {1951},
  volume = {22},
  number = {3},
  pages = {400--407},
}
@article{Sayed2014ASecurity,
  author = {Ahmed, Elmustafa Sayed Ali and Saeed, Rashid A.},
  title = {A Survey of Big Data Cloud Computing Security},
  journal = {International Journal of Computer Science and Software Engineering ({IJCSSE})},
  year = {2014},
  volume = {3},
  number = {1},
  url = {http://www.ijcsse.org},
  issn = {2409-4285},
  keywords = {Big Data, Big Data Security, Cloud Computing, Cloud Providers, NAS, big data privacy},
}
@mastersthesis{Mansikkaniemi2010AcousticService,
  author = {Mansikkaniemi, Andr{\'{e}}},
  title = {Acoustic model and language model adaptation for a mobile dictation service},
  school = {Aalto University},
  year = {2010},
  url = {https://aaltodoc.aalto.fi/handle/123456789/3176},
  keywords = {acoustic model adaptation, adaptering av akustiska modeller, adaptering av spr{\aa}kmodeller, automatic speech recognition, automatisk taligenk{\"{a}}nning, language model adaptation, mobil diktering, mobile dictation},
}
@inproceedings{Vaswani2017AttentionNeed,
  author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, {\L}ukasz and Polosukhin, Illia},
  title = {Attention Is All You Need},
  booktitle = {Advances in Neural Information Processing Systems},
  year = {2017},
  volume = {30},
  pages = {5998--6008},
}
@misc{Recht2012BeneathConsequences,
  author = {Recht, Benjamin and R{\'{e}}, Christopher},
  title = {Beneath the valley of the noncommutative arithmetic-geometric mean inequality: conjectures, case-studies, and consequences},
  year = {2012},
  eprint = {1202.4184},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/abs/1202.4184},
  keywords = {Inequalities, Matrix, Optimization, Positive definite matrices, Random matrices, Randomized algorithms, Stochastic gradient descent},
}
@misc{Jia2018BeyondNetworks,
  author = {Jia, Zhihao and Zaharia, Matei and Aiken, Alex},
  title = {Beyond Data and Model Parallelism for Deep Neural Networks},
  year = {2018},
  month = jul,
  eprint = {1807.05358},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/abs/1807.05358v1},
}
@article{Schuster1997BidirectionalNetworks,
  author = {Schuster, Mike and Paliwal, Kuldip K.},
  title = {Bidirectional Recurrent Neural Networks},
  journal = {{IEEE} Transactions on Signal Processing},
  year = {1997},
  volume = {45},
  number = {11},
  pages = {2673--2681},
  doi = {10.1109/78.650093},
}
@inproceedings{Ardila2020CommonCorpus,
  author = {Ardila, Rosana and Branson, Megan and Davis, Kelly and Henretty, Michael and Kohler, Michael and Meyer, Josh and Morais, Reuben and Saunders, Lindsay and Tyers, Francis M. and Weber, Gregor},
  title = {{Common Voice}: A Massively-Multilingual Speech Corpus},
  booktitle = {Proceedings of the 12th Language Resources and Evaluation Conference},
  year = {2020},
  eprint = {1912.06670},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/abs/1912.06670},
  keywords = {Automatic Speech Recognition, low-resource languages, spoken corpus},
}
@article{Deshmukh2020ComparisonRecognition,
  author = {Deshmukh, A. M.},
  title = {Comparison of Hidden {Markov} Model and Recurrent Neural Network in Automatic Speech Recognition},
  journal = {European Journal of Engineering Research and Science},
  year = {2020},
  volume = {5},
  number = {8},
  doi = {10.24018/ejers.2020.5.8.2077},
  keywords = {Automatic Speech Recognition, Deep Neural Network, Gaussian Mixture Model, Hidden Markov Model, Recurrent Neural Network},
}
@inproceedings{Gulati2020Conformer:Recognition,
  author = {Gulati, Anmol and Qin, James and Chiu, Chung-Cheng and Parmar, Niki and Zhang, Yu and Yu, Jiahui and Han, Wei and Wang, Shibo and Zhang, Zhengdong and Wu, Yonghui and Pang, Ruoming},
  title = {{Conformer}: Convolution-augmented {Transformer} for Speech Recognition},
  booktitle = {Proceedings of the Annual Conference of the International Speech Communication Association, {INTERSPEECH}},
  year = {2020},
  month = oct,
  pages = {5036--5040},
  publisher = {International Speech Communication Association},
  eprint = {2005.08100},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/abs/2005.08100v1},
  keywords = {Attention, Convolutional neural networks, End-to-end, Speech recognition, Transformer},
}
@inproceedings{Gulati2020Conformer:Recognitionb,
  author = {Gulati, Anmol and Qin, James and Chiu, Chung-Cheng and Parmar, Niki and Zhang, Yu and Yu, Jiahui and Han, Wei and Wang, Shibo and Zhang, Zhengdong and Wu, Yonghui and Pang, Ruoming},
  title = {{Conformer}: Convolution-augmented {Transformer} for Speech Recognition},
  booktitle = {Proceedings of the Annual Conference of the International Speech Communication Association, {INTERSPEECH}},
  year = {2020},
  month = oct,
  pages = {5036--5040},
  publisher = {International Speech Communication Association},
  eprint = {2005.08100},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/abs/2005.08100v1},
  keywords = {Attention, Convolutional neural networks, End-to-end, Speech recognition, Transformer},
  internal-note = {Same work as the entry keyed Gulati2020Conformer:Recognition; kept only because documents may still cite this key. Prefer the other key in new text.},
}
@inproceedings{Graves2006ConnectionistNetworks,
  author = {Graves, Alex and Fern{\'{a}}ndez, Santiago and Gomez, Faustino and Schmidhuber, J{\"{u}}rgen},
  title = {Connectionist Temporal Classification: Labelling Unsegmented Sequence Data with Recurrent Neural Networks},
  booktitle = {Proceedings of the 23rd International Conference on Machine Learning},
  year = {2006},
  pages = {369--376},
}
@article{Dahl2012Context-DependentRecognition,
  author = {Dahl, George E. and Yu, Dong and Deng, Li and Acero, Alex},
  title = {Context-Dependent Pre-Trained Deep Neural Networks for Large-Vocabulary Speech Recognition},
  journal = {{IEEE} Transactions on Audio, Speech, and Language Processing},
  year = {2012},
  volume = {20},
  number = {1},
  pages = {30--42},
  doi = {10.1109/TASL.2011.2134090},
}
@misc{Koliousis2019CROSSBOW:Servers,
  author = {Koliousis, Alexandros and Weidlich, Matthias and Mai, Luo and Costa, Paolo and Pietzuch, Peter},
  title = {{CROSSBOW}: Scaling Deep Learning with Small Batch Sizes on Multi-{GPU} Servers},
  year = {2019},
  eprint = {1901.02244},
  archiveprefix = {arXiv},
  url = {https://github.com/lsds/Crossbow},
}
@article{Morgan2012DeepRecognition,
  author = {Morgan, Nelson},
  title = {Deep and Wide: Multiple Layers in Automatic Speech Recognition},
  journal = {{IEEE} Transactions on Audio, Speech, and Language Processing},
  year = {2012},
  volume = {20},
  number = {1},
  pages = {7--13},
  doi = {10.1109/TASL.2011.2116010},
}
@misc{Hestness2017DeepEmpirically,
  author = {Hestness, Joel and Narang, Sharan and Ardalani, Newsha and Diamos, Gregory and Jun, Heewoo and Kianinejad, Hassan and Patwary, Md. Mostofa Ali and Yang, Yang and Zhou, Yanqi},
  title = {Deep Learning Scaling is Predictable, Empirically},
  year = {2017},
  eprint = {1712.00409},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/abs/1712.00409},
}
@misc{Hannun2014DeepRecognition,
  author = {Hannun, Awni and Case, Carl and Casper, Jared and Catanzaro, Bryan and Diamos, Greg and Elsen, Erich and Prenger, Ryan and Satheesh, Sanjeev and Sengupta, Shubho and Coates, Adam and Ng, Andrew Y.},
  title = {{Deep Speech}: Scaling up end-to-end speech recognition},
  year = {2014},
  month = dec,
  eprint = {1412.5567},
  archiveprefix = {arXiv},
  url = {http://arxiv.org/abs/1412.5567},
}
@inproceedings{Mirhoseini2017DeviceLearning,
  author = {Mirhoseini, Azalia and Pham, Hieu and Le, Quoc V. and Steiner, Benoit and Larsen, Rasmus and Zhou, Yuefeng and Kumar, Naveen and Norouzi, Mohammad and Bengio, Samy and Dean, Jeff},
  title = {Device Placement Optimization with Reinforcement Learning},
  booktitle = {Proceedings of the 34th International Conference on Machine Learning},
  series = {Proceedings of Machine Learning Research},
  year = {2017},
  month = jul,
  volume = {70},
  pages = {2430--2439},
  publisher = {PMLR},
  url = {http://proceedings.mlr.press/v70/mirhoseini17a.html},
  issn = {2640-3498},
}
@article{Langer2020DistributedPerspective,
  author = {Langer, Matthias and He, Zhen and Rahayu, Wenny and Xue, Yanbo},
  title = {Distributed Training of Deep Learning Models: A Taxonomic Perspective},
  journal = {{IEEE} Transactions on Parallel and Distributed Systems},
  year = {2020},
  month = dec,
  volume = {31},
  number = {12},
  pages = {2802--2818},
  publisher = {IEEE Computer Society},
  doi = {10.1109/TPDS.2020.3003307},
  eprint = {2007.03970},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/abs/2007.03970v1},
  keywords = {Survey, big data, deep learning, distributed systems, machine learning, stochastic gradient descent},
}
@article{Ney1999DynamicRecognition,
  author = {Ney, Hermann J. and Ortmanns, Stefan},
  title = {Dynamic programming search for continuous speech recognition},
  journal = {{IEEE} Signal Processing Magazine},
  year = {1999},
  volume = {16},
  number = {5},
  pages = {64--83},
  publisher = {IEEE},
  doi = {10.1109/79.790984},
}
@misc{DelRio2021Earnings-21:Wild,
  author = {Del Rio, Miguel and Delworth, Natalie and Westerman, Ryan and Huang, Michelle and Bhandari, Nishchal and Palakapilly, Joseph and McNamara, Quinten and Dong, Joshua and Zelasko, Piotr and Jette, Miguel},
  title = {{Earnings-21}: A Practical Benchmark for {ASR} in the Wild},
  year = {2021},
  month = apr,
  eprint = {2104.11348},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/abs/2104.11348v3},
  keywords = {automatic speech recognition, dataset, earnings call, named entity recognition},
}
@misc{Tanaka2021End-to-EndLearning,
  author = {Tanaka, Tomohiro and Masumura, Ryo and Ihori, Mana and Takashima, Akihiko and Orihashi, Shota and Makishima, Naoki},
  title = {End-to-End Rich Transcription-Style Automatic Speech Recognition with Semi-Supervised Learning},
  year = {2021},
  month = jul,
  eprint = {2107.05382},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/abs/2107.05382v1},
  keywords = {automatic speech recognition, pseudo-labeling, rich transcription, semi-supervised learning},
}
@inproceedings{Watanabe2018ESPnet:Toolkit,
  author = {Watanabe, Shinji and Hori, Takaaki and Karita, Shigeki and Hayashi, Tomoki and Nishitoba, Jiro and Unno, Yuya and Soplin, Nelson Enrique Yalta and Heymann, Jahn and Wiesner, Matthew and Chen, Nanxin and Renduchintala, Adithya and Ochiai, Tsubasa},
  title = {{ESPnet}: End-to-End Speech Processing Toolkit},
  booktitle = {Proceedings of the Annual Conference of the International Speech Communication Association, {INTERSPEECH}},
  year = {2018},
  month = sep,
  pages = {2207--2211},
  publisher = {International Speech Communication Association},
  eprint = {1804.00015},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/abs/1804.00015v1},
  keywords = {Dynamical neural network, End-to-end, Kaldi, Open source software, Speech recognition},
}
@mastersthesis{Rouhe2017FinitePrompts,
  author = {Rouhe, Aku},
  title = {Finite state models for recognition and validation of read prompts},
  school = {Aalto University},
  address = {Finland},
  year = {2017},
  month = jul,
  keywords = {automatic speech recognition, language modeling, reading miscue, weighted finite state transducer},
  language = {English},
}
@inproceedings{Ciresan2011FlexibleClassification,
  author = {Ciresan, Dan C. and Meier, Ueli and Masci, Jonathan and Gambardella, Luca M. and Schmidhuber, J{\"{u}}rgen},
  title = {Flexible, High Performance Convolutional Neural Networks for Image Classification},
  booktitle = {Proceedings of the Twenty-Second International Joint Conference on Artificial Intelligence},
  year = {2011},
  keywords = {Machine Learning},
}
@inproceedings{Xiao2018Gandiva:Learning,
  author = {Xiao, Wencong and Bhardwaj, Romil and Ramjee, Ramachandran and Sivathanu, Muthian and Kwatra, Nipun and Han, Zhenhua and Patel, Pratyush and Peng, Xuan and Zhao, Hanyu and Zhang, Quanlu and Yang, Fan and Zhou, Lidong},
  title = {{Gandiva}: Introspective Cluster Scheduling for Deep Learning},
  booktitle = {Proceedings of the 13th {USENIX} Symposium on Operating Systems Design and Implementation},
  year = {2018},
  pages = {595--610},
  url = {https://www.usenix.org/conference/osdi18/presentation/xiao},
  isbn = {978-1-939133-08-3},
}
@inproceedings{Cui2016GeePS:Server,
  author = {Cui, Henggang and Zhang, Hao and Ganger, Gregory R. and Gibbons, Phillip B. and Xing, Eric P.},
  title = {{GeePS}: Scalable deep learning on distributed {GPUs} with a {GPU}-specialized parameter server},
  booktitle = {Proceedings of the Eleventh European Conference on Computer Systems},
  year = {2016},
  publisher = {ACM},
  address = {New York, NY, USA},
  doi = {10.1145/2901318.2901323},
}
@inproceedings{Aizman2019HighLearning,
  author = {Aizman, Alex and Maltby, Gavin and Breuel, Thomas},
  title = {High Performance {I/O} for Large Scale Deep Learning},
  booktitle = {Proceedings of the 2019 {IEEE} International Conference on Big Data (Big Data 2019)},
  year = {2019},
  month = dec,
  pages = {5965--5967},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  doi = {10.1109/BIGDATA47090.2019.9005703},
  keywords = {deep learning, performance, petascale, scale-out},
}
@misc{Niu2011HOGWILD:Descent,
  author = {Niu, Feng and Recht, Benjamin and R{\'{e}}, Christopher and Wright, Stephen J.},
  title = {{HOGWILD!}: A Lock-Free Approach to Parallelizing Stochastic Gradient Descent},
  year = {2011},
  eprint = {1106.5730},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/abs/1106.5730},
}
@inproceedings{Krizhevsky2012ImageNetNetworks,
  author = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E.},
  title = {{ImageNet} Classification with Deep Convolutional Neural Networks},
  booktitle = {Advances in Neural Information Processing Systems},
  year = {2012},
  volume = {25},
  pages = {1097--1105},
  url = {http://code.google.com/p/cuda-convnet/},
}
@inproceedings{Mai2020KungFu:Adaptive,
  author = {Mai, Luo and Li, Guo and Wagenl{\"{a}}nder, Marcel and Fertakis, Konstantinos and Brabete, Andrei-Octavian and Pietzuch, Peter},
  title = {{KungFu}: Making Training in Distributed Machine Learning Adaptive},
  booktitle = {14th {USENIX} Symposium on Operating Systems Design and Implementation},
  year = {2020},
  pages = {937--954},
  url = {https://www.usenix.org/conference/osdi20/presentation/mai},
  isbn = {978-1-939133-19-9},
}
@inproceedings{Dean2012LargeNetworks,
  author = {Dean, Jeffrey and Corrado, Greg S. and Monga, Rajat and Chen, Kai and Devin, Matthieu and Le, Quoc V. and Mao, Mark Z. and Ranzato, Marc'Aurelio and Senior, Andrew and Tucker, Paul and Yang, Ke and Ng, Andrew Y.},
  title = {Large Scale Distributed Deep Networks},
  booktitle = {Advances in Neural Information Processing Systems},
  year = {2012},
  volume = {25},
  pages = {1223--1231},
}
@inproceedings{Kannan2019Large-ScaleModel,
  author = {Kannan, Anjuli and Datta, Arindrima and Sainath, Tara N. and Weinstein, Eugene and Ramabhadran, Bhuvana and Wu, Yonghui and Bapna, Ankur and Chen, Zhifeng and Lee, Seungji},
  title = {Large-Scale Multilingual Speech Recognition with a Streaming End-to-End Model},
  booktitle = {Proceedings of the Annual Conference of the International Speech Communication Association, {INTERSPEECH}},
  year = {2019},
  month = sep,
  pages = {2130--2134},
  publisher = {International Speech Communication Association},
  eprint = {1909.05330},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/abs/1909.05330v1},
  keywords = {Multilingual, RNN-T, Residual adapter, Speech recognition},
}
@article{Gers2000LearningLSTM,
  author = {Gers, Felix A. and Schmidhuber, J{\"{u}}rgen and Cummins, Fred},
  title = {Learning to Forget: Continual Prediction with {LSTM}},
  journal = {Neural Computation},
  year = {2000},
  month = oct,
  volume = {12},
  number = {10},
  pages = {2451--2471},
  publisher = {MIT Press},
  doi = {10.1162/089976600300015015},
  url = {http://direct.mit.edu/neco/article-pdf/12/10/2451/814643/089976600300015015.pdf},
  issn = {0899-7667},
}
@inproceedings{Panayotov2015Librispeech:Books,
  author = {Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},
  title = {{Librispeech}: An {ASR} corpus based on public domain audio books},
  booktitle = {{IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  year = {2015},
  month = aug,
  pages = {5206--5210},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  doi = {10.1109/ICASSP.2015.7178964},
  keywords = {Corpus, LibriVox, Speech Recognition},
}
@inproceedings{Chan2016ListenRecognition,
  author = {Chan, William and Jaitly, Navdeep and Le, Quoc and Vinyals, Oriol},
  title = {Listen, attend and spell: A neural network for large vocabulary conversational speech recognition},
  booktitle = {{IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  year = {2016},
  month = may,
  pages = {4960--4964},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  doi = {10.1109/ICASSP.2016.7472621},
  keywords = {Recurrent neural network, end-to-end speech recognition, neural attention},
}
@article{Hochreiter1997LongMemory,
  author = {Hochreiter, Sepp and Schmidhuber, J{\"{u}}rgen},
  title = {Long Short-Term Memory},
  journal = {Neural Computation},
  year = {1997},
  month = nov,
  volume = {9},
  number = {8},
  pages = {1735--1780},
  publisher = {MIT Press},
  doi = {10.1162/neco.1997.9.8.1735},
  url = {http://direct.mit.edu/neco/article-pdf/9/8/1735/813796/neco.1997.9.8.1735.pdf},
  issn = {0899-7667},
}
@inproceedings{Pratap2020MLS:Research,
  author = {Pratap, Vineel and Xu, Qiantong and Sriram, Anuroop and Synnaeve, Gabriel and Collobert, Ronan},
  title = {{MLS}: A Large-Scale Multilingual Dataset for Speech Research},
  booktitle = {Proceedings of the Annual Conference of the International Speech Communication Association, {INTERSPEECH}},
  year = {2020},
  doi = {10.21437/Interspeech.2020-2826},
  eprint = {2012.03411},
  archiveprefix = {arXiv},
  url = {http://www.openslr.org},
}
@phdthesis{Enarvi2018ModelingRecognition,
  author = {Enarvi, Seppo},
  title = {Modeling Conversational {Finnish} for Automatic Speech Recognition},
  school = {Aalto University, School of Electrical Engineering},
  year = {2018},
  series = {Aalto University publication series DOCTORAL DISSERTATIONS; 52/2018},
  publisher = {Aalto University},
  pagetotal = {117 + app. 73},
  url = {http://urn.fi/URN:ISBN:978-952-60-7908-0},
  isbn = {978-952-60-7908-0},
  issn = {1799-4942},
  keywords = {artificial neural networks, automatic speech recognition, data collection, language modeling, word classes},
  language = {English},
}
@book{Nielsen2015NeuralLearning,
  author = {Nielsen, Michael A.},
  title = {Neural Networks and Deep Learning},
  publisher = {Determination Press},
  year = {2015},
  url = {http://neuralnetworksanddeeplearning.com},
}
@misc{Li2020OnRecognition,
  author = {Li, Jinyu and Wu, Yu and Gaur, Yashesh and Wang, Chengyi and Zhao, Rui and Liu, Shujie},
  title = {On the Comparison of Popular End-to-End Models for Large Scale Speech Recognition},
  year = {2020},
  eprint = {2005.14327},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/abs/2005.14327},
  keywords = {end-to-end, RNN-transducer, attention-based encoder-decoder, transformer},
}
@misc{Krizhevsky2014OneNetworks,
  author = {Krizhevsky, Alex},
  title = {One weird trick for parallelizing convolutional neural networks},
  year = {2014},
  month = apr,
  eprint = {1404.5997},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/abs/1404.5997v2},
}
@inproceedings{Peng2018Optimus:Clusters,
  author = {Peng, Yanghua and Bao, Yixin and Chen, Yangrui and Wu, Chuan and Guo, Chuanxiong},
  title = {{Optimus}: An Efficient Dynamic Resource Scheduler for Deep Learning Clusters},
  booktitle = {Proceedings of the 13th {EuroSys} Conference ({EuroSys} 2018)},
  year = {2018},
  month = apr,
  pages = {1--14},
  publisher = {Association for Computing Machinery},
  doi = {10.1145/3190508.3190517},
  keywords = {Resource management, deep learning},
}
@article{Deyringer2017ParallelizationHogwild,
  author = {Deyringer, Valentin and Fraser, Alexander and Schmid, Helmut and Okita, Tsuyoshi},
  title = {Parallelization of Neural Network Training for {NLP} with {Hogwild!}},
  journal = {The Prague Bulletin of Mathematical Linguistics},
  year = {2017},
  volume = {109},
  pages = {29--38},
  url = {http://github.com/valentindey/async-train},
}
@inproceedings{Zhang2017Poseidon:Clusters,
  author = {Zhang, Hao and Xu, Shizhen and Dai, Wei and Liang, Xiaodan and Hu, Zhiting and Wei, Jinliang and Xie, Pengtao and Zheng, Zeyu and Ho, Qirong and Xing, Eric P.},
  title = {{Poseidon}: An Efficient Communication Architecture for Distributed Deep Learning on {GPU} Clusters},
  booktitle = {2017 {USENIX} Annual Technical Conference},
  year = {2017},
  url = {https://www.usenix.org/conference/atc17/technical-sessions/presentation/zhang},
  isbn = {978-1-931971-38-6},
}
@inproceedings{Chilimbi2014ProjectSystem,
  author = {Chilimbi, Trishul and Suzue, Yutaka and Apacible, Johnson and Kalyanaraman, Karthik},
  title = {Project {Adam}: Building an Efficient and Scalable Deep Learning Training System},
  booktitle = {11th {USENIX} Symposium on Operating Systems Design and Implementation},
  year = {2014},
  pages = {571--582},
  url = {https://www.usenix.org/conference/osdi14/technical-sessions/presentation/chilimbi},
  isbn = {978-1-931971-16-4},
}
@article{Li2020PyTorchTraining,
  author = {Li, Shen and Zhao, Yanli and Varma, Rohan and Salpekar, Omkar and Noordhuis, Pieter and Li, Teng and Paszke, Adam and Smith, Jeff and Vaughan, Brian and Damania, Pritam and Chintala, Soumith},
  title = {{PyTorch} Distributed: Experiences on Accelerating Data Parallel Training},
  journal = {Proceedings of the {VLDB} Endowment},
  year = {2020},
  volume = {13},
  number = {12},
  pages = {3005--3018},
  doi = {10.14778/3415478.3415530},
  issn = {2150-8097},
}
@inproceedings{Paszke2019PyTorch:Library,
  author = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and Desmaison, Alban and K{\"{o}}pf, Andreas and Yang, Edward and DeVito, Zachary and Raison, Martin and Tejani, Alykhan and Chilamkurthy, Sasank and Steiner, Benoit and Fang, Lu and Bai, Junjie and Chintala, Soumith},
  title = {{PyTorch}: An Imperative Style, High-Performance Deep Learning Library},
  booktitle = {Advances in Neural Information Processing Systems},
  year = {2019},
  volume = {32},
}
@inproceedings{Deng2013RecentMicrosoft,
  author = {Deng, Li and Li, Jinyu and Huang, Jui-Ting and Yao, Kaisheng and Yu, Dong and Seide, Frank and Seltzer, Michael L. and Zweig, Geoff and He, Xiaodong and Williams, Jason and Gong, Yifan and Acero, Alex},
  title = {Recent Advances in Deep Learning for Speech Research at {Microsoft}},
  booktitle = {{IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  year = {2013},
  pages = {8604--8608},
  keywords = {deep learning, convolution, dialogue, multilingual, neural network, spectral features, speech recognition},
}
@inproceedings{Narayanan2019RecognizingModels,
  author = {Narayanan, Arun and Prabhavalkar, Rohit and Chiu, Chung-Cheng and Rybach, David and Sainath, Tara N. and Strohman, Trevor},
  title = {Recognizing long-form speech using streaming end-to-end models},
  booktitle = {2019 {IEEE} Automatic Speech Recognition and Understanding Workshop ({ASRU})},
  year = {2019},
  eprint = {1910.11455},
  archiveprefix = {arXiv},
  keywords = {speech recognition, end-to-end, long-form, rnnt, sequence-to-sequence},
}
@misc{Hagner2017RecurrentModel,
  author = {Hagner, Johan},
  title = {Recurrent Neural Networks for End-to-End Speech Recognition: A comparison of gated units in an acoustic model},
  howpublished = {Thesis, Ume{\aa} University},
  year = {2017},
  internal-note = {Venue inferred from the original journal value cs.umu.se; confirm thesis type and add a stable URL.},
}
@inproceedings{Mukhedkar2014RobustEnvironments,
  author = {Mukhedkar, Ajinkya Sunil and Alex, John Sahaya Rani},
  title = {Robust feature extraction methods for speech recognition in noisy environments},
  booktitle = {Proceedings of the 1st International Conference on Networks and Soft Computing ({ICNSC} 2014)},
  year = {2014},
  pages = {295--299},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  doi = {10.1109/CNSC.2014.6906692},
  keywords = {ASR, BFCC, Feature Extraction, HMM, MFCC, WMFCC},
}
@article{Mayer2020ScalableInfrastructures,
  author = {Mayer, Ruben and Jacobsen, Hans-Arno},
  title = {Scalable Deep Learning on Distributed Infrastructures},
  journal = {{ACM} Computing Surveys},
  year = {2020},
  month = feb,
  volume = {53},
  number = {1},
  publisher = {ACM},
  address = {New York, NY, USA},
  doi = {10.1145/3363554},
  keywords = {Deep learning systems},
}
@inproceedings{Cipar2013SolvingStaleness,
  author = {Cipar, James and Ho, Qirong and Kim, Jin Kyu and Lee, Seunghak and Ganger, Gregory R. and Gibson, Garth and Keeton, Kimberly and Xing, Eric},
  title = {Solving the Straggler Problem with Bounded Staleness},
  booktitle = {14th Workshop on Hot Topics in Operating Systems ({HotOS} {XIV})},
  year = {2013},
  url = {https://www.usenix.org/conference/hotos13/session/cipar},
}
@inproceedings{Yuan2008SpeakerCorpus,
  author = {Yuan, Jiahong and Liberman, Mark},
  title = {Speaker identification on the {SCOTUS} corpus},
  booktitle = {Proceedings of Acoustics '08},
  year = {2008},
  keywords = {Hidden Markov models, Speaker recognition, Speech analysis},
}
@inproceedings{Graves2013SpeechNetworks,
  author = {Graves, Alex and Mohamed, Abdel-rahman and Hinton, Geoffrey},
  title = {Speech recognition with deep recurrent neural networks},
  booktitle = {{IEEE} International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  year = {2013},
  month = oct,
  pages = {6645--6649},
  doi = {10.1109/ICASSP.2013.6638947},
  eprint = {1303.5778},
  archiveprefix = {arXiv},
  isbn = {978-1-4799-0356-6},
  issn = {1520-6149},
  keywords = {deep neural networks, recurrent neural networks, speech recognition},
}
@misc{Ravanelli2021SpeechBrain:Toolkit,
  author = {Ravanelli, Mirco and Parcollet, Titouan and Plantinga, Peter and Rouhe, Aku and Cornell, Samuele and Lugosch, Loren and Subakan, Cem and Dawalatabad, Nauman and Heba, Abdelwahab and Zhong, Jianyuan and Chou, Ju-Chieh and Yeh, Sung-Lin and Fu, Szu-Wei and Liao, Chien-Feng and Rastorgueva, Elena and Grondin, Fran{\c{c}}ois and Aris, William and Na, Hwidong and Gao, Yan and De Mori, Renato and Bengio, Yoshua},
  title = {{SpeechBrain}: A General-Purpose Speech Toolkit},
  year = {2021},
  eprint = {2106.04624},
  archiveprefix = {arXiv},
  url = {https://arxiv.org/abs/2106.04624},
}
@misc{Oneill2021SPGISpeech:Recognition,
  author = {O'Neill, Patrick K. and Lavrukhin, Vitaly and Majumdar, Somshubra and Noroozi, Vahid and Zhang, Yuekai and Kuchaiev, Oleksii and Balam, Jagadeesh and Dovzhenko, Yuliya and Freyberg, Keenan and Shulman, Michael D. and Ginsburg, Boris and Watanabe, Shinji and Kucsko, Georg},
  title = {{SPGISpeech}: 5,000 hours of transcribed financial audio for fully formatted end-to-end speech recognition},
  year = {2021},
  eprint = {2104.02014},
  archiveprefix = {arXiv},
  url = {https://datasets.kensho.com/datasets/scribe},
}
% ICASSP 2018 proceedings version (pages 4774--4778), so year/month follow the April 2018 proceedings.
% The arXiv preprint 1712.01769 dates from December 2017, which is why the citation key says 2017.
@inproceedings{Chiu2017State-of-the-artModels,
title = {State-of-the-art Speech Recognition With Sequence-to-Sequence Models},
year = {2018},
booktitle = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings},
author = {Chiu, Chung-Cheng and Sainath, Tara N. and Wu, Yonghui and Prabhavalkar, Rohit and Nguyen, Patrick and Chen, Zhifeng and Kannan, Anjuli and Weiss, Ron J. and Rao, Kanishka and Gonina, Ekaterina and Jaitly, Navdeep and Li, Bo and Chorowski, Jan and Bacchiani, Michiel},
month = apr,
pages = {4774--4778},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
url = {https://arxiv.org/abs/1712.01769v6},
eprint = {1712.01769},
archiveprefix = {arXiv}
}
% TensorFlow white paper (arXiv preprint); url is the project home page.
% NOTE(review): the arXiv identifier was added from the public arXiv record; confirm it matches the cited version.
@article{Abadi2016TensorFlow:Systems,
title = {{TensorFlow}: Large-Scale Machine Learning on Heterogeneous Distributed Systems},
year = {2016},
journal = {arXiv e-prints},
author = {Abadi, Mart{\'{i}}n and Agarwal, Ashish and Barham, Paul and Brevdo, Eugene and Chen, Zhifeng and Citro, Craig and Corrado, Greg S. and Davis, Andy and Dean, Jeffrey and Devin, Matthieu and Ghemawat, Sanjay and Goodfellow, Ian and Harp, Andrew and Irving, Geoffrey and Isard, Michael and Jia, Yangqing and Jozefowicz, Rafal and Kaiser, Lukasz and Kudlur, Manjunath and Levenberg, Josh and Man{\'{e}}, Dan and Monga, Rajat and Moore, Sherry and Murray, Derek and Olah, Chris and Schuster, Mike and Shlens, Jonathon and Steiner, Benoit and Sutskever, Ilya and Talwar, Kunal and Tucker, Paul and Vanhoucke, Vincent and Vasudevan, Vijay and Vi{\'{e}}gas, Fernanda and Vinyals, Oriol and Warden, Pete and Wattenberg, Martin and Wicke, Martin and Yu, Yuan and Zheng, Xiaoqiang},
url = {https://www.tensorflow.org/},
eprint = {1603.04467},
archiveprefix = {arXiv}
}
% The HTK Book is a software manual rather than a journal article.
% The issuing department is therefore recorded as the organization.
@manual{Young2002TheBook,
title = {The {HTK} Book},
year = {2002},
organization = {Cambridge University Engineering Department},
author = {Young, Steve and Evermann, Gunnar and Kershaw, Dan and Moore, Gareth and Odell, Julian and Ollason, Dave and Povey, Dan and Valtchev, Valtcho and Woodland, Phil}
}
% Kaldi toolkit paper from the IEEE ASRU 2011 workshop; url is the project site.
% Accented author names use LaTeX escapes so they sort and render consistently.
@inproceedings{Povey2011TheToolkit,
title = {The {Kaldi} Speech Recognition Toolkit},
year = {2011},
booktitle = {IEEE 2011 Workshop on Automatic Speech Recognition and Understanding},
author = {Povey, Daniel and Ghoshal, Arnab and Boulianne, Gilles and Burget, Luk{\'{a}}{\v{s}} and Glembek, Ond{\v{r}}ej and Goel, Nagendra and Hannemann, Mirko and Motl{\'{i}}{\v{c}}ek, Petr and Qian, Yanmin and Schwarz, Petr and Silovsk{\'{y}}, Jan and Stemmer, Georg and Vesel{\'{y}}, Karel},
url = {http://kaldi.sf.net/}
}
% The People's Speech dataset paper (NeurIPS 2021); url is the project repository.
% NOTE(review): author list reconstructed from a garbled export in which the affiliation "Factored AI"
% had been parsed as an author and one compound name was split in two; confirm against the published paper.
@inproceedings{Galvez2021TheUsage,
title = {The {People's Speech}: A Large-Scale Diverse {English} Speech Recognition Dataset for Commercial Usage},
year = {2021},
booktitle = {35th Conference on Neural Information Processing Systems (NeurIPS 2021)},
author = {Galvez, Daniel and Diamos, Greg and Ciro Torres, Juan Manuel and Cer{\'{o}}n, Juan Felipe and Achorn, Keith and Gopi, Anjali and Kanter, David and Lam, Maximilian and Mazumder, Mark and Reddi, Vijay Janapa},
url = {https://github.com/mlcommons/peoples-speech}
}
% Workshop paper (DIDL 2017, co-located with Middleware 2017), so the venue is stored in booktitle.
% The url duplicates the DOI resolver link and is kept for styles that print url but not doi.
@inproceedings{Mayer2017ThePath,
title = {The tensor flow partitioning and scheduling problem: It's the critical path!},
year = {2017},
booktitle = {DIDL 2017 - Proceedings of the 1st Workshop on Distributed Infrastructures for Deep Learning, Part of Middleware 2017},
author = {Mayer, Ruben and Mayer, Christian and Laich, Larissa},
month = dec,
pages = {1--6},
publisher = {Association for Computing Machinery, Inc},
url = {https://doi.org/10.1145/3154842.3154843},
doi = {10.1145/3154842.3154843},
keywords = {Critical path, Partitioning, Scheduling, Tensor flow}
}
% Hochreiter (1998): journal article on the vanishing gradient problem in recurrent networks.
@article{Hochreiter1998TheSolutions,
title = {The vanishing gradient problem during learning recurrent neural nets and problem solutions},
year = {1998},
journal = {International Journal of Uncertainty, Fuzziness and Knowledge-Based Systems},
author = {Hochreiter, Sepp},
number = {2},
pages = {107--116},
volume = {6},
publisher = {World Scientific Publishing Co. Pte Ltd},
doi = {10.1142/S0218488598000094},
keywords = {Long Short-Term Memory, Long-term dependencies, Recurrent neural nets, Vanishing gradient}
}
% Workshop paper (IEEE SLT 2018 proceedings), so the venue is stored in booktitle.
@inproceedings{Narayanan2019TowardTraining,
title = {Toward Domain-Invariant Speech Recognition via Large Scale Training},
year = {2019},
booktitle = {2018 IEEE Spoken Language Technology Workshop, SLT 2018 - Proceedings},
author = {Narayanan, Arun and Misra, Ananya and Sim, Khe Chai and Pundak, Golan and Tripathi, Anshuman and Elfeky, Mohamed and Haghani, Parisa and Strohman, Trevor and Bacchiani, Michiel},
pages = {441--447},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
doi = {10.1109/SLT.2018.8639610},
keywords = {codecs, domain robustness, multidomain model, noise robustness, speech recognition}
}
% Zhang, Chan and Jaitly (2017): ICASSP conference paper, so the venue is stored in booktitle.
@inproceedings{Zhang2017VeryRecognition,
title = {Very deep convolutional networks for end-to-end speech recognition},
year = {2017},
booktitle = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings},
author = {Zhang, Yu and Chan, William and Jaitly, Navdeep},
month = jun,
pages = {4845--4849},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
doi = {10.1109/ICASSP.2017.7953077},
keywords = {Automatic Speech Recognition, End-to-End Speech Recognition, Very Deep Convolutional Neural Networks}
}
% Vetterli and Herley (1992): journal article; url is the EPFL Infoscience record.
% NOTE(review): journal, volume, issue, pages and DOI were filled in from the published record; confirm before release.
@article{Vetterli1992WaveletsDesign,
title = {Wavelets and filter banks: Theory and design},
year = {1992},
journal = {IEEE Transactions on Signal Processing},
author = {Vetterli, Martin and Herley, Cormac},
month = sep,
number = {9},
pages = {2207--2232},
volume = {40},
url = {https://infoscience.epfl.ch/record/33904},
doi = {10.1109/78.157221}
}