sources.bib

% Li et al. (2021): streaming ASR model, presented at ICASSP (arXiv 2011.10798).
% Conference paper, so the venue lives in booktitle; "Index Terms-" PDF residue removed from keywords.
@inproceedings{Li2021AASR,
    author = {Li, Bo and Gulati, Anmol and Yu, Jiahui and Sainath, Tara N and Chiu, Chung-Cheng and Narayanan, Arun and Chang, Shuo-Yiin and Pang, Ruoming and He, Yanzhang and Qin, James and Han, Wei and Liang, Qiao and Zhang, Yu and Strohman, Trevor and Wu, Yonghui},
    title = {{A Better and Faster End-to-End Model for Streaming ASR}},
    booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
    year = {2021},
    arxivId = {2011.10798v2},
    keywords = {Conformer, RNN-T, cascaded encoders, latency}
}

% Valiant (1990): the bulk-synchronous parallel (BSP) bridging model, CACM 33(8).
% Issue 8 of the volume is the August issue, so month is 8 (the export had 1).
@article{Valiant1990AComputation,
    author = {Valiant, Leslie G.},
    title = {{A Bridging Model for Parallel Computation}},
    journal = {Communications of the ACM},
    year = {1990},
    month = {8},
    volume = {33},
    number = {8},
    pages = {103--111},
    doi = {10.1145/79173.79181},
    keywords = {Bulksynchronous, Design, parallel model}
}

% Prabhavalkar et al. (2017): comparison of sequence-to-sequence ASR models, Interspeech.
% Conference paper: venue moved to booktitle. The url only repeated the DOI through a
% resolver and the sole keyword was an export placeholder, so both are dropped.
@inproceedings{Prabhavalkar2017ARecognition,
    author = {Prabhavalkar, Rohit and Rao, Kanishka and Sainath, Tara N and Li, Bo and Johnson, Leif and Jaitly, Navdeep},
    title = {{A Comparison of Sequence-to-Sequence Models for Speech Recognition}},
    booktitle = {Interspeech},
    year = {2017},
    pages = {939--943},
    doi = {10.21437/Interspeech.2017-233}
}

% Hammer and Gersmann (2003): universal approximation capability of SVMs.
% The journal name had year/volume/issue ("2003 17:1") appended by the exporter; those
% values already live in year, volume and number.
@article{Hammer2003AMachines,
    author = {Hammer, Barbara and Gersmann, Kai},
    title = {{A Note on the Universal Approximation Capability of Support Vector Machines}},
    journal = {Neural Processing Letters},
    year = {2003},
    month = {2},
    volume = {17},
    number = {1},
    pages = {43--53},
    publisher = {Springer},
    doi = {10.1023/A:1022936519097},
    url = {https://link.springer.com/article/10.1023/A:1022936519097},
    issn = {1573-773X},
    keywords = {Artificial Intelligence, Complex Systems, Computational Intelligence}
}

% Robbins and Monro (1951): the original stochastic approximation paper.
@article{Robbins1951AMethod,
    author = {Robbins, Herbert and Monro, Sutton},
    title = {{A Stochastic Approximation Method}},
    journal = {The Annals of Mathematical Statistics},
    year = {1951},
    volume = {22},
    number = {3},
    pages = {400--407}
}

% Survey of big-data cloud computing security, IJCSSE 3(1), 2014.
% Cleaned scraper residue: "View project ..." text in the title, "Article in ..." prefix in
% the journal, and a url without a scheme.
% NOTE(review): the export listed four authors with "Saeed, Rashid" duplicated; the names
% are merged here into two people (E. S. A. Ahmed, R. A. Saeed) - confirm against the paper.
@article{Sayed2014ASecurity,
    author = {Ahmed, Elmustafa Sayed Ali and Saeed, Rashid A.},
    title = {{A Survey of Big Data Cloud Computing Security}},
    journal = {International Journal of Computer Science and Software Engineering (IJCSSE)},
    year = {2014},
    volume = {3},
    number = {1},
    url = {http://www.IJCSSE.org},
    issn = {2409-4285},
    keywords = {Big Data, Big Data Security, Cloud Computing, Cloud Providers, NAS, big data privacy}
}

% Mansikkaniemi (2010): acoustic and language model adaptation for mobile dictation.
% The "dipl ..." journal string is a repository collection label, not a journal; the record
% is a thesis held by Aalto University, so it is typed as a thesis with school set.
% NOTE(review): assumed to be a master's thesis ("dipl") - confirm the degree level.
% Author accent and the garbled Swedish keyword are written with LaTeX escapes.
@mastersthesis{Mansikkaniemi2010AcousticService,
    author = {Mansikkaniemi, Andr{\'{e}}},
    title = {{Acoustic model and language model adaptation for a mobile dictation service}},
    school = {Aalto University},
    year = {2010},
    url = {https://aaltodoc.aalto.fi:443/handle/123456789/3176},
    keywords = {acoustic model adaptation, adaptering av akustiska modeller, adaptering av spr{\aa}kmodeller, automatic speech recognition, automatisk taligenk{\"{a}}nning, language model adaptation, mobil diktering, mobile dictation}
}

% Vaswani et al. (2017): the Transformer paper, NeurIPS (NIPS) 2017.
% Removed the bogus author "Brain, Google" (an affiliation scraped as a name), escaped the
% non-ASCII given name, and typed the record as a conference paper.
% NOTE(review): volume 30 added to match the other NeurIPS records - confirm.
@inproceedings{Vaswani2017AttentionNeed,
    author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
    title = {{Attention Is All You Need}},
    booktitle = {Advances in neural information processing systems},
    year = {2017},
    volume = {30},
    pages = {5998--6008}
}

% Recht and Re (2012): arXiv preprint 1202.4184 on the noncommutative AM-GM inequality.
@article{Recht2012BeneathConsequences,
    author = {Recht, Benjamin and R{\'{e}}, Christopher},
    title = {{Beneath the valley of the noncommutative arithmetic-geometric mean inequality: conjectures, case-studies, and consequences}},
    journal = {arXiv preprint arXiv:1202.4184},
    year = {2012},
    arxivId = {1202.4184v1},
    keywords = {Inequalities, Matrix, Optimization, Positive definite matrices, Random matrices, Randomized algorithms, Stochastic gradient descent}
}

% Jia, Zaharia and Aiken (2018): arXiv preprint 1807.05358 on DNN parallelization strategies.
@article{Jia2018BeyondNetworks,
    author = {Jia, Zhihao and Zaharia, Matei and Aiken, Alex},
    title = {{Beyond Data and Model Parallelism for Deep Neural Networks}},
    journal = {arXiv preprint arXiv:1807.05358},
    year = {2018},
    month = {7},
    url = {https://arxiv.org/abs/1807.05358v1},
    arxivId = {1807.05358}
}

% Schuster and Paliwal (1997): bidirectional recurrent neural networks.
% Added the page range and DOI that the export was missing.
@article{Schuster1997BidirectionalNetworks,
    author = {Schuster, Mike and Paliwal, Kuldip K},
    title = {{Bidirectional Recurrent Neural Networks}},
    journal = {IEEE Transactions on Signal Processing},
    year = {1997},
    volume = {45},
    number = {11},
    pages = {2673--2681},
    doi = {10.1109/78.650093}
}

% Ardila et al. (2020): the Common Voice multilingual speech corpus, LREC 2020.
% Conference paper: venue moved to booktitle. The exported url was truncated to
% "https://voice." and is replaced by the arXiv abstract page built from the record's arXiv id.
@inproceedings{Ardila2020CommonCorpus,
    author = {Ardila, Rosana and Branson, Megan and Davis, Kelly and Henretty, Michael and Kohler, Michael and Meyer, Josh and Morais, Reuben and Saunders, Lindsay and Tyers, Francis M and Weber, Gregor},
    title = {{Common Voice: A Massively-Multilingual Speech Corpus}},
    booktitle = {Proceedings of the 12th Language Resources and Evaluation Conference},
    year = {2020},
    url = {https://arxiv.org/abs/1912.06670},
    arxivId = {1912.06670v2},
    keywords = {Automatic Speech Recognition, low-resource languages, spoken corpus}
}

% Deshmukh (2020): HMM vs. RNN comparison for ASR, volume 5, issue 8.
% The journal field held the PDF host (pdfs.semanticscholar.org).
% NOTE(review): journal name is inferred from the "ejers" DOI suffix - confirm.
% Also: spaced the author's initials and removed PDF residue from the keywords
% ("Net-work" hyphenation, "Index Terms-" prefix).
@article{Deshmukh2020ComparisonRecognition,
    author = {Deshmukh, A. M.},
    title = {{Comparison of Hidden Markov Model and Recurrent Neural Network in Automatic Speech Recognition}},
    journal = {European Journal of Engineering Research and Science},
    year = {2020},
    volume = {5},
    number = {8},
    doi = {10.24018/ejers.2020.5.8.2077},
    url = {https://pdfs.semanticscholar.org/8975/3a65e437d68302602a143e51d5c041616b36.pdf},
    keywords = {Automatic Speech Recognition, Deep Neural Network, Gaussian Mixture Model, Hidden Markov Model, Recurrent Neural Network}
}

% Gulati et al. (2020): the Conformer architecture, Interspeech 2020.
% Conference paper: venue moved to booktitle; the indexer pseudo-volume "2020-October"
% is dropped because it is not a real volume number.
% NOTE(review): month 5 appears to be the arXiv submission month, not the conference month.
@inproceedings{Gulati2020Conformer:Recognition,
    author = {Gulati, Anmol and Qin, James and Chiu, Chung-Cheng and Parmar, Niki and Zhang, Yu and Yu, Jiahui and Han, Wei and Wang, Shibo and Zhang, Zhengdong and Wu, Yonghui and Pang, Ruoming},
    title = {{Conformer: Convolution-augmented Transformer for Speech Recognition}},
    booktitle = {Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH},
    year = {2020},
    month = {5},
    pages = {5036--5040},
    publisher = {International Speech Communication Association},
    url = {https://arxiv.org/abs/2005.08100v1},
    arxivId = {2005.08100},
    keywords = {Attention, Convolutional neural networks, End-to-end, Speech recognition, Transformer}
}

% DUPLICATE RECORD: same Conformer paper (Interspeech 2020) as the entry keyed
% Gulati2020Conformer:Recognition. The key is retained so existing citations keep
% resolving; citing both keys yields two identical bibliography items, so consolidate
% the citations onto one key and then delete this entry.
% Conference paper: venue moved to booktitle; pseudo-volume "2020-October" dropped.
@inproceedings{Gulati2020Conformer:Recognitionb,
    author = {Gulati, Anmol and Qin, James and Chiu, Chung-Cheng and Parmar, Niki and Zhang, Yu and Yu, Jiahui and Han, Wei and Wang, Shibo and Zhang, Zhengdong and Wu, Yonghui and Pang, Ruoming},
    title = {{Conformer: Convolution-augmented Transformer for Speech Recognition}},
    booktitle = {Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH},
    year = {2020},
    month = {5},
    pages = {5036--5040},
    publisher = {International Speech Communication Association},
    url = {https://arxiv.org/abs/2005.08100v1},
    arxivId = {2005.08100},
    keywords = {Attention, Convolutional neural networks, End-to-end, Speech recognition, Transformer}
}

% Graves et al. (2006): Connectionist Temporal Classification (CTC), ICML 2006.
% Conference paper: venue moved to booktitle. The raw non-ASCII given name is written
% with a LaTeX escape, matching the other accented name in the same field.
@inproceedings{Graves2006ConnectionistNetworks,
    author = {Graves, Alex and Fern{\'{a}}ndez, Santiago and Gomez, Faustino and Schmidhuber, J{\"{u}}rgen},
    title = {{Connectionist Temporal Classification: Labelling Unsegmented Sequence Data with Recurrent Neural Networks}},
    booktitle = {Proceedings of the 23rd international conference on Machine learning},
    year = {2006},
    pages = {369--376}
}

% Dahl et al. (2012): context-dependent pre-trained DNN-HMMs for LVCSR.
% Dropped the url, which was only the publisher's home page with a stray trailing period
% (the DOI identifies the paper), and added the missing page range.
@article{Dahl2012Context-DependentRecognition,
    author = {Dahl, George E and Yu, Dong and Deng, Li and Acero, Alex},
    title = {{Context-Dependent Pre-Trained Deep Neural Networks for Large-Vocabulary Speech Recognition}},
    journal = {IEEE Transactions on Audio, Speech, and Language Processing},
    year = {2012},
    volume = {20},
    number = {1},
    pages = {30--42},
    doi = {10.1109/TASL.2011.2134090}
}

% Koliousis et al. (2019): CROSSBOW multi-GPU training system (arXiv 1901.02244).
% The journal field held the host name "arxiv.org"; it now uses the same
% "arXiv preprint arXiv:<id>" form as the other preprints in this file.
% The stray trailing period on the repository url is removed.
@article{Koliousis2019CROSSBOW:Servers,
    author = {Koliousis, Alexandros and Weidlich, Matthias and Mai, Luo and Costa, Paolo and Pietzuch, Peter},
    title = {{CROSSBOW: Scaling Deep Learning with Small Batch Sizes on Multi-GPU Servers}},
    journal = {arXiv preprint arXiv:1901.02244},
    year = {2019},
    url = {https://github.com/lsds/Crossbow},
    arxivId = {1901.02244v1}
}

% Morgan (2012): overview of multi-layer (deep and wide) approaches in ASR.
% The pages field held only the first page; the full range is given.
@article{Morgan2012DeepRecognition,
    author = {Morgan, Nelson},
    title = {{Deep and Wide: Multiple Layers in Automatic Speech Recognition}},
    journal = {IEEE Transactions on Audio, Speech, and Language Processing},
    year = {2012},
    volume = {20},
    number = {1},
    pages = {7--13},
    doi = {10.1109/TASL.2011.2116010}
}

% Hestness et al. (2017): empirical scaling laws for deep learning (arXiv 1712.00409).
% The author "Md. Mostofa Ali Patwary" was split with the middle names inside the
% surname; it is re-split so the surname is Patwary.
@article{Hestness2017DeepEmpirically,
    author = {Hestness, Joel and Narang, Sharan and Ardalani, Newsha and Diamos, Gregory and Jun, Heewoo and Kianinejad, Hassan and Patwary, Md. Mostofa Ali and Yang, Yang and Zhou, Yanqi},
    title = {{Deep Learning Scaling is Predictable, Empirically}},
    journal = {arXiv preprint arXiv:1712.00409},
    year = {2017},
    arxivId = {1712.00409v1}
}

% Hannun et al. (2014): Deep Speech, arXiv preprint 1412.5567.
@article{Hannun2014DeepRecognition,
    author = {Hannun, Awni and Case, Carl and Casper, Jared and Catanzaro, Bryan and Diamos, Greg and Elsen, Erich and Prenger, Ryan and Satheesh, Sanjeev and Sengupta, Shubho and Coates, Adam and Ng, Andrew Y.},
    title = {{Deep Speech: Scaling up end-to-end speech recognition}},
    journal = {arXiv preprint arXiv:1412.5567},
    year = {2014},
    month = {12},
    url = {http://arxiv.org/abs/1412.5567},
    arxivId = {1412.5567}
}

% Mirhoseini et al. (2017): RL-based device placement, ICML 2017 (PMLR).
% Conference paper: venue moved to booktitle.
@inproceedings{Mirhoseini2017DeviceLearning,
    author = {Mirhoseini, Azalia and Pham, Hieu and Le, Quoc V. and Steiner, Benoit and Larsen, Rasmus and Zhou, Yuefeng and Kumar, Naveen and Norouzi, Mohammad and Bengio, Samy and Dean, Jeff},
    title = {{Device Placement Optimization with Reinforcement Learning}},
    booktitle = {International Conference on Machine Learning},
    year = {2017},
    month = {7},
    pages = {2430--2439},
    publisher = {PMLR},
    url = {http://proceedings.mlr.press/v70/mirhoseini17a.html},
    issn = {2640-3498}
}

% Langer et al. (2020): taxonomy of distributed deep learning training, IEEE TPDS 31(12).
@article{Langer2020DistributedPerspective,
    author = {Langer, Matthias and He, Zhen and Rahayu, Wenny and Xue, Yanbo},
    title = {{Distributed Training of Deep Learning Models: A Taxonomic Perspective}},
    journal = {IEEE Transactions on Parallel and Distributed Systems},
    year = {2020},
    month = {7},
    volume = {31},
    number = {12},
    pages = {2802--2818},
    publisher = {IEEE Computer Society},
    doi = {10.1109/tpds.2020.3003307},
    url = {https://arxiv.org/abs/2007.03970v1},
    arxivId = {2007.03970},
    keywords = {Survey, big data, deep learning, distributed systems, machine learning, stochastic gradient descent}
}

% Ney and Ortmanns (1999): dynamic programming search for continuous speech recognition.
@article{Ney1999DynamicRecognition,
    author = {Ney, Hermann J. and Ortmanns, Stefan},
    title = {{Dynamic programming search for continuous speech recognition}},
    journal = {IEEE Signal Processing Magazine},
    year = {1999},
    volume = {16},
    number = {5},
    pages = {64--83},
    publisher = {IEEE},
    doi = {10.1109/79.790984}
}

% Del Rio et al. (2021): Earnings-21 ASR benchmark (arXiv 2104.11348).
% Journal string normalised to the "arXiv preprint arXiv:<id>" form used by the other
% preprints in this file; "Index Terms:" PDF residue removed from keywords.
@article{DelRio2021Earnings-21:Wild,
    author = {Del Rio, Miguel and Delworth, Natalie and Westerman, Ryan and Huang, Michelle and Bhandari, Nishchal and Palakapilly, Joseph and McNamara, Quinten and Dong, Joshua and Zelasko, Piotr and Jette, Miguel},
    title = {{Earnings-21: A Practical Benchmark for ASR in the Wild}},
    journal = {arXiv preprint arXiv:2104.11348},
    year = {2021},
    month = {4},
    url = {https://arxiv.org/abs/2104.11348v3},
    arxivId = {2104.11348},
    keywords = {automatic speech recognition, dataset, earnings call, named entity recognition}
}

% Tanaka et al. (2021): rich-transcription-style end-to-end ASR (arXiv 2107.05382).
% The journal field held the host name "arxiv.org"; it now uses the file's
% "arXiv preprint arXiv:<id>" form. "Index Terms:" residue removed from keywords.
@article{Tanaka2021End-to-EndLearning,
    author = {Tanaka, Tomohiro and Masumura, Ryo and Ihori, Mana and Takashima, Akihiko and Orihashi, Shota and Makishima, Naoki},
    title = {{End-to-End Rich Transcription-Style Automatic Speech Recognition with Semi-Supervised Learning}},
    journal = {arXiv preprint arXiv:2107.05382},
    year = {2021},
    month = {7},
    url = {https://arxiv.org/abs/2107.05382v1},
    arxivId = {2107.05382},
    keywords = {automatic speech recognition, pseudo-labeling, rich transcription, semi-supervised learning}
}

% Watanabe et al. (2018): the ESPnet end-to-end speech processing toolkit, Interspeech 2018.
% Conference paper: venue moved to booktitle; the indexer pseudo-volume "2018-September"
% is dropped because it is not a real volume number.
@inproceedings{Watanabe2018ESPnet:Toolkit,
    author = {Watanabe, Shinji and Hori, Takaaki and Karita, Shigeki and Hayashi, Tomoki and Nishitoba, Jiro and Unno, Yuya and Soplin, Nelson Enrique Yalta and Heymann, Jahn and Wiesner, Matthew and Chen, Nanxin and Renduchintala, Adithya and Ochiai, Tsubasa},
    title = {{ESPnet: End-to-End Speech Processing Toolkit}},
    booktitle = {Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH},
    year = {2018},
    month = {3},
    pages = {2207--2211},
    publisher = {International Speech Communication Association},
    url = {https://arxiv.org/abs/1804.00015v1},
    arxivId = {1804.00015},
    keywords = {Dynamical neural network, End-to-end, Kaldi, Open source software, Speech recognition}
}

% Rouhe (2017): finite state models for recognising and validating read prompts.
% The "dipl ..." journal string is a repository collection label, not a journal; the record
% is typed as a thesis, with the university (exported as publisher "AALTO-YLIOPISTO",
% i.e. Aalto University) given as school.
% NOTE(review): assumed to be a master's thesis ("dipl") - confirm the degree level.
@mastersthesis{Rouhe2017FinitePrompts,
    author = {Rouhe, Aku},
    title = {{Finite state models for recognition and validation of read prompts}},
    school = {Aalto University},
    year = {2017},
    month = {7},
    address = {Finland},
    keywords = {automatic speech recognition, language modeling, reading miscue, weighted finite state transducer},
    language = {English}
}

% Ciresan et al. (2011): GPU-based convolutional networks for image classification, IJCAI 2011.
% The last author was garbled by PDF extraction ("Urgen Schmidhuber, Jurgen", from a
% detached umlaut); restored as Schmidhuber, Juergen with a LaTeX escape.
% Conference paper: venue moved to booktitle.
@inproceedings{Ciresan2011FlexibleClassification,
    author = {Ciresan, Dan C. and Meier, Ueli and Masci, Jonathan and Gambardella, Luca M. and Schmidhuber, J{\"{u}}rgen},
    title = {{Flexible, High Performance Convolutional Neural Networks for Image Classification}},
    booktitle = {Twenty-second international joint conference on artificial intelligence},
    year = {2011},
    keywords = {Machine Learning}
}

% Xiao et al. (2018): Gandiva cluster scheduler for deep learning, OSDI 2018.
% Removed the empty number field (triggers empty-field warnings), typed the record as a
% conference paper, and corrected the url, which pointed at a different OSDI'18 presentation
% page (".../klimovic") rather than this paper's.
@inproceedings{Xiao2018Gandiva:Learning,
    author = {Xiao, Wencong and Bhardwaj, Romil and Ramjee, Ramachandran and Sivathanu, Muthian and Kwatra, Nipun and Han, Zhenhua and Patel, Pratyush and Peng, Xuan and Zhao, Hanyu and Zhang, Quanlu and Yang, Fan and Zhou, Lidong},
    title = {{Gandiva: Introspective Cluster Scheduling for Deep Learning}},
    booktitle = {Proceedings of the 13th USENIX Symposium on Operating Systems Design and Implementation},
    year = {2018},
    pages = {595--610},
    url = {https://www.usenix.org/conference/osdi18/presentation/xiao},
    isbn = {978-1-939133-08-3}
}

% Cui et al. (2016): GeePS GPU-specialised parameter server, EuroSys 2016.
% The doi field held the DOI of the whole proceedings (10.1145/2901318); the paper's own
% DOI was only present inside the resolver url. The doi now names the paper and the
% redundant resolver url is dropped. Conference paper: venue moved to booktitle.
@inproceedings{Cui2016GeePS:Server,
    author = {Cui, Henggang and Zhang, Hao and Ganger, Gregory R and Gibbons, Phillip B and Xing, Eric P},
    title = {{GeePS: Scalable deep learning on distributed GPUs with a GPU-specialized parameter server}},
    booktitle = {Proceedings of the Eleventh European Conference on Computer Systems},
    year = {2016},
    publisher = {ACM},
    address = {New York, NY, USA},
    doi = {10.1145/2901318.2901323}
}

% Aizman, Maltby and Breuel (2019): high-performance I/O for large-scale deep learning,
% IEEE Big Data 2019. Conference paper: venue moved to booktitle.
@inproceedings{Aizman2019HighLearning,
    author = {Aizman, Alex and Maltby, Gavin and Breuel, Thomas},
    title = {{High Performance I/O for Large Scale Deep Learning}},
    booktitle = {Proceedings - 2019 IEEE International Conference on Big Data, Big Data 2019},
    year = {2019},
    month = {12},
    pages = {5965--5967},
    publisher = {Institute of Electrical and Electronics Engineers Inc.},
    doi = {10.1109/BIGDATA47090.2019.9005703},
    keywords = {deep learning, performance, petascale, scale-out}
}

% Niu et al. (2011): HOGWILD! lock-free parallel SGD, arXiv preprint 1106.5730.
@article{Niu2011HOGWILD:Descent,
    author = {Niu, Feng and Recht, Benjamin and R{\'{e}}, Christopher and Wright, Stephen J},
    title = {{HOGWILD!: A Lock-Free Approach to Parallelizing Stochastic Gradient Descent}},
    journal = {arXiv preprint arXiv:1106.5730},
    year = {2011}
}

% Krizhevsky, Sutskever and Hinton (2012): AlexNet, NeurIPS (NIPS) 2012, volume 25.
% Conference paper: venue moved to booktitle.
@inproceedings{Krizhevsky2012ImageNetNetworks,
    author = {Krizhevsky, Alex and Sutskever, Ilya and Hinton, Geoffrey E},
    title = {{ImageNet Classification with Deep Convolutional Neural Networks}},
    booktitle = {Advances in neural information processing systems},
    year = {2012},
    volume = {25},
    pages = {1097--1105},
    url = {http://code.google.com/p/cuda-convnet/}
}

% Mai et al. (2020): KungFu adaptive distributed training, OSDI 2020.
% Removed the empty volume field (triggers empty-field warnings), added the missing
% scheme to the url, and typed the record as a conference paper.
@inproceedings{Mai2020KungFu:Adaptive,
    author = {Mai, Luo and Li, Guo and Wagenl{\"{a}}nder, Marcel and Fertakis, Konstantinos and Brabete, Andrei-Octavian and Pietzuch, Peter},
    title = {{KungFu: Making Training in Distributed Machine Learning Adaptive}},
    booktitle = {14th USENIX Symposium on Operating Systems Design and Implementation},
    year = {2020},
    pages = {937--954},
    url = {https://www.usenix.org/conference/osdi20/presentation/mai},
    isbn = {978-1-939133-19-9}
}

% Dean et al. (2012): DistBelief / large-scale distributed deep networks, NeurIPS (NIPS) 2012.
% Conference paper: venue moved to booktitle. Ranzato's given name was truncated to
% "Aurelio" by the exporter; restored to Marc'Aurelio.
@inproceedings{Dean2012LargeNetworks,
    author = {Dean, Jeffrey and Corrado, Greg S and Monga, Rajat and Chen, Kai and Devin, Matthieu and Le, Quoc V and Mao, Mark Z and Ranzato, Marc'Aurelio and Senior, Andrew and Tucker, Paul and Yang, Ke and Ng, Andrew Y},
    title = {{Large Scale Distributed Deep Networks}},
    booktitle = {Advances in neural information processing systems},
    year = {2012},
    volume = {25},
    pages = {1223--1231}
}

% Kannan et al. (2019): streaming multilingual end-to-end ASR, Interspeech 2019.
% Conference paper: venue moved to booktitle; the indexer pseudo-volume "2019-September"
% is dropped because it is not a real volume number.
@inproceedings{Kannan2019Large-ScaleModel,
    author = {Kannan, Anjuli and Datta, Arindrima and Sainath, Tara N. and Weinstein, Eugene and Ramabhadran, Bhuvana and Wu, Yonghui and Bapna, Ankur and Chen, Zhifeng and Lee, Seungji},
    title = {{Large-Scale Multilingual Speech Recognition with a Streaming End-to-End Model}},
    booktitle = {Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH},
    year = {2019},
    month = {9},
    pages = {2130--2134},
    publisher = {International Speech Communication Association},
    url = {https://arxiv.org/abs/1909.05330v1},
    arxivId = {1909.05330},
    keywords = {Multilingual, RNN-T, Residual adapter, Speech recognition}
}

% Gers, Schmidhuber and Cummins (2000): LSTM with forget gates, Neural Computation 12(10).
% The raw non-ASCII given name is written with a LaTeX escape, as other accented names in
% this file are, so the entry also sorts and renders correctly under 8-bit BibTeX.
@article{Gers2000LearningLSTM,
    author = {Gers, Felix A. and Schmidhuber, J{\"{u}}rgen and Cummins, Fred},
    title = {{Learning to Forget: Continual Prediction with LSTM}},
    journal = {Neural Computation},
    year = {2000},
    month = {10},
    volume = {12},
    number = {10},
    pages = {2451--2471},
    publisher = {MIT Press},
    doi = {10.1162/089976600300015015},
    url = {http://direct.mit.edu/neco/article-pdf/12/10/2451/814643/089976600300015015.pdf},
    issn = {0899-7667}
}

% Panayotov et al. (2015): the LibriSpeech corpus, ICASSP 2015.
% Conference paper: venue moved to booktitle; the indexer pseudo-volume "2015-August"
% is dropped because it is not a real volume number.
@inproceedings{Panayotov2015Librispeech:Books,
    author = {Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},
    title = {{Librispeech: An ASR corpus based on public domain audio books}},
    booktitle = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings},
    year = {2015},
    month = {8},
    pages = {5206--5210},
    publisher = {Institute of Electrical and Electronics Engineers Inc.},
    doi = {10.1109/ICASSP.2015.7178964},
    keywords = {Corpus, LibriVox, Speech Recognition}
}

% Chan et al. (2016): Listen, Attend and Spell (LAS), ICASSP 2016.
% Conference paper: venue moved to booktitle; the indexer pseudo-volume "2016-May"
% is dropped because it is not a real volume number.
@inproceedings{Chan2016ListenRecognition,
    author = {Chan, William and Jaitly, Navdeep and Le, Quoc and Vinyals, Oriol},
    title = {{Listen, attend and spell: A neural network for large vocabulary conversational speech recognition}},
    booktitle = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings},
    year = {2016},
    month = {5},
    pages = {4960--4964},
    publisher = {Institute of Electrical and Electronics Engineers Inc.},
    doi = {10.1109/ICASSP.2016.7472621},
    keywords = {Recurrent neural network, end-to-end speech recognition, neural attention}
}

% Hochreiter and Schmidhuber (1997): the original LSTM paper, Neural Computation 9(8).
% The raw non-ASCII given name is written with a LaTeX escape, as other accented names in
% this file are, so the entry also sorts and renders correctly under 8-bit BibTeX.
@article{Hochreiter1997LongMemory,
    author = {Hochreiter, Sepp and Schmidhuber, J{\"{u}}rgen},
    title = {{Long Short-Term Memory}},
    journal = {Neural Computation},
    year = {1997},
    month = {11},
    volume = {9},
    number = {8},
    pages = {1735--1780},
    publisher = {MIT Press},
    doi = {10.1162/NECO.1997.9.8.1735},
    url = {http://direct.mit.edu/neco/article-pdf/9/8/1735/813796/neco.1997.9.8.1735.pdf},
    issn = {0899-7667}
}

% Pratap et al. (2020): Multilingual LibriSpeech (MLS) dataset.
% Removed the stray trailing period from the url and the export placeholder keyword.
% NOTE(review): the journal names the arXiv preprint while the DOI has an Interspeech
% prefix - decide which version to cite and align the fields.
@article{Pratap2020MLS:Research,
    author = {Pratap, Vineel and Xu, Qiantong and Sriram, Anuroop and Synnaeve, Gabriel and Collobert, Ronan},
    title = {{MLS: A Large-Scale Multilingual Dataset for Speech Research}},
    journal = {arXiv preprint arXiv:2012.03411},
    year = {2020},
    doi = {10.21437/Interspeech.2020-2826},
    url = {http://www.openslr.org}
}

% Enarvi (2018): doctoral dissertation on modelling conversational Finnish for ASR
% (the series field names the Aalto doctoral dissertations series).
% The "dipl ..." journal string is a repository collection label, not a journal; the record
% is typed as a PhD thesis, and the exported publisher and institution are combined into
% the school field that this entry type requires.
@phdthesis{Enarvi2018ModelingRecognition,
    author = {Enarvi, Seppo},
    title = {{Modeling Conversational Finnish for Automatic Speech Recognition}},
    school = {Aalto University, School of Electrical Engineering},
    year = {2018},
    series = {Aalto University publication series DOCTORAL DISSERTATIONS; 52/2018},
    pages = {117 + app. 73},
    publisher = {Aalto University},
    url = {http://urn.fi/URN:ISBN:978-952-60-7908-0},
    isbn = {978-952-60-7908-0 (electronic), 978-952-60-7907-3 (printed)},
    issn = {1799-4942 (electronic), 1799-4934 (printed), 1799-4934 (ISSN-L)},
    keywords = {artificial neural networks, automatic speech recognition, data collection, language modeling, word classes},
    language = {English}
}

% Nielsen (2015): "Neural Networks and Deep Learning", an online book (Determination Press).
% The journal field merely repeated the site address; the record has a publisher and no
% journal, so it is typed as a book with the address kept in url.
@book{Nielsen2015NeuralLearning,
    author = {Nielsen, Michael A.},
    title = {{Neural Networks and Deep Learning}},
    publisher = {Determination Press},
    year = {2015},
    url = {http://neuralnetworksanddeeplearning.com}
}

% Li et al. (2020): comparison of end-to-end ASR models at scale (arXiv 2005.14327).
% "Index Terms:" PDF residue removed from the first keyword.
@article{Li2020OnRecognition,
    author = {Li, Jinyu and Wu, Yu and Gaur, Yashesh and Wang, Chengyi and Zhao, Rui and Liu, Shujie},
    title = {{On the Comparison of Popular End-to-End Models for Large Scale Speech Recognition}},
    journal = {arXiv preprint arXiv:2005.14327},
    year = {2020},
    arxivId = {2005.14327v2},
    keywords = {end-to-end, RNN-transducer, attention-based encoder-decoder, transformer}
}

% Krizhevsky (2014): parallelising convolutional network training, arXiv preprint 1404.5997.
@article{Krizhevsky2014OneNetworks,
    author = {Krizhevsky, Alex},
    title = {{One weird trick for parallelizing convolutional neural networks}},
    journal = {arXiv preprint arXiv:1404.5997},
    year = {2014},
    month = {4},
    url = {https://arxiv.org/abs/1404.5997v2},
    arxivId = {1404.5997}
}

% Peng et al. (2018): Optimus resource scheduler for deep learning clusters, EuroSys 2018.
% Conference paper: venue moved to booktitle; indexer pseudo-volume "2018-January" dropped;
% resolver url dropped since it only repeats the DOI; keywords use the file's comma
% separator instead of a semicolon.
% NOTE(review): the export gave pages as "14", which looks like a page count; it is
% written here as the range 1--14 - confirm against the publisher page.
@inproceedings{Peng2018Optimus:Clusters,
    author = {Peng, Yanghua and Bao, Yixin and Chen, Yangrui and Wu, Chuan and Guo, Chuanxiong},
    title = {{Optimus: An Efficient Dynamic Resource Scheduler for Deep Learning Clusters}},
    booktitle = {Proceedings of the 13th EuroSys Conference, EuroSys 2018},
    year = {2018},
    month = {4},
    pages = {1--14},
    publisher = {Association for Computing Machinery, Inc},
    doi = {10.1145/3190508.3190517},
    keywords = {Resource management, deep learning}
}

% Deyringer et al. (2017): Hogwild!-style parallel neural network training for NLP.
% Journal name expanded from the hard-coded abbreviation "Prague Bull. Math. Linguistics";
% stray trailing period removed from the repository url.
@article{Deyringer2017ParallelizationHogwild,
    author = {Deyringer, Valentin and Fraser, Alexander and Schmid, Helmut and Okita, Tsuyoshi},
    title = {{Parallelization of Neural Network Training for NLP with Hogwild!}},
    journal = {The Prague Bulletin of Mathematical Linguistics},
    year = {2017},
    volume = {109},
    pages = {29--38},
    url = {http://github.com/valentindey/async-train}
}

% Zhang et al. (2017): Poseidon communication architecture for distributed deep learning,
% USENIX ATC 2017. Conference paper: venue moved to booktitle.
@inproceedings{Zhang2017Poseidon:Clusters,
    author = {Zhang, Hao and Xu, Shizhen and Dai, Wei and Liang, Xiaodan and Hu, Zhiting and Wei, Jinliang and Xie, Pengtao and Zheng, Zeyu and Ho, Qirong and Xing, Eric P},
    title = {{Poseidon: An Efficient Communication Architecture for Distributed Deep Learning on GPU Clusters}},
    booktitle = {2017 USENIX Annual Technical Conference},
    year = {2017},
    url = {https://www.usenix.org/conference/atc17/technical-sessions/presentation/zhang},
    isbn = {978-1-931971-38-6}
}

% Chilimbi et al. (2014): Project Adam distributed deep learning training system.
% NOTE(review): the page range and ISBN in this record belong to the OSDI 2014 proceedings,
% so the venue is set to OSDI instead of the exported "Big Learning Workshop", and the url
% (which pointed at an unrelated 2013 PDF) to the USENIX presentation page - confirm both.
@inproceedings{Chilimbi2014ProjectSystem,
    author = {Chilimbi, Trishul and Suzue, Yutaka and Apacible, Johnson and Kalyanaraman, Karthik},
    title = {{Project Adam: Building an Efficient and Scalable Deep Learning Training System}},
    booktitle = {11th USENIX Symposium on Operating Systems Design and Implementation (OSDI 14)},
    year = {2014},
    pages = {571--582},
    url = {https://www.usenix.org/conference/osdi14/technical-sessions/presentation/chilimbi},
    isbn = {978-1-931971-16-4}
}

% Li et al. (2020): PyTorch DistributedDataParallel experience paper, PVLDB 13(12).
% Fixes to scraper damage:
%   - journal held a hyphenated copy of the title plus "PVLDB"; replaced by the journal name;
%   - "No-ordhuis, Pieter" was a line-break duplicate of Noordhuis, who is already listed;
%   - pages held the journal ISSN (2150-8097); moved to issn and replaced by the page range;
%   - resolver url dropped since it only repeats the DOI.
@article{Li2020PyTorchTraining,
    author = {Li, Shen and Zhao, Yanli and Varma, Rohan and Salpekar, Omkar and Noordhuis, Pieter and Li, Teng and Paszke, Adam and Smith, Jeff and Vaughan, Brian and Damania, Pritam and Chintala, Soumith},
    title = {{PyTorch Distributed: Experiences on Accelerating Data Parallel Training}},
    journal = {Proceedings of the VLDB Endowment},
    year = {2020},
    volume = {13},
    number = {12},
    pages = {3005--3018},
    doi = {10.14778/3415478.3415530},
    issn = {2150-8097}
}

% Paszke et al. (2019): the PyTorch library paper, NeurIPS 2019, volume 32.
% Conference paper: venue moved to booktitle.
@inproceedings{Paszke2019PyTorch:Library,
    author = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and Desmaison, Alban and Kopf, Andreas and Yang, Edward and DeVito, Zachary and Raison, Martin and Tejani, Alykhan and Chilamkurthy, Sasank and Steiner, Benoit and Fang, Lu and Bai, Junjie and Chintala, Soumith},
    title = {{PyTorch: An Imperative Style, High-Performance Deep Learning Library}},
    booktitle = {Advances in Neural Information Processing Systems},
    year = {2019},
    volume = {32}
}

% Deng et al. (2013): overview of deep learning for speech at Microsoft, ICASSP 2013.
% Removed the empty volume field (triggers empty-field warnings), typed the record as a
% conference paper, and removed "Index Terms-" PDF residue from the keywords.
@inproceedings{Deng2013RecentMicrosoft,
    author = {Deng, Li and Li, Jinyu and Huang, Jui-Ting and Yao, Kaisheng and Yu, Dong and Seide, Frank and Seltzer, Michael L and Zweig, Geoff and He, Xiaodong and Williams, Jason and Gong, Yifan and Acero, Alex},
    title = {{Recent Advances in Deep Learning for Speech Research at Microsoft}},
    booktitle = {2013 IEEE International Conference on Acoustics, Speech and Signal Processing},
    year = {2013},
    pages = {8604--8608},
    keywords = {deep learning, convolution, dialogue, multilingual, neural network, spectral features, speech recognition}
}

% Narayanan et al. (2019): long-form speech recognition with streaming end-to-end models, ASRU 2019.
% The last author was exported as "Google, Trevor Strohman" (affiliation scraped as the
% surname); restored to Strohman, Trevor. Workshop paper: venue moved to booktitle.
% "Index Terms-" PDF residue removed from the keywords.
@inproceedings{Narayanan2019RecognizingModels,
    author = {Narayanan, Arun and Prabhavalkar, Rohit and Chiu, Chung-Cheng and Rybach, David and Sainath, Tara N and Strohman, Trevor},
    title = {{Recognizing long-form speech using streaming end-to-end models}},
    booktitle = {2019 IEEE Automatic Speech Recognition and Understanding Workshop (ASRU)},
    year = {2019},
    arxivId = {1910.11455v1},
    keywords = {speech recognition, end-to-end, long-form, rnnt, sequence-to-sequence}
}

% Hagner (2017): comparison of gated recurrent units in an end-to-end ASR acoustic model.
% The journal field held only a hosting domain (cs.umu.se), and the title had lost the
% colon between main title and subtitle.
% NOTE(review): presumably a master's thesis at Umea University, judging by the domain -
% confirm the degree level and institution before relying on this entry type.
@mastersthesis{Hagner2017RecurrentModel,
    author = {Hagner, Johan},
    title = {{Recurrent Neural Networks for End-to-End Speech Recognition: A comparison of gated units in an acoustic model}},
    school = {Ume{\aa} University},
    year = {2017}
}

% Mukhedkar and Alex (2014): robust feature extraction for noisy-environment ASR, ICNSC 2014.
% Conference paper: venue moved to booktitle.
@inproceedings{Mukhedkar2014RobustEnvironments,
    author = {Mukhedkar, Ajinkya Sunil and Alex, John Sahaya Rani},
    title = {{Robust feature extraction methods for speech recognition in noisy environments}},
    booktitle = {1st International Conference on Networks and Soft Computing, ICNSC 2014 - Proceedings},
    year = {2014},
    pages = {295--299},
    publisher = {Institute of Electrical and Electronics Engineers Inc.},
    doi = {10.1109/CNSC.2014.6906692},
    keywords = {ASR, BFCC, Feature Extraction, HMM, MFCC, WMFCC}
}

% Mayer and Jacobsen (2020): survey of scalable deep learning on distributed infrastructures,
% ACM Computing Surveys 53(1).
% The publisher field was scraper debris (escaped braces, tabs and an internal publisher
% code) that leaves an unbalanced group once LaTeX reads it; it is split into a clean
% publisher and address. The keyword "Deep, learning systems" was one phrase broken by a comma.
@article{Mayer2020ScalableInfrastructures,
    author = {Mayer, Ruben and Jacobsen, Hans-Arno},
    title = {{Scalable Deep Learning on Distributed Infrastructures}},
    journal = {ACM Computing Surveys (CSUR)},
    year = {2020},
    month = {2},
    volume = {53},
    number = {1},
    publisher = {ACM},
    address = {New York, NY, USA},
    doi = {10.1145/3363554},
    url = {https://dl.acm.org/doi/abs/10.1145/3363554},
    keywords = {Deep learning systems}
}

% Cipar et al. (2013): bounded staleness against stragglers, HotOS XIV.
% The venue contained escaped literal braces around XIV, which print as braces; they are
% removed. Workshop paper: venue moved to booktitle.
@inproceedings{Cipar2013SolvingStaleness,
    author = {Cipar, James and Ho, Qirong and Kim, Jin Kyu and Lee, Seunghak and Ganger, Gregory R. and Gibson, Garth and Keeton, Kimberly and Xing, Eric},
    title = {{Solving the Straggler Problem with Bounded Staleness}},
    booktitle = {14th Workshop on Hot Topics in Operating Systems (HotOS XIV)},
    year = {2013},
    url = {https://www.usenix.org/conference/hotos13/session/cipar}
}

% Yuan and Liberman (2008): speaker identification on the SCOTUS corpus.
% "Index Terms-" PDF residue removed from the keywords.
% NOTE(review): the journal field holds only a hosting domain (ling.upenn.edu); the real
% venue is not recoverable from this record - look it up and replace.
@article{Yuan2008SpeakerCorpus,
    author = {Yuan, Jiahong and Liberman, Mark},
    title = {{Speaker identification on the SCOTUS corpus}},
    journal = {ling.upenn.edu},
    year = {2008},
    keywords = {Hidden Markov models, Speaker recognition, Speech analysis}
}

% Graves, Mohamed and Hinton (2013): deep recurrent networks for speech recognition, ICASSP 2013.
% Conference paper: venue moved to booktitle; ISSN written in the standard hyphenated form.
@inproceedings{Graves2013SpeechNetworks,
    author = {Graves, Alex and Mohamed, Abdel Rahman and Hinton, Geoffrey},
    title = {{Speech recognition with deep recurrent neural networks}},
    booktitle = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings},
    year = {2013},
    month = {10},
    pages = {6645--6649},
    doi = {10.1109/ICASSP.2013.6638947},
    issn = {1520-6149},
    isbn = {9781479903566},
    arxivId = {1303.5778},
    keywords = {deep neural networks, recurrent neural networks, speech recognition}
}

% Ravanelli et al. (2021): the SpeechBrain toolkit, arXiv preprint 2106.04624.
% The raw non-ASCII given name (Grondin) is written with a LaTeX escape, as other accented
% names in this file are.
@article{Ravanelli2021SpeechBrain:Toolkit,
    author = {Ravanelli, Mirco and Parcollet, Titouan and Plantinga, Peter and Rouhe, Aku and Cornell, Samuele and Lugosch, Loren and Subakan, Cem and Dawalatabad, Nauman and Heba, Abdelwahab and Zhong, Jianyuan and Chou, Ju-Chieh and Yeh, Sung-Lin and Fu, Szu-Wei and Liao, Chien-Feng and Rastorgueva, Elena and Grondin, Fran{\c{c}}ois and Aris, William and Na, Hwidong and Gao, Yan and De Mori, Renato and Bengio, Yoshua},
    title = {{SpeechBrain: A General-Purpose Speech Toolkit}},
    journal = {arXiv preprint arXiv:2106.04624},
    year = {2021},
    arxivId = {2106.04624v1}
}

% O'Neill et al. (2021): the SPGISpeech financial-audio corpus (arXiv 2104.02014).
% Journal string "arXiv:preprint arXiv:...v2" normalised to the file's
% "arXiv preprint arXiv:<id>" form; the isbn value "1,966,10939,3" was not an ISBN and is
% removed; surname capitalisation corrected to O'Neill.
@article{Oneill2021SPGISpeech:Recognition,
    author = {O'Neill, Patrick K and Lavrukhin, Vitaly and Majumdar, Somshubra and Noroozi, Vahid and Zhang, Yuekai and Kuchaiev, Oleksii and Balam, Jagadeesh and Dovzhenko, Yuliya and Freyberg, Keenan and Shulman, Michael D and Ginsburg, Boris and Watanabe, Shinji and Kucsko, Georg},
    title = {{SPGISpeech: 5,000 hours of transcribed financial audio for fully formatted end-to-end speech recognition}},
    journal = {arXiv preprint arXiv:2104.02014},
    year = {2021},
    url = {https://datasets.kensho.com/datasets/scribe},
    arxivId = {2104.02014v2}
}

% Chiu et al.: state-of-the-art sequence-to-sequence ASR, ICASSP proceedings (arXiv 1712.01769).
% Conference paper: venue moved to booktitle; the indexer pseudo-volume "2018-April" is
% dropped because it is not a real volume number.
% NOTE(review): year 2017 / month 12 look like the arXiv posting date, while the dropped
% pseudo-volume suggests the April 2018 proceedings - confirm which date to cite.
@inproceedings{Chiu2017State-of-the-artModels,
    author = {Chiu, Chung-Cheng and Sainath, Tara N. and Wu, Yonghui and Prabhavalkar, Rohit and Nguyen, Patrick and Chen, Zhifeng and Kannan, Anjuli and Weiss, Ron J. and Rao, Kanishka and Gonina, Ekaterina and Jaitly, Navdeep and Li, Bo and Chorowski, Jan and Bacchiani, Michiel},
    title = {{State-of-the-art Speech Recognition With Sequence-to-Sequence Models}},
    booktitle = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings},
    year = {2017},
    month = {12},
    pages = {4774--4778},
    publisher = {Institute of Electrical and Electronics Engineers Inc.},
    url = {https://arxiv.org/abs/1712.01769v6},
    arxivId = {1712.01769}
}

@article{Abadi2016TensorFlow:Systems,
    title = {{TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems}},
    year = {2016},
    journal = {arXiv e-prints},
    author = {Abadi, Mart{\'{i}}n and Agarwal, Ashish and Barham, Paul and Brevdo, Eugene and Chen, Zhifeng and Citro, Craig and Corrado, Greg S and Davis, Andy and Dean, Jeffrey and Devin, Matthieu and Ghemawat, Sanjay and Goodfellow, Ian and Harp, Andrew and Irving, Geoffrey and Isard, Michael and Jia, Yangqing and Jozefowicz, Rafal and Kaiser, Lukasz and Kudlur, Manjunath and Levenberg, Josh and Man{\'{e}}, Dan and Monga, Rajat and Moore, Sherry and Murray, Derek and Olah, Chris and Schuster, Mike and Shlens, Jonathon and Steiner, Benoit and Sutskever, Ilya and Talwar, Kunal and Tucker, Paul and Vanhoucke, Vincent and Vasudevan, Vijay and Vi{\'{e}}gas, Fernanda and Vinyals, Oriol and Warden, Pete and Wattenberg, Martin and Wicke, Martin and Yu, Yuan and Zheng, Xiaoqiang},
    url = {https://www.tensorflow.org/}
}

@article{Young2002TheBook,
    title = {{The HTK Book}},
    year = {2002},
    journal = {Cambridge University Engineering Department},
    author = {Young, Steve and Evermann, Gunnar and Kershaw, Dan and Moore, Gareth and Odell, Julian and Ollason, Dave and Povey, Dan and Valtchev, Valtcho and Woodland, Phil}
}

@article{Povey2011TheToolkit,
    title = {{The Kaldi Speech Recognition Toolkit}},
    year = {2011},
    journal = {IEEE 2011 Workshop on Automatic Speech Recognition and Understanding},
    author = {Povey, Daniel and Ghoshal, Arnab and Boulianne, Gilles and Burget, Luk{\'{a}}{\v{s}} and Glembek, Ond{\v{r}}ej and Goel, Nagendra and Hannemann, Mirko and Motl{\'{i}}{\v{c}}ek, Petr and Qian, Yanmin and Schwarz, Petr and Silovsk{\'{y}}, Jan and Stemmer, Georg and Vesel{\'{y}}, Karel},
    url = {http://kaldi.sf.net/}
}

% NOTE(review): the author list came from a garbled export and may still be incomplete -- verify against the published paper.
@article{Galvez2021TheUsage,
    title = {{The People's Speech: A Large-Scale Diverse English Speech Recognition Dataset for Commercial Usage}},
    year = {2021},
    journal = {35th Conference on Neural Information Processing Systems (NeurIPS 2021)},
    author = {Galvez, Daniel and Diamos, Greg and Ciro Torres, Juan Manuel and Achorn, Keith and Gopi, Anjali and Kanter, David and Lam, Maximilian and Mazumder, Mark and Reddi, Vijay Janapa},
    url = {https://github.com/mlcommons/peoples-speech}
}

@article{Mayer2017ThePath,
    author    = {Mayer, Ruben and Mayer, Christian and Laich, Larissa},
    title     = {{The tensor flow partitioning and scheduling problem: It's the critical path!}},
    journal   = {DIDL 2017 - Proceedings of the 1st Workshop on Distributed Infrastructures for Deep Learning, Part of Middleware 2017},
    publisher = {Association for Computing Machinery, Inc},
    year      = {2017},
    month     = {12},
    pages     = {1--6},
    doi       = {10.1145/3154842.3154843},
    url       = {https://doi.org/10.1145/3154842.3154843},
    keywords  = {Critical path, Partitioning, Scheduling, Tensor flow}
}

@article{Hochreiter1998TheSolutions,
    title = {{The vanishing gradient problem during learning recurrent neural nets and problem solutions}},
    year = {1998},
    journal = {International Journal of Uncertainty, Fuzziness and Knowledge-Based Systems},
    author = {Hochreiter, Sepp},
    number = {2},
    pages = {107--116},
    volume = {6},
    publisher = {World Scientific Publishing Co. Pte Ltd},
    doi = {10.1142/S0218488598000094},
    keywords = {Long Short-Term Memory, Long-term dependencies, Recurrent neural nets, Vanishing gradient}
}

@article{Narayanan2019TowardTraining,
    author    = {Narayanan, Arun and Misra, Ananya and Sim, Khe Chai and Pundak, Golan and Tripathi, Anshuman and Elfeky, Mohamed and Haghani, Parisa and Strohman, Trevor and Bacchiani, Michiel},
    title     = {{Toward Domain-Invariant Speech Recognition via Large Scale Training}},
    journal   = {2018 IEEE Spoken Language Technology Workshop, SLT 2018 - Proceedings},
    publisher = {Institute of Electrical and Electronics Engineers Inc.},
    year      = {2019},
    pages     = {441--447},
    doi       = {10.1109/SLT.2018.8639610},
    keywords  = {codecs, domain robustness, multidomain model, noise robustness, speech recognition}
}

@article{Zhang2017VeryRecognition,
    author    = {Zhang, Yu and Chan, William and Jaitly, Navdeep},
    title     = {{Very deep convolutional networks for end-to-end speech recognition}},
    journal   = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings},
    publisher = {Institute of Electrical and Electronics Engineers Inc.},
    year      = {2017},
    month     = {6},
    pages     = {4845--4849},
    doi       = {10.1109/ICASSP.2017.7953077},
    keywords  = {Automatic Speech Recognition, End-to-End Speech Recognition, Very Deep Convolutional Neural Networks}
}

@article{Vetterli1992WaveletsDesign,
    title = {{Wavelets and filter banks: Theory and design}},
    year = {1992},
    journal = {IEEE Transactions on Signal Processing},
    author = {Vetterli, Martin and Herley, Cormac},
    number = {9},
    pages = {2207--2232},
    volume = {40},
    url = {https://infoscience.epfl.ch/record/33904},
    doi = {10.1109/78.157221}
}