% citations.bib

% Textbook. The publisher is recorded once, in `publisher`; books take no `organization` field.
@book{shalevshwartz_2014_understanding,
  author    = {Shalev-Shwartz, Shai and Ben-David, Shai},
  title     = {Understanding Machine Learning: From Theory to Algorithms},
  publisher = {Cambridge University Press},
  year      = {2014},
  url       = {https://www.cambridge.org/core/books/understanding-machine-learning/3059695661405D25673058E43C8BE2A6},
  urldate   = {2022-01-04}
}
% Textbook. The middle initial carries its period so full-name styles print it correctly.
@book{cohn_2013_measure,
  author    = {Cohn, Donald L.},
  title     = {Measure Theory},
  publisher = {Birkhäuser},
  year      = {2013}
}
% Textbook. Title stored in title case; styles may down-case it but cannot restore capitals.
@book{rynne_2008_linear,
  author    = {Rynne, Bryan P. and Youngson, Martin A.},
  title     = {Linear Functional Analysis},
  publisher = {Springer},
  year      = {2008}
}
% Textbook. The ampersand in the publisher name is escaped for LaTeX.
@book{folland_1999_real,
  author    = {Folland, Gerald B.},
  title     = {Real Analysis: Modern Techniques and Their Applications},
  publisher = {John Wiley \& Sons},
  year      = {1999}
}
% Textbook. No space before the subtitle colon; initials carry periods.
@book{ross_2015_elementary,
  author    = {Ross, Kenneth A. and López, Jorge M.},
  title     = {Elementary Analysis: The Theory of Calculus},
  publisher = {Springer},
  year      = {2015}
}
% Textbook. The edition statement lives in `note`, not in the title proper.
@book{wade_2014_introduction,
  author    = {Wade, William R.},
  title     = {Introduction to Analysis},
  publisher = {Pearson},
  year      = {2014},
  note      = {Pearson New International Edition}
}
% Self-published lecture-note book. The key says 2011 but the recorded year is 2014 (TODO confirm which is intended).
@book{bass2011real,
  author    = {Bass, Richard F.},
  title     = {Real Analysis for Graduate Students: Measure and Integration Theory (Version 4.2)},
  publisher = {Bass, R.F.},
  year      = {2014},
  isbn      = {9781502514455},
  url       = {https://bass.math.uconn.edu/real.html}
}
% Textbook. Author given in the unambiguous "Last, First" form used by the other entries.
@book{munkres_2014_topology,
  author    = {Munkres, James Raymond},
  title     = {Topology},
  publisher = {Pearson},
  year      = {2014}
}
% Textbook. Initial carries its period; title stored in title case.
@book{jacod_2004_probability,
  author    = {Jacod, Jean and Protter, Philip E.},
  title     = {Probability Essentials},
  publisher = {Springer},
  year      = {2004}
}
% Textbook. "Fourier" is brace-protected so sentence-casing styles keep the proper noun capitalised.
@book{loukasgrafakos_2014_classical,
  author    = {Grafakos, Loukas},
  title     = {Classical {Fourier} Analysis},
  publisher = {Springer},
  year      = {2014}
}
% Book chapter. DOI is stored bare (no resolver prefix) and the page range uses an en dash.
% The former `eprint` field held a PDF link rather than an eprint identifier and is dropped; `url` and `doi` locate the chapter.
@inbook{bartle_measure_integration,
  author    = {Bartle, Robert G.},
  title     = {Decomposition of Measures},
  booktitle = {The Elements of Integration and Lebesgue Measure},
  chapter   = {8},
  pages     = {80--95},
  publisher = {John Wiley \& Sons, Ltd},
  year      = {1995},
  isbn      = {9781118164471},
  doi       = {10.1002/9781118164471.ch8},
  url       = {https://onlinelibrary.wiley.com/doi/abs/10.1002/9781118164471.ch8},
  keywords  = {decomposing measures, Hahn decomposition theorem, negative variation, Jordan decomposition theorem, monotone convergence theorem},
  abstract  = {Summary This chapter contains sections titled: Riesz Representation Theorem Exercises}
}
% Journal article. Month uses the standard macro; the URL is given without percent-encoding,
% since a raw percent sign is a comment character in TeX.
@article{cybenko_1989_approximation,
  author  = {Cybenko, George},
  title   = {Approximation by superpositions of a sigmoidal function},
  journal = {Mathematics of Control, Signals, and Systems},
  volume  = {2},
  number  = {4},
  pages   = {303--314},
  year    = {1989},
  month   = dec,
  doi     = {10.1007/bf02551274},
  url     = {https://link.springer.com/article/10.1007/BF02551274}
}
% Journal article. DOI is stored bare (no resolver prefix); page range uses an en dash.
@article{hornik,
  author   = {Hornik, Kurt and Stinchcombe, Maxwell and White, Halbert},
  title    = {Multilayer feedforward networks are universal approximators},
  journal  = {Neural Networks},
  volume   = {2},
  number   = {5},
  pages    = {359--366},
  year     = {1989},
  issn     = {0893-6080},
  doi      = {10.1016/0893-6080(89)90020-8},
  url      = {https://www.sciencedirect.com/science/article/pii/0893608089900208},
  keywords = {Feedforward networks, Universal approximation, Mapping networks, Network representation capability, Stone-Weierstrass Theorem, Squashing functions, Sigma-Pi networks, Back-propagation networks},
  abstract = {This paper rigorously establishes that standard multilayer feedforward networks with as few as one hidden layer using arbitrary squashing functions are capable of approximating any Borel measurable function from one finite dimensional space to another to any desired degree of accuracy, provided sufficiently many hidden units are available. In this sense, multilayer feedforward networks are a class of universal approximators.}
}
% Conference paper. The year matches the 2017 proceedings path in the URL; 2017 is volume 30 of the series.
@inproceedings{lu,
  author    = {Lu, Zhou and Pu, Hongming and Wang, Feicheng and Hu, Zhiqiang and Wang, Liwei},
  title     = {The Expressive Power of Neural Networks: A View from the Width},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {30},
  year      = {2017},
  url       = {https://proceedings.neurips.cc/paper/2017/file/32cbf687880eb1674a07bf717761dd3a-Paper.pdf},
  urldate   = {2022-02-22}
}
% Journal article. Page range uses an en dash; issue number added.
@article{hornik_1991_approximation,
  author  = {Hornik, Kurt},
  title   = {Approximation capabilities of multilayer feedforward networks},
  journal = {Neural Networks},
  volume  = {4},
  number  = {2},
  pages   = {251--257},
  year    = {1991},
  doi     = {10.1016/0893-6080(91)90009-t},
  urldate = {2020-02-28}
}
% arXiv preprint. The identifier lives in the eprint fields rather than a fake journal name.
@misc{park_2020_minimum,
  author     = {Park, Sejun and Yun, Chulhee and Lee, Jaeho and Shin, Jinwoo},
  title      = {Minimum Width for Universal Approximation},
  year       = {2020},
  month      = jun,
  eprint     = {2006.08859},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/abs/2006.08859}
}
% Published in Proceedings of Machine Learning Research, volume 125 (COLT 2020).
% Volume and pages replace the former "TBD" placeholder and the preprint's 1-22 page numbering.
@article{kidger_2020_universal,
  author  = {Kidger, Patrick and Lyons, Terry},
  title   = {Universal Approximation with Deep Narrow Networks},
  journal = {Proceedings of Machine Learning Research},
  volume  = {125},
  pages   = {2306--2327},
  year    = {2020},
  url     = {https://arxiv.org/pdf/1905.08539.pdf},
  urldate = {2022-02-22}
}
% Book. Initial carries its period; title stored in title case.
@book{bishop_1998_neural,
  author    = {Bishop, Christopher M.},
  title     = {Neural Networks and Machine Learning},
  publisher = {Springer},
  year      = {1998}
}
% Journal article. Month uses the standard macro; page range uses an en dash.
@article{leshno_1993_multilayer,
  author  = {Leshno, Moshe and Lin, Vladimir Ya. and Pinkus, Allan and Schocken, Shimon},
  title   = {Multilayer feedforward networks with a nonpolynomial activation function can approximate any function},
  journal = {Neural Networks},
  volume  = {6},
  pages   = {861--867},
  year    = {1993},
  month   = jan,
  doi     = {10.1016/s0893-6080(05)80131-5},
  urldate = {2020-02-28}
}
% Journal article. Month uses the standard macro; page range uses an en dash.
@article{senior_2020_improved,
  author  = {Senior, Andrew W. and Evans, Richard and Jumper, John and Kirkpatrick, James and Sifre, Laurent and Green, Tim and Qin, Chongli and Žídek, Augustin and Nelson, Alexander W. R. and Bridgland, Alex and Penedones, Hugo and Petersen, Stig and Simonyan, Karen and Crossan, Steve and Kohli, Pushmeet and Jones, David T. and Silver, David and Kavukcuoglu, Koray and Hassabis, Demis},
  title   = {Improved protein structure prediction using potentials from deep learning},
  journal = {Nature},
  volume  = {577},
  pages   = {706--710},
  year    = {2020},
  month   = jan,
  doi     = {10.1038/s41586-019-1923-7},
  url     = {https://www.nature.com/articles/s41586-019-1923-7}
}

% arXiv preprint. The repository is identified through the eprint fields, not `organization`.
@misc{vaswani_2017_attention,
  author     = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, Lukasz and Polosukhin, Illia},
  title      = {Attention Is All You Need},
  year       = {2017},
  eprint     = {1706.03762},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/abs/1706.03762}
}

% Journal article. The URL is the canonical article address, without the session query string.
@article{rumelhart_1986_learning,
  author  = {Rumelhart, David E. and Hinton, Geoffrey E. and Williams, Ronald J.},
  title   = {Learning representations by back-propagating errors},
  journal = {Nature},
  volume  = {323},
  pages   = {533--536},
  year    = {1986},
  month   = oct,
  doi     = {10.1038/323533a0},
  url     = {https://www.nature.com/articles/323533a0}
}

% arXiv preprint. The repository is identified through the eprint fields, not `organization`.
@misc{szegedy_2014_going,
  author     = {Szegedy, Christian and Liu, Wei and Jia, Yangqing and Sermanet, Pierre and Reed, Scott and Anguelov, Dragomir and Erhan, Dumitru and Vanhoucke, Vincent and Rabinovich, Andrew},
  title      = {Going Deeper with Convolutions},
  year       = {2014},
  eprint     = {1409.4842},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/abs/1409.4842}
}

% Preprint, cited by the URL of its arXiv PDF (version 1) together with the access date.
@misc{goodfellow_2014_generative,
  author  = {Goodfellow, Ian and Pouget-Abadie, Jean and Mirza, Mehdi and Xu, Bing and Warde-Farley, David and Ozair, Sherjil and Courville, Aaron and Bengio, Yoshua},
  title   = {Generative Adversarial Nets},
  year    = {2014},
  url     = {https://arxiv.org/pdf/1406.2661v1.pdf},
  urldate = {2019-05-31}
}

% arXiv preprint. The repository is identified through the eprint fields, not `organization`.
@misc{silver_2017_mastering,
  author     = {Silver, David and Hubert, Thomas and Schrittwieser, Julian and Antonoglou, Ioannis and Lai, Matthew and Guez, Arthur and Lanctot, Marc and Sifre, Laurent and Kumaran, Dharshan and Graepel, Thore and Lillicrap, Timothy and Simonyan, Karen and Hassabis, Demis},
  title      = {Mastering Chess and Shogi by Self-Play with a General Reinforcement Learning Algorithm},
  year       = {2017},
  eprint     = {1712.01815},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/abs/1712.01815}
}

% arXiv preprint. The last author is Dario Amodei; the former "Openai, Dario" was an affiliation scraped in as a surname.
@misc{brown_2020_language,
  author     = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Christopher and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario},
  title      = {Language Models are Few-Shot Learners},
  year       = {2020},
  eprint     = {2005.14165},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/pdf/2005.14165.pdf}
}
% arXiv preprint. "YOLOv4" is brace-protected so sentence-casing styles keep its capitalisation.
@misc{bochkovskiy_2020_yolov4,
  author     = {Bochkovskiy, Alexey and Wang, Chien-Yao and Liao, Hong-Yuan Mark},
  title      = {{YOLOv4}: Optimal Speed and Accuracy of Object Detection},
  year       = {2020},
  month      = apr,
  eprint     = {2004.10934},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/pdf/2004.10934.pdf}
}
% arXiv preprint. Month uses the standard macro; the arXiv identifier is given in the eprint fields.
@misc{he_2015_deep,
  author     = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title      = {Deep Residual Learning for Image Recognition},
  year       = {2015},
  month      = dec,
  eprint     = {1512.03385},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/pdf/1512.03385.pdf}
}

% arXiv preprint. The author list ends with Depei Qian; the former trailing "Beihang" item was an affiliation, not a person.
@misc{li_2020_the,
  author     = {Li, Mingzhen and Liu, Yi and Liu, Xiaoyan and Sun, Qingxiao and You, Xin and Yang, Hailong and Luan, Zhongzhi and Gan, Lin and Yang, Guangwen and Qian, Depei},
  title      = {The Deep Learning Compiler: A Comprehensive Survey},
  year       = {2020},
  month      = aug,
  eprint     = {2002.03794},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/pdf/2002.03794.pdf},
  urldate    = {2022-02-25}
}

% Conference paper (AISTATS 2010), published as volume 9 of Proceedings of Machine Learning Research; the URL path carries the volume.
@inproceedings{glorot_understanding,
  author    = {Glorot, Xavier and Bengio, Yoshua},
  title     = {Understanding the difficulty of training deep feedforward neural networks},
  booktitle = {Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics},
  series    = {Proceedings of Machine Learning Research},
  volume    = {9},
  pages     = {249--256},
  year      = {2010},
  url       = {https://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf}
}

% Preprint, cited by the URL of its arXiv PDF.
@misc{he_2015_delving,
  author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title  = {Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification},
  year   = {2015},
  url    = {https://arxiv.org/pdf/1502.01852.pdf}
}

% arXiv preprint. Month uses the standard macro; the arXiv identifier is given in the eprint fields.
@misc{ruder_2017_an,
  author     = {Ruder, Sebastian},
  title      = {An overview of gradient descent optimization algorithms},
  year       = {2017},
  month      = jun,
  eprint     = {1609.04747},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/pdf/1609.04747.pdf}
}
% arXiv preprint. The identifier lives in the eprint fields instead of being stuffed into `journal`.
@misc{li_2018_visualizing,
  author     = {Li, Hao and Xu, Zheng and Taylor, Gavin and Studer, Christoph and Goldstein, Tom},
  title      = {Visualizing the Loss Landscape of Neural Nets},
  year       = {2018},
  month      = nov,
  eprint     = {1712.09913},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/abs/1712.09913},
  urldate    = {2022-02-26}
}
% arXiv preprint. Given names are stored in full; the style decides whether to abbreviate them.
@misc{klambauer_2017_selfnormalizing,
  author     = {Klambauer, Günter and Unterthiner, Thomas and Mayr, Andreas and Hochreiter, Sepp},
  title      = {Self-Normalizing Neural Networks},
  year       = {2017},
  eprint     = {1706.02515},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/pdf/1706.02515.pdf},
  urldate    = {2020-08-27}
}

% arXiv preprint. The repository is identified through the eprint fields, not `organization`.
@misc{clevert_2015_fast,
  author     = {Clevert, Djork-Arné and Unterthiner, Thomas and Hochreiter, Sepp},
  title      = {Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs)},
  year       = {2015},
  eprint     = {1511.07289},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/abs/1511.07289}
}

% arXiv preprint. The identifier lives in the eprint fields instead of being stuffed into `journal`.
@misc{trottier_2018_parametric,
  author     = {Trottier, Ludovic and Giguère, Philippe and Chaib-draa, Brahim},
  title      = {Parametric Exponential Linear Unit for Deep Convolutional Neural Networks},
  year       = {2018},
  month      = jan,
  eprint     = {1605.09332},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/abs/1605.09332},
  urldate    = {2022-02-27}
}
% Conference paper. The title is restored; the old value repeated its opening words and embedded "211" from a scraped page header.
% The year matches the 1989 proceedings path in the URL; 1989 is volume 2 of the series.
@inproceedings{bridle_training,
  author    = {Bridle, John S.},
  title     = {Training Stochastic Model Recognition Algorithms as Networks can Lead to Maximum Mutual Information Estimation of Parameters},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {2},
  year      = {1989},
  url       = {https://proceedings.neurips.cc/paper/1989/file/0336dcbab05b9d5ad24f4333c7658a0e-Paper.pdf}
}
 % Textbook. The URL carries an explicit scheme so it resolves as a link rather than a relative path.
 @book{pmlbook,
  author    = {Murphy, Kevin P.},
  title     = {Probabilistic Machine Learning: An Introduction},
  publisher = {MIT Press},
  year      = {2022},
  url       = {https://probml.ai}
}
% Software white paper. Names are in "First Last" form, joined with ties to keep each name on one line.
@misc{tensorflow2015-whitepaper,
  author = {
    Mart\'{i}n~Abadi and
    Ashish~Agarwal and
    Paul~Barham and
    Eugene~Brevdo and
    Zhifeng~Chen and
    Craig~Citro and
    Greg~S.~Corrado and
    Andy~Davis and
    Jeffrey~Dean and
    Matthieu~Devin and
    Sanjay~Ghemawat and
    Ian~Goodfellow and
    Andrew~Harp and
    Geoffrey~Irving and
    Michael~Isard and
    Yangqing Jia and
    Rafal~Jozefowicz and
    Lukasz~Kaiser and
    Manjunath~Kudlur and
    Josh~Levenberg and
    Dandelion~Man\'{e} and
    Rajat~Monga and
    Sherry~Moore and
    Derek~Murray and
    Chris~Olah and
    Mike~Schuster and
    Jonathon~Shlens and
    Benoit~Steiner and
    Ilya~Sutskever and
    Kunal~Talwar and
    Paul~Tucker and
    Vincent~Vanhoucke and
    Vijay~Vasudevan and
    Fernanda~Vi\'{e}gas and
    Oriol~Vinyals and
    Pete~Warden and
    Martin~Wattenberg and
    Martin~Wicke and
    Yuan~Yu and
    Xiaoqiang~Zheng},
  title  = { {TensorFlow}: Large-Scale Machine Learning on Heterogeneous Systems},
  year   = {2015},
  note   = {Software available from tensorflow.org},
  url    = {https://www.tensorflow.org/},
}
% Software citation. Braces inside names and the title protect capital letters from style recasing.
@software{jax2018github,
  author  = {James Bradbury and Roy Frostig and Peter Hawkins and Matthew James Johnson and Chris Leary and Dougal Maclaurin and George Necula and Adam Paszke and Jake Vander{P}las and Skye Wanderman-{M}ilne and Qiao Zhang},
  title   = {{JAX}: composable transformations of {P}ython+{N}um{P}y programs},
  version = {0.2.5},
  year    = {2018},
  url     = {http://github.com/google/jax},
}
% Software citation. The brace in "{P}ython" protects the capital from style recasing.
@software{autogradgithub,
  author  = {Dougal Maclaurin and David Duvenaud and Matthew Johnson and Ryan P. Adams},
  title   = {Autograd: Reverse-mode differentiation of native {P}ython},
  version = {1.1.2},
  year    = {2015},
  url     = {http://github.com/HIPS/autograd},
}
% arXiv preprint. The repository is identified through the eprint fields, not `organization`.
@misc{paszke_2019_pytorch,
  author     = {Paszke, Adam and Gross, Sam and Massa, Francisco and Lerer, Adam and Bradbury, James and Chanan, Gregory and Killeen, Trevor and Lin, Zeming and Gimelshein, Natalia and Antiga, Luca and Desmaison, Alban and Köpf, Andreas and Yang, Edward and DeVito, Zach and Raison, Martin and Tejani, Alykhan and Chilamkurthy, Sasank and Steiner, Benoit and Fang, Lu and Bai, Junjie and Chintala, Soumith},
  title      = {PyTorch: An Imperative Style, High-Performance Deep Learning Library},
  year       = {2019},
  eprint     = {1912.01703},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/abs/1912.01703}
}
% arXiv preprint. `eprint` holds the bare new-style identifier; the subject class goes in `eprintclass` only.
% `year` is kept next to `date` for tools that do not read `date`.
@online{fashionmnist,
  author      = {Xiao, Han and Rasul, Kashif and Vollgraf, Roland},
  title       = {Fashion-MNIST: a Novel Image Dataset for Benchmarking Machine Learning Algorithms},
  date        = {2017-08-28},
  year        = {2017},
  eprint      = {1708.07747},
  eprinttype  = {arXiv},
  eprintclass = {cs.LG},
}


% Journal article. The acronym is brace-protected so sentence-casing styles keep it upper-case.
@article{mnist,
  author    = {Deng, Li},
  title     = {The {MNIST} Database of Handwritten Digit Images for Machine Learning Research},
  journal   = {IEEE Signal Processing Magazine},
  volume    = {29},
  number    = {6},
  pages     = {141--142},
  year      = {2012},
  publisher = {IEEE}
}
% Tutorial preprint, cited by the URL of its arXiv PDF.
@misc{frazier_2018_a,
  author = {Frazier, Peter},
  title  = {A Tutorial on Bayesian Optimization},
  year   = {2018},
  url    = {https://arxiv.org/pdf/1807.02811.pdf}
}
% Article in the online journal Distill. Typed as an article so `journal` is printed; the address sits in `url`, not `note`.
@article{agnihotri2020exploring,
  author  = {Agnihotri, Apoorv and Batra, Nipun},
  title   = {Exploring {Bayesian} Optimization},
  journal = {Distill},
  year    = {2020},
  doi     = {10.23915/distill.00026},
  url     = {https://distill.pub/2020/bayesian-optimization}
}
% arXiv preprint with five authors. "Keskar" and "Tang" are the surnames; the middle names were previously mis-split into extra authors.
% Title stored in title case rather than the source's all-capitals heading.
@misc{shirishkeskar_2017_on,
  author     = {Keskar, Nitish Shirish and Mudigere, Dheevatsa and Nocedal, Jorge and Smelyanskiy, Mikhail and Tang, Ping Tak Peter},
  title      = {On Large-Batch Training for Deep Learning: Generalization Gap and Sharp Minima},
  year       = {2017},
  month      = feb,
  eprint     = {1609.04836},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/pdf/1609.04836.pdf}
}
% arXiv preprint. Month uses the standard macro; the arXiv identifier is given in the eprint fields.
@misc{hoffer_2018_train,
  author     = {Hoffer, Elad and Hubara, Itay and Soudry, Daniel},
  title      = {Train longer, generalize better: closing the generalization gap in large batch training of neural networks},
  year       = {2018},
  month      = jan,
  eprint     = {1705.08741},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/pdf/1705.08741.pdf}
}
% Book in a publisher series. The series name belongs in `series`, not `organization`.
@book{calin_2020_deep,
  author    = {Calin, Ovidiu},
  title     = {Deep Learning Architectures},
  series    = {Springer Series in the Data Sciences},
  publisher = {Springer International Publishing},
  year      = {2020},
  doi       = {10.1007/978-3-030-36721-3},
  urldate   = {2022-03-09}
}
% arXiv preprint. The identifier lives in the eprint fields instead of being stuffed into `journal`.
@misc{higham_2018_deep,
  author     = {Higham, Catherine F. and Higham, Desmond J.},
  title      = {Deep Learning: An Introduction for Applied Mathematicians},
  year       = {2018},
  month      = jan,
  eprint     = {1801.05894},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/abs/1801.05894},
  urldate    = {2022-03-09}
}
% Web page, cited by URL and access date; no publication year is recorded.
@misc{neutelings_neural,
  author  = {Neutelings, Izaak},
  title   = {Neural networks},
  url     = {https://tikz.net/neural_networks/},
  urldate = {2022-03-10}
}

% Blog post. The title omits the site-name suffix that the scraped page title carried.
@misc{stutz_2020_illustrating,
  author       = {Stutz, David},
  title        = {Illustrating (Convolutional) Neural Networks in {LaTeX} with {TikZ}},
  organization = {David Stutz},
  year         = {2020},
  month        = jun,
  url          = {https://davidstutz.de/illustrating-convolutional-neural-networks-in-latex-with-tikz/},
  urldate      = {2022-03-10}
}

% Code repository. The corporate author is double-braced so it is treated as one indivisible name.
@misc{fashionmnistgithub,
  author       = {{Zalando Research}},
  title        = {Fashion MNIST Github},
  organization = {GitHub},
  year         = {2020},
  month        = nov,
  url          = {https://github.com/zalandoresearch/fashion-mnist}
}