% references.bib
@book{mingo2017free,
title={Free probability and random matrices},
author={Mingo, James A and Speicher, Roland},
volume={35},
publisher={Springer},
year={2017}
}
@article{adlam2019random,
title={A random matrix perspective on mixtures of nonlinearities for deep learning},
author={Adlam, Ben and Levinson, Jake and Pennington, Jeffrey},
journal={arXiv preprint arXiv:1912.00827},
year={2019}
}
@article{jacot2020kernel,
title={Kernel alignment risk estimator: risk prediction from training data},
author={Jacot, Arthur and {\c{S}}im{\c{s}}ek, Berfin and Spadaro, Francesco and Hongler, Cl{\'e}ment and Gabriel, Franck},
journal={arXiv preprint arXiv:2006.09796},
year={2020}
}
@article{bartlett2020benign,
title={Benign overfitting in linear regression},
author={Bartlett, Peter L and Long, Philip M and Lugosi, G{\'a}bor and Tsigler, Alexander},
journal={Proceedings of the National Academy of Sciences},
volume={117},
number={48},
pages={30063--30070},
year={2020},
publisher={National Acad Sciences}
}
@article{mitra2019understanding,
title={Understanding overfitting peaks in generalization error: Analytical risk curves for $l_2$ and $l_1$ penalized interpolation},
author={Mitra, Partha P},
journal={arXiv preprint arXiv:1906.03667},
year={2019}
}
@article{belkin2020two,
title={Two models of double descent for weak features},
author={Belkin, Mikhail and Hsu, Daniel and Xu, Ji},
journal={SIAM Journal on Mathematics of Data Science},
volume={2},
number={4},
pages={1167--1180},
year={2020},
publisher={SIAM}
}
@inproceedings{rakhlin2019consistency,
title={Consistency of interpolation with {L}aplace kernels is a high-dimensional phenomenon},
author={Rakhlin, Alexander and Zhai, Xiyu},
booktitle={Conference on Learning Theory},
pages={2595--2623},
year={2019},
organization={PMLR}
}
@article{liang2020just,
title={Just interpolate: Kernel ``ridgeless'' regression can generalize},
author={Liang, Tengyuan and Rakhlin, Alexander},
journal={The Annals of Statistics},
volume={48},
number={3},
pages={1329--1347},
year={2020},
publisher={Institute of Mathematical Statistics}
}
@inproceedings{belkin2018understand,
title={To understand deep learning we need to understand kernel learning},
author={Belkin, Mikhail and Ma, Siyuan and Mandal, Soumik},
booktitle={International Conference on Machine Learning},
pages={541--549},
year={2018},
organization={PMLR}
}
@article{nakkiran2019deep,
title={Deep double descent: Where bigger models and more data hurt},
author={Nakkiran, Preetum and Kaplun, Gal and Bansal, Yamini and Yang, Tristan and Barak, Boaz and Sutskever, Ilya},
journal={arXiv preprint arXiv:1912.02292},
year={2019}
}
@article{advani2020high,
title={High-dimensional dynamics of generalization error in neural networks},
author={Advani, Madhu S and Saxe, Andrew M and Sompolinsky, Haim},
journal={Neural Networks},
volume={132},
pages={428--446},
year={2020},
publisher={Elsevier}
}
@article{belkin2019reconciling,
title={Reconciling modern machine-learning practice and the classical bias--variance trade-off},
author={Belkin, Mikhail and Hsu, Daniel and Ma, Siyuan and Mandal, Soumik},
journal={Proceedings of the National Academy of Sciences},
volume={116},
number={32},
pages={15849--15854},
year={2019},
publisher={National Acad Sciences}
}
@article{zhang2021understanding,
title={Understanding deep learning (still) requires rethinking generalization},
author={Zhang, Chiyuan and Bengio, Samy and Hardt, Moritz and Recht, Benjamin and Vinyals, Oriol},
journal={Communications of the ACM},
volume={64},
number={3},
pages={107--115},
year={2021},
publisher={ACM New York, NY, USA}
}
@article{canziani2016analysis,
title={An analysis of deep neural network models for practical applications},
author={Canziani, Alfredo and Paszke, Adam and Culurciello, Eugenio},
journal={arXiv preprint arXiv:1605.07678},
year={2016}
}
@article {bun2016cleaning,
AUTHOR = {Bun, Jo\"{e}l and Bouchaud, Jean-Philippe and Potters, Marc},
TITLE = {Cleaning large correlation matrices: tools from random matrix
theory},
JOURNAL = {Phys. Rep.},
FJOURNAL = {Physics Reports. A Review Section of Physics Letters},
VOLUME = {666},
YEAR = {2017},
PAGES = {1--109},
ISSN = {0370-1573},
MRCLASS = {60B20 (15B52 65F99 91G10)},
MRNUMBER = {3590056},
DOI = {10.1016/j.physrep.2016.10.005},
URL = {https://doi.org/10.1016/j.physrep.2016.10.005},
}
@InProceedings{pennington2017geometry,
title = {Geometry of Neural Network Loss Surfaces via Random Matrix Theory},
author = {Jeffrey Pennington and Yasaman Bahri},
booktitle = {Proceedings of the 34th International Conference on Machine Learning},
pages = {2798--2806},
year = {2017},
volume = {70},
series = {Proceedings of Machine Learning Research},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v70/pennington17a/pennington17a.pdf},
url = {http://proceedings.mlr.press/v70/pennington17a.html},
}
@article {baik2006eigenvalues,
AUTHOR = {Baik, Jinho and Silverstein, Jack W.},
TITLE = {Eigenvalues of large sample covariance matrices of spiked
population models},
JOURNAL = {J. Multivariate Anal.},
FJOURNAL = {Journal of Multivariate Analysis},
VOLUME = {97},
YEAR = {2006},
NUMBER = {6},
PAGES = {1382--1408},
DOI = {10.1016/j.jmva.2005.08.003},
URL = {https://doi.org/10.1016/j.jmva.2005.08.003},
}
@book{bai2010spectral,
title={Spectral analysis of large dimensional random matrices},
url={https://link.springer.com/book/10.1007/978-1-4419-0661-8},
author={Bai, Z. and Silverstein, J.},
volume={20},
year={2010},
publisher={Springer}
}
@phdthesis{liao2019random,
author = {Zhenyu Liao},
title = {A random matrix framework for large dimensional
machine learning and neural networks},
school = {Université Paris-Saclay},
year = 2019
}
@article{domingo2020average,
title={Average-case acceleration for bilinear games and normal matrices},
author={Domingo-Enrich, Carles and Pedregosa, Fabian and Scieur, Damien},
journal={arXiv preprint arXiv:2010.02076},
year={2020}
}
@inproceedings{montgomery1973pair,
title={The pair correlation of zeros of the zeta function},
author={Montgomery, Hugh L},
booktitle={Proc. Symp. Pure Math},
year={1973},
url={http://www-personal.umich.edu/~hlm/paircor1.pdf}
}
@book{anderson2010introduction,
title={An introduction to random matrices},
author={Anderson, Greg W and Guionnet, Alice and Zeitouni, Ofer},
year={2010},
publisher={Cambridge University Press}
}
@article{bouchaud2009financial,
title={Financial applications of random matrix theory: a short review},
author={Bouchaud, Jean-Philippe and Potters, Marc},
journal={arXiv preprint arXiv:0910.1205},
year={2009},
url={https://arxiv.org/pdf/0910.1205}
}
@article{erdos1960evolution,
title={On the evolution of random graphs},
author={Erd{\H{o}}s, Paul and R{\'e}nyi, Alfr{\'e}d},
journal={Publ. Math. Inst. Hung. Acad. Sci},
year={1960},
url={http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.348.530&rep=rep1&type=pdf}
}
@book{tulino2004random,
title={Random matrix theory and wireless communications},
author={Tulino, Antonia M and Verd{\'u}, Sergio},
year={2004},
publisher={Now Publishers Inc},
url={http://dx.doi.org/10.1561/0100000001}
}
@article{wishart1928generalised,
title={The generalised product moment distribution in samples from a normal multivariate population},
author={Wishart, John},
journal={Biometrika},
year={1928},
publisher={JSTOR},
url={https://doi.org/10.2307/2331939}
}
@inproceedings{choromanska2015loss,
title={The loss surfaces of multilayer networks},
author={Choromanska, Anna and Henaff, Mikael and Mathieu, Michael and Arous, G{\'e}rard Ben and LeCun, Yann},
booktitle={Artificial Intelligence and Statistics},
year={2015},
organization={PMLR},
url={https://arxiv.org/pdf/1412.0233.pdf}
}
@inproceedings{yao2020pyhessian,
title={{PyHessian}: Neural networks through the lens of the {H}essian},
author={Yao, Zhewei and Gholami, Amir and Keutzer, Kurt and Mahoney, Michael W},
booktitle={2020 IEEE International Conference on Big Data (Big Data)},
year={2020},
organization={IEEE},
url={https://github.com/amirgholami/PyHessian}
}
@book{mehta2004random,
title={Random matrices},
author={Mehta, Madan Lal},
year={2004},
publisher={Elsevier},
url={https://www.elsevier.com/books/random-matrices/lal-mehta/978-0-12-088409-4}
}
@inproceedings{pedregosa2020acceleration,
title={Acceleration through spectral density estimation},
author={Pedregosa, Fabian and Scieur, Damien},
booktitle={International Conference on Machine Learning},
year={2020},
url={https://arxiv.org/pdf/2002.04756.pdf},
organization={PMLR}
}
@inproceedings{lacotte2020optimal,
title={Optimal randomized first-order methods for least-squares problems},
author={Lacotte, Jonathan and Pilanci, Mert},
booktitle={International Conference on Machine Learning},
year={2020},
organization={PMLR},
url={https://arxiv.org/pdf/2002.09488.pdf}
}
@inproceedings{adlam2020neural,
title={The neural tangent kernel in high dimensions: Triple descent and a multi-scale theory of generalization},
author={Adlam, Ben and Pennington, Jeffrey},
booktitle={International Conference on Machine Learning},
year={2020},
url={https://arxiv.org/pdf/2008.06786.pdf},
organization={PMLR}
}
@article{marvcenko1967distribution,
title={Distribution of eigenvalues for some sets of random matrices},
author={Mar{\v{c}}enko, Vladimir A and Pastur, Leonid Andreevich},
journal={Mathematics of the USSR-Sbornik},
year={1967},
url={https://iopscience.iop.org/article/10.1070/SM1967v001n04ABEH001994/meta},
publisher={IOP Publishing}
}
@article{liao2020random,
title={A random matrix analysis of random Fourier features: beyond the Gaussian kernel, a precise phase transition, and the corresponding double descent},
author={Liao, Zhenyu and Couillet, Romain and Mahoney, Michael W},
journal={arXiv preprint arXiv:2006.05013},
url={https://arxiv.org/pdf/2006.05013.pdf},
year={2020}
}
@article{halko2011finding,
title={Finding structure with randomness: Probabilistic algorithms for constructing approximate matrix decompositions},
author={Halko, Nathan and Martinsson, Per-Gunnar and Tropp, Joel A},
journal={SIAM review},
year={2011},
publisher={SIAM},
url={https://www.jstor.org/stable/23065163}
}
@inproceedings{ghorbani2019investigation,
title={An investigation into neural net optimization via hessian eigenvalue density},
author={Ghorbani, Behrooz and Krishnan, Shankar and Xiao, Ying},
booktitle={International Conference on Machine Learning},
year={2019},
organization={PMLR},
url={https://arxiv.org/pdf/1901.10159.pdf}
}
@article{sagun2014explorations,
title={Explorations on high dimensional landscapes},
author={Sagun, Levent and Guney, V Ugur and Arous, Gerard Ben and LeCun, Yann},
journal={arXiv preprint arXiv:1412.6615},
year={2014},
url={https://arxiv.org/pdf/1412.6615.pdf}
}
@inproceedings{baity2018comparing,
title={Comparing dynamics: Deep neural networks versus glassy systems},
author={Baity-Jesi, Marco and Sagun, Levent and Geiger, Mario and Spigler, Stefano and Arous, G{\'e}rard Ben and Cammarota, Chiara and LeCun, Yann and Wyart, Matthieu and Biroli, Giulio},
booktitle={International Conference on Machine Learning},
year={2018},
url={https://arxiv.org/pdf/1803.06969.pdf},
organization={PMLR}
}
@article{gardner1988optimal,
title={Optimal storage properties of neural network models},
author={Gardner, Elizabeth and Derrida, Bernard},
journal={Journal of Physics A: Mathematical and general},
year={1988},
publisher={IOP Publishing},
url={http://www.phys.ens.fr/~derrida/PAPIERS/1988/optimal-storage.pdf}
}
@book{dotsenko1995introduction,
title={An introduction to the theory of spin glasses and neural networks},
author={Dotsenko, Viktor},
year={1995},
url={https://doi.org/10.1142/2460},
publisher={World Scientific}
}
@inproceedings{pennington2017nonlinear,
title={Nonlinear random matrix theory for deep learning},
author={Pennington, Jeffrey and Worah, Pratik},
booktitle={Advances in Neural Information Processing Systems},
year={2017},
url={https://papers.nips.cc/paper/2017/file/0f3d014eead934bbdbacb62a01dc4831-Paper.pdf}
}
@incollection{keating1993riemann,
title={The {Riemann} zeta-function and quantum chaology},
author={Keating, J},
booktitle={Quantum chaos},
year={1993},
publisher={Elsevier},
url={https://arxiv.org/pdf/0708.4223.pdf}
}
@article{odlyzko1987distribution,
title={On the distribution of spacings between zeros of the zeta function},
author={Odlyzko, Andrew M},
journal={Mathematics of Computation},
url={https://doi.org/10.1090/S0025-5718-1987-0866115-0},
year={1987}
}
@article{peche2019note,
title={A note on the {P}ennington-{W}orah distribution},
author={P{\'e}ch{\'e}, Sandrine},
journal={Electronic Communications in Probability},
year={2019},
url={https://doi.org/10.1214/19-ECP262},
publisher={The Institute of Mathematical Statistics and the Bernoulli Society}
}
@article{spielman2011spectral,
title={Spectral sparsification of graphs},
author={Spielman, Daniel A and Teng, Shang-Hua},
journal={SIAM Journal on Computing},
year={2011},
publisher={SIAM},
url={https://arxiv.org/pdf/0808.4134.pdf}
}
@article{paquette2020universality,
title={Universality for the conjugate gradient and MINRES algorithms on sample covariance matrices},
author={Paquette, Elliot and Trogdon, Thomas},
journal={arXiv preprint arXiv:2007.00640},
year={2020},
url={https://arxiv.org/pdf/2007.00640.pdf}
}
@inproceedings{williams2001using,
title={Using the Nystr{\"o}m method to speed up kernel machines},
author={Williams, Christopher and Seeger, Matthias},
booktitle={Proceedings of the 14th Annual Conference on Neural Information Processing Systems},
year={2001},
url={https://papers.nips.cc/paper/2000/hash/19de10adbaa1b2ee13f77f679fa1483a-Abstract.html}
}
@article{mei2019generalization,
title={The generalization error of random features regression: Precise asymptotics and double descent curve},
author={Mei, Song and Montanari, Andrea},
journal={arXiv preprint arXiv:1908.05355},
year={2019},
url={https://arxiv.org/pdf/1908.05355.pdf}
}
@article{drineas2005nystrom,
title={On the Nystr{\"o}m Method for Approximating a Gram Matrix for Improved Kernel-Based Learning.},
author={Drineas, Petros and Mahoney, Michael W and Cristianini, Nello},
journal={Journal of Machine Learning Research},
url={https://www.jmlr.org/papers/volume6/drineas05a/drineas05a.pdf},
year={2005},
}
@article{gower2015randomized,
title={Randomized iterative methods for linear systems},
author={Gower, Robert M and Richt{\'a}rik, Peter},
journal={SIAM Journal on Matrix Analysis and Applications},
year={2015},
url={https://arxiv.org/pdf/1506.03296.pdf},
publisher={SIAM}
}
@inproceedings{rudi2015less,
title={Less is More: Nystr{\"o}m Computational Regularization.},
author={Rudi, Alessandro and Camoriano, Raffaello and Rosasco, Lorenzo},
booktitle={Advances in Neural Information Processing Systems},
url={https://arxiv.org/pdf/1507.04717.pdf},
year={2015}
}
@article{drineas2018lectures,
title={Lectures on randomized numerical linear algebra},
author={Drineas, Petros and Mahoney, Michael W},
journal={The Mathematics of Data},
year={2018},
publisher={American Mathematical Soc.},
url={https://arxiv.org/pdf/1712.08880.pdf}
}
@article{amit1985spin,
title={Spin-glass models of neural networks},
author={Amit, Daniel J and Gutfreund, Hanoch and Sompolinsky, Haim},
journal={Physical Review A},
year={1985},
publisher={APS},
url={https://doi.org/10.1103/PhysRevA.32.1007}
}
@article{JMLR:v21:20-933,
author = {Vardan Papyan},
title = {Traces of Class/Cross-Class Structure Pervade Deep Learning Spectra},
journal = {Journal of Machine Learning Research},
year = {2020},
url = {http://jmlr.org/papers/v21/20-933.html}
}
@article{dauphin2014identifying,
title={Identifying and attacking the saddle point problem in high-dimensional non-convex optimization},
author={Dauphin, Yann and Pascanu, Razvan and Gulcehre, Caglar and Cho, Kyunghyun and Ganguli, Surya and Bengio, Yoshua},
journal={arXiv preprint arXiv:1406.2572},
year={2014},
url={https://arxiv.org/pdf/1406.2572.pdf}
}
@article{martin2018implicit,
title={Implicit self-regularization in deep neural networks: Evidence from random matrix theory and implications for learning},
author={Martin, Charles H and Mahoney, Michael W},
journal={arXiv preprint arXiv:1810.01075},
year={2018},
url={https://arxiv.org/pdf/1810.01075.pdf}
}
@article{baskerville2021applicability,
title={Applicability of Random Matrix Theory in Deep Learning},
author={Baskerville, Nicholas P and Granziol, Diego and Keating, Jonathan P},
journal={arXiv preprint arXiv:2102.06740},
year={2021},
url={https://arxiv.org/pdf/2102.06740.pdf}
}
@article{hastie2019surprises,
title={Surprises in high-dimensional ridgeless least squares interpolation},
author={Hastie, Trevor and Montanari, Andrea and Rosset, Saharon and Tibshirani, Ryan J},
journal={arXiv preprint arXiv:1903.08560},
url={https://arxiv.org/pdf/1903.08560.pdf},
year={2019}
}
@article {bai2005CLT,
AUTHOR = {Bai, Z. D. and Silverstein, Jack W.},
TITLE = {{CLT} for linear spectral statistics of large-dimensional sample covariance matrices},
JOURNAL = {Ann. Probab.},
FJOURNAL = {The Annals of Probability},
VOLUME = {32},
YEAR = {2004},
NUMBER = {1A},
PAGES = {553--605},
ISSN = {0091-1798},
MRCLASS = {60F05 (15A52 62G30 62H05)},
MRNUMBER = {2040792},
MRREVIEWER = {Sreenivasan Ravi},
DOI = {10.1214/aop/1078415845},
URL = {https://doi.org/10.1214/aop/1078415845},
}
@article{krbalek2000statistical,
title={The statistical properties of the city transport in Cuernavaca (Mexico) and random matrix ensembles},
author={Krb{\'a}lek, Milan and {\v{S}}eba, Petr},
journal={Journal of Physics A: Mathematical and General},
year={2000},
publisher={IOP Publishing},
url={https://arxiv.org/pdf/nlin/0001015.pdf}
}
@article{jagannath2017random,
title={Random matrices and the New York City subway system},
author={Jagannath, Aukosh and Trogdon, Thomas},
journal={Physical Review E},
year={2017},
publisher={APS},
url={https://arxiv.org/pdf/1703.02537.pdf}
}
@article{wigner1955characteristic,
title={Characteristic Vectors of Bordered Matrices With Infinite Dimensions},
author={Wigner, Eugene},
journal={Annals of Mathematics},
year={1955},
publisher={JSTOR},
url={https://doi.org/10.1007/978-3-662-02781-3_35}
}
@article{Kuczynski1992,
author = {Kuczy{\'{n}}ski, J and Wo{\'{z}}niakowski, H},
doi = {10.1137/0613066},
issn = {0895-4798},
journal = {SIAM Journal on Matrix Analysis and Applications},
month = {oct},
number = {4},
pages = {1094--1122},
title = {{Estimating the Largest Eigenvalue by the Power and Lanczos Algorithms with a Random Start}},
url = {http://epubs.siam.org/doi/10.1137/0613066},
volume = {13},
year = {1992}
}
@inproceedings{Deshpande,
author = {Deshpande, A and Spielman, D A},
booktitle = {46th Annual IEEE Symposium on Foundations of Computer Science (FOCS'05)},
doi = {10.1109/SFCS.2005.44},
isbn = {0-7695-2468-0},
pages = {349--356},
publisher = {IEEE},
title = {{Improved Smoothed Analysis of the Shadow Vertex Simplex Method}},
url = {http://ieeexplore.ieee.org/document/1530727/},
year = {2005}
}
@article{DiagonalRMT,
author = {Pfrang, C W and Deift, P and Menon, G},
journal = {Random matrix theory, interacting particle systems, and integrable systems, MSRI Publications},
pages = {411--442},
title = {{How long does it take to compute the eigenvalues of a random symmetric matrix?}},
url = {http://arxiv.org/abs/1203.4635},
volume = {65},
year = {2014}
}
@article{Vershynin2009a,
author = {Vershynin, R},
journal = {SIAM Journal on Computing},
title = {{Beyond Hirsch Conjecture: Walks on Random Polytopes and Smoothed Complexity of the Simplex Method}},
url = {http://epubs.siam.org/doi/10.1137/070683386},
volume = {39},
year = {2009}
}
@article{PhysRev.120.1698,
title = {"Repulsion of Energy Levels" in Complex Atomic Spectra},
author = {Rosenzweig, Norbert and Porter, Charles E.},
journal = {Phys. Rev.},
year = {1960},
url = {https://link.aps.org/doi/10.1103/PhysRev.120.1698}
}
@book{tao2012topics,
title={Topics in random matrix theory},
author={Tao, Terence},
year={2012},
publisher={American Mathematical Soc.}
}
@article{Liao2021,
abstract = {Given an optimization problem, the Hessian matrix and its eigenspectrum can be used in many ways, ranging from designing more efficient second-order algorithms to performing model analysis and regression diagnostics. When nonlinear models and non-convex problems are considered, strong simplifying assumptions are often made to make Hessian spectral analysis more tractable. This leads to the question of how relevant the conclusions of such analyses are for more realistic nonlinear models. In this paper, we exploit deterministic equivalent techniques from random matrix theory to make a \emph{precise} characterization of the Hessian eigenspectra for a broad family of nonlinear models, including models that generalize the classical generalized linear models, without relying on strong simplifying assumptions used previously. We show that, depending on the data properties, the nonlinear response model, and the loss function, the Hessian can have \emph{qualitatively} different spectral behaviors: of bounded or unbounded support, with single- or multi-bulk, and with isolated eigenvalues on the left- or right-hand side of the bulk. By focusing on such a simple but nontrivial nonlinear model, our analysis takes a step forward to unveil the theoretical origin of many visually striking features observed in more complex machine learning models.},
archivePrefix = {arXiv},
arxivId = {2103.01519},
author = {Liao, Z and Mahoney, M W},
journal = {arXiv preprint arXiv:2103.01519},
eprint = {2103.01519},
month = {mar},
title = {{Hessian Eigenspectra of More Realistic Nonlinear Models}},
url = {http://arxiv.org/abs/2103.01519},
year = {2021}
}
@article{Bourgade2014,
author = {Bourgade, P and Erdős, L and Yau, H-T},
doi = {10.1007/s00220-014-2120-z},
issn = {0010-3616},
journal = {Communications in Mathematical Physics},
month = {nov},
number = {1},
pages = {261--353},
title = {{Edge Universality of Beta Ensembles}},
url = {http://link.springer.com/10.1007/s00220-014-2120-z},
volume = {332},
year = {2014}
}
@book{DeiftOrthogonalPolynomials,
address = {Providence, RI},
author = {Deift, P},
pages = {257},
publisher = {Amer. Math. Soc.},
title = {{Orthogonal Polynomials and Random Matrices: a Riemann-Hilbert Approach}},
year = {2000}
}
@article{Paquette2021,
abstract = {We propose a new framework, inspired by random matrix theory, for analyzing the dynamics of stochastic gradient descent (SGD) when both number of samples and dimensions are large. This framework applies to any fixed stepsize and the finite sum setting. Using this new framework, we show that the dynamics of SGD on a least squares problem with random data become deterministic in the large sample and dimensional limit. Furthermore, the limiting dynamics are governed by a Volterra integral equation. This model predicts that SGD undergoes a phase transition at an explicitly given critical stepsize that ultimately affects its convergence rate, which we also verify experimentally. Finally, when input data is isotropic, we provide explicit expressions for the dynamics and average-case convergence rates (i.e., the complexity of an algorithm averaged over all possible inputs). These rates show significant improvement over the worst-case complexities.},
archivePrefix = {arXiv},
arxivId = {2102.04396},
author = {Paquette, C and Lee, K and Pedregosa, F and Paquette, E},
eprint = {2102.04396},
journal = {arXiv preprint arXiv:2102.04396},
month = {feb},
title = {{SGD in the Large: Average-case Analysis, Asymptotics, and Stepsize Criticality}},
url = {http://arxiv.org/abs/2102.04396},
year = {2021}
}
@article{Erdos2013a,
author = {Erdős, L and Knowles, A and Yau, H-T and Yin, J},
doi = {10.1214/11-AOP734},
issn = {0091-1798},
journal = {The Annals of Probability},
month = {may},
number = {3B},
title = {{Spectral statistics of Erd{\H{o}}s--R{\'e}nyi graphs I: Local semicircle law}},
url = {https://doi.org/10.1214/11-AOP734},
volume = {41},
year = {2013}
}
@article{Baik2005,
author = {Baik, J and {Ben Arous}, G and P{\'{e}}ch{\'{e}}, S},
doi = {10.1214/009117905000000233},
issn = {0091-1798},
journal = {The Annals of Probability},
month = {sep},
number = {5},
title = {{Phase transition of the largest eigenvalue for nonnull complex sample covariance matrices}},
url = {https://projecteuclid.org/journals/annals-of-probability/volume-33/issue-5/Phase-transition-of-the-largest-eigenvalue-for-nonnull-complex-sample/10.1214/009117905000000233.full},
volume = {33},
year = {2005}
}
@article{Bloemendal2013,
author = {Bloemendal, A and Vir{\'{a}}g, B},
doi = {10.1007/s00440-012-0443-2},
issn = {0178-8051},
journal = {Probability Theory and Related Fields},
month = {aug},
number = {3-4},
pages = {795--825},
title = {{Limits of spiked random matrices I}},
url = {http://link.springer.com/10.1007/s00440-012-0443-2},
volume = {156},
year = {2013}
}
@article{Ding2019,
abstract = {We introduce a class of separable sample covariance matrices of the form $\widetilde{\mathcal{Q}}_1 := \widetilde{A}^{1/2} X \widetilde{B} X^* \widetilde{A}^{1/2}$. Here $\widetilde{A}$ and $\widetilde{B}$ are positive definite matrices whose spectrums consist of bulk spectrums plus several spikes, i.e. larger eigenvalues that are separated from the bulks. Conceptually, we call $\widetilde{\mathcal{Q}}_1$ a \emph{spiked separable covariance matrix model}. On the one hand, this model includes the spiked covariance matrix as a special case with $\widetilde{B}=I$. On the other hand, it allows for more general correlations of datasets. In particular, for spatio-temporal datasets, $\widetilde{A}$ and $\widetilde{B}$ represent the spatial and temporal correlations, respectively. In this paper, we study the outlier eigenvalues and eigenvectors, i.e. the principal components, of the spiked separable covariance model $\widetilde{\mathcal{Q}}_1$. We prove the convergence of the outlier eigenvalues $\widetilde{\lambda}_i$ and the generalized components (i.e. $\langle \mathbf{v}, \widetilde{\mathbf{\xi}}_i \rangle$ for any deterministic vector $\mathbf{v}$) of the outlier eigenvectors $\widetilde{\mathbf{\xi}}_i$ with optimal convergence rates. Moreover, we also prove the delocalization of the non-outlier eigenvectors. We state our results in full generality, in the sense that they also hold near the so-called BBP transition and for degenerate outliers. Our results highlight both the similarity and difference between the spiked separable covariance matrix model and the spiked covariance model. In particular, we show that the spikes of both $\widetilde{A}$ and $\widetilde{B}$ will cause outliers of the eigenvalue spectrum, and the eigenvectors can help us to select the outliers that correspond to the spikes of $\widetilde{A}$ (or $\widetilde{B}$).},
archivePrefix = {arXiv},
arxivId = {1905.13060},
author = {Ding, X and Yang, F},
eprint = {1905.13060},
journal = {arXiv preprint arXiv:1905.13060},
month = {may},
title = {{Spiked separable covariance matrices and principal components}},
url = {http://arxiv.org/abs/1905.13060},
year = {2019}
}
@article{Paquette2020a,
abstract = {Average-case analysis computes the complexity of an algorithm averaged over all possible inputs. Compared to worst-case analysis, it is more representative of the typical behavior of an algorithm, but remains largely unexplored in optimization. One difficulty is that the analysis can depend on the probability distribution of the inputs to the model. However, we show that this is not the case for a class of large-scale problems trained with gradient descent including random least squares and one-hidden layer neural networks with random weights. In fact, the halting time exhibits a universality property: it is independent of the probability distribution. With this barrier for average-case analysis removed, we provide the first explicit average-case convergence rates showing a tighter complexity not captured by traditional worst-case analysis. Finally, numerical simulations suggest this universality property holds for a more general class of algorithms and problems.},
archivePrefix = {arXiv},
arxivId = {2006.04299},
author = {Paquette, C and van Merri{\"{e}}nboer, B and Pedregosa, F},
eprint = {2006.04299},
journal = {arXiv preprint arXiv:2006.04299},
month = {jun},
title = {{Halting Time is Predictable for Large Models: A Universality Property and Average-case Analysis}},
url = {http://arxiv.org/abs/2006.04299},
year = {2020}
}
@article{Hestenes1952,
author = {Hestenes, M and Stiefel, E},
journal = {J. Research Nat. Bur. Standards},
pages = {409--436},
title = {{Methods of Conjugate Gradients for Solving Linear Systems}},
volume = {49},
year = {1952}
}
@article{Ding2021,
abstract = {We consider the conjugate gradient algorithm applied to a general class of spiked sample covariance matrices. The main result of the paper is that the norms of the error and residual vectors at any finite step concentrate on deterministic values determined by orthogonal polynomials with respect to a deformed Marchenko--Pastur law. The first-order limits and fluctuations are shown to be universal. Additionally, for the case where the bulk eigenvalues lie in a single interval we show a stronger universality result in that the asymptotic rate of convergence of the conjugate gradient algorithm only depends on the support of the bulk, provided the spikes are well-separated from the bulk. In particular, this shows that the classical condition number bound for the conjugate gradient algorithm is pessimistic for spiked matrices.},
archivePrefix = {arXiv},
arxivId = {2106.13902},
author = {Ding, X and Trogdon, T},
eprint = {2106.13902},
journal = {arXiv preprint arXiv:2106.13902},
month = {jun},
title = {{The conjugate gradient algorithm on a general class of spiked covariance matrices}},
url = {http://arxiv.org/abs/2106.13902},
year = {2021}
}
@article{Dadush2020,
author = {Dadush, D and Huiberts, S},
doi = {10.1137/18M1197205},
issn = {0097-5397},
journal = {SIAM Journal on Computing},
month = {jan},
number = {5},
pages = {STOC18--449--STOC18--499},
title = {{A Friendly Smoothed Analysis of the Simplex Method}},
url = {https://epubs.siam.org/doi/10.1137/18M1197205},
volume = {49},
year = {2020}
}
@incollection{Dantzig1990,
address = {New York, NY, USA},
author = {Dantzig, G B},
booktitle = {A history of scientific computing},
doi = {10.1145/87252.88081},
month = {jun},
pages = {141--151},
publisher = {ACM},
title = {{Origins of the simplex method}},
url = {http://dl.acm.org/doi/10.1145/87252.88081},
year = {1990}
}
@article{kaczmarz1937angenaherte,
title={Angen{\"a}herte Aufl{\"o}sung von {S}ystemen linearer {G}leichungen},
author={Kaczmarz, S},
journal={Bulletin International de l'Acad{\'e}mie Polonaise des Sciences et des Lettres},
note={English translation by Jason Stockmann},
year={1937}
}
@article{TracyWidom,
author = {Tracy, C A and Widom, H},
journal = {Comm. Math. Phys.},
pages = {151--174},
title = {{Level-spacing distributions and the Airy kernel}},
volume = {159},
year = {1994}
}
@article{Deift2017,
arxivId = {1701.01896},
author = {Deift, P and Trogdon, T},
journal = {SIAM Journal on Numerical Analysis},
title = {{Universality for Eigenvalue Algorithms on Sample Covariance Matrices}},
url = {http://arxiv.org/abs/1701.01896},
year = {2017}
}
@article{Kostlan1988,
abstract = {In this paper the statistical properties of problems that occur in numerical linear algebra are studied. Bounds are calculated for the average performance of the power method for the calculation of eigenvectors of symmetric and Hermitian matrices, thus an upper bound is found for the average complexity of eigenvector calculation. The condition number of a matrix is studied and sharp bounds are calculated for the average (and the variance) loss of precision encountered when one solves a system of linear equations.},
author = {Kostlan, E},
doi = {10.1016/0377-0427(88)90402-5},
issn = {03770427},
journal = {Journal of Computational and Applied Mathematics},
keywords = {Complexity,Gaussian ensembles,condition number,large systems,linear algebra,numerical analysis,power method,probability densities,random matrices},
month = {jun},
number = {2-3},
pages = {219--230},
title = {{Complexity theory of numerical linear algebra}},
url = {http://www.sciencedirect.com/science/article/pii/0377042788904025},
volume = {22},
year = {1988}
}
@article{Spielman2004,
author = {Spielman, D A and Teng, S-H},
doi = {10.1145/990308.990310},
issn = {00045411},
journal = {Journal of the ACM},
keywords = {Simplex method,complexity,perturbation,smoothed analysis},
month = {may},
number = {3},
pages = {385--463},
publisher = {ACM},
title = {{Smoothed analysis of algorithms: Why the simplex algorithm usually takes polynomial time}},
url = {http://dl.acm.org/citation.cfm?id=990308.990310},
volume = {51},
year = {2004}
}
@incollection{Dantzig1951,
author = {Dantzig, G B},
booktitle = {Activity Analysis of Production and Allocation},
isbn = {0804748349},
pages = {339--347},
title = {{Maximization of a linear function of variables subject to linear inequalities}},
url = {http://cowles.econ.yale.edu/P/cm/m13/m13-21.pdf},
year = {1951}
}
@article{Smale1983,
author = {Smale, S},
doi = {10.1007/BF02591902},
issn = {0025-5610},
journal = {Mathematical Programming},
month = {oct},
number = {3},
pages = {241--262},
title = {{On the average number of steps of the simplex method of linear programming}},
url = {http://link.springer.com/10.1007/BF02591902},
volume = {27},
year = {1983}
}
@book{Borgwardt1987,
address = {Berlin, Heidelberg},
author = {Borgwardt, K H},
publisher = {Springer-Verlag},
title = {{The simplex method: A probabilistic analysis}},
url={https://www.springer.com/gp/book/9783540170969},
year = {1987}
}
@article{Strohmer2009,
author = {Strohmer, T and Vershynin, R},
doi = {10.1007/s00041-008-9030-4},
issn = {1069-5869},
journal = {Journal of Fourier Analysis and Applications},
month = {apr},
number = {2},
pages = {262--278},
title = {{A Randomized Kaczmarz Algorithm with Exponential Convergence}},
url = {http://link.springer.com/10.1007/s00041-008-9030-4},
volume = {15},
year = {2009}
}
@article{deift2014universality,
title={Universality in numerical computations with random data},
author={Deift, Percy A and Menon, Govind and Olver, Sheehan and Trogdon, Thomas},
journal={Proceedings of the National Academy of Sciences},
year={2014},
publisher={National Acad Sciences},
url={https://doi.org/10.1073/pnas.1413446111}
}
@article{sagun2015universal,
title={Universal halting times in optimization and machine learning},
author={Sagun, Levent and Trogdon, Thomas and LeCun, Yann},
journal={Quarterly of Applied Mathematics},
year={2017},
url={https://doi.org/10.1090/qam/1483}
}
@article{goldstine1951numerical,
title={Numerical inverting of matrices of high order. II},
author={Goldstine, Herman H and von Neumann, John},
journal={Proceedings of the American Mathematical Society},
year={1951},
url={https://doi.org/10.2307/2032484}
}
@article{Knuth92,
author = "D.E. Knuth",
title = "Two notes on notation",
journal = "Amer. Math. Monthly",
volume = "99",
year = "1992",
pages = "403--422",
}
@article{Knowles2017,
author = {Knowles, A and Yin, J},
doi = {10.1007/s00440-016-0730-4},
issn = {0178-8051},
journal = {Probability Theory and Related Fields},
month = {oct},
number = {1-2},
pages = {257--352},
title = {{Anisotropic local laws for random matrices}},
url = {http://link.springer.com/10.1007/s00440-016-0730-4},
volume = {169},
year = {2017}
}
@book{ConcreteMath,
author = "R.L. Graham and D.E. Knuth and O. Patashnik",
title = "Concrete mathematics",
publisher = "Addison-Wesley",
address = "Reading, MA",
year = "1989"
}
@incollection{Er01,
author = "P. Erd{\H o}s",
title = "A selection of problems and results in combinatorics",
booktitle = "Recent trends in combinatorics (Matrahaza, 1995)",
publisher = "Cambridge Univ. Press",
address = "Cambridge",
pages = "1--6",
year = "1995"
}
@article{greenwade93,
author = "George D. Greenwade",
title = "The {C}omprehensive {T}ex {A}rchive {N}etwork ({CTAN})",
year = "1993",
journal = "TUGBoat",
volume = "14",
number = "3",
pages = "342--351"
}