refs.bib

% Goto and van de Geijn, ACM TOMS 34(3), 2008.
% The "van de" particle is written before the surname so that it is parsed
% as the von part rather than as part of the given name.
@article{goto2008anatomy,
  author     = {Goto, Kazushige and van de Geijn, Robert A.},
  title      = {Anatomy of High-Performance Matrix Multiplication},
  journal    = {ACM Trans. Math. Softw.},
  volume     = {34},
  number     = {3},
  articleno  = {12},
  numpages   = {25},
  month      = may,
  year       = {2008},
  issue_date = {May 2008},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {0098-3500},
  doi        = {10.1145/1356052.1356053},
  url        = {https://doi.org/10.1145/1356052.1356053},
  keywords   = {matrix multiplication, Linear algebra, basic linear algebra subprograms},
}

% IBM corporate history archive (web page).
@online{ibmarchive,
  title        = {{IBM - Archives - History of IBM - United States}},
  organization = {International Business Machines},
  url          = {https://www.ibm.com/ibm/history/history/history_intro.html},
  date         = {2003-01-23},
  urldate      = {2020-09-15},
}

% Montoye, Hokenek, and Runyon, IBM J. Res. Dev. 34(1), 1990.
@article{montoye1990design,
  author  = {Montoye, R. K. and Hokenek, E. and Runyon, S. L.},
  title   = {Design of the {IBM} {RISC System/6000} Floating-Point Execution Unit},
  journal = {IBM Journal of Research and Development},
  volume  = {34},
  number  = {1},
  pages   = {59--70},
  year    = {1990},
}

% Tomasulo, IBM J. Res. Dev. 11(1), 1967.
@article{tomasulo1967efficient,
  author  = {Tomasulo, R. M.},
  title   = {An Efficient Algorithm for Exploiting Multiple Arithmetic Units},
  journal = {IBM Journal of Research and Development},
  volume  = {11},
  number  = {1},
  pages   = {25--33},
  year    = {1967},
}

% Blue and Grother, SPIE proceedings volume 1661, 1992.
@inproceedings{blue1992training,
  author       = {Blue, James L. and Grother, Patrick J.},
  title        = {Training Feed-Forward Neural Networks Using Conjugate Gradients},
  booktitle    = {Machine Vision Applications in Character Recognition and Industrial Inspection},
  editor       = {D'Amato, Donald P. and Blanz, Wolf-Ekkehard and Dom, Byron E. and Srihari, Sargur N.},
  volume       = {1661},
  pages        = {179--190},
  organization = {International Society for Optics and Photonics},
  publisher    = {SPIE},
  year         = {1992},
  doi          = {10.1117/12.130286},
  url          = {https://doi.org/10.1117/12.130286},
}

% Rojas, "Neural Networks", Springer, 1996 (whole book).
% The DOI below is the book-level identifier matching the ISBN; a DOI with a
% "_7" suffix addresses a single chapter and belongs on an inbook entry.
% NOTE(review): if only chapter 7 is meant to be cited, switch the entry type.
@book{rojas1996neural,
  author    = {Rojas, Ra{\'u}l},
  title     = {Neural Networks: A Systematic Introduction},
  publisher = {Springer Berlin Heidelberg},
  address   = {Berlin, Heidelberg},
  year      = {1996},
  isbn      = {978-3-642-61068-4},
  doi       = {10.1007/978-3-642-61068-4},
  url       = {https://doi.org/10.1007/978-3-642-61068-4},
}

% Flynn, IEEE Trans. Comput. C-21(9), 1972.
@article{flynn1972some,
  author  = {Flynn, M. J.},
  title   = {Some Computer Organizations and Their Effectiveness},
  journal = {IEEE Transactions on Computers},
  volume  = {C-21},
  number  = {9},
  pages   = {948--960},
  year    = {1972},
  doi     = {10.1109/TC.1972.5009071},
}

% Barnes et al., IEEE Trans. Comput. C-17(8), 1968.
@article{barnes1968illiac,
  author  = {Barnes, G. H. and Brown, R. M. and Kato, M. and Kuck, D. J. and Slotnick, D. L. and Stokes, R. A.},
  title   = {The {ILLIAC IV} Computer},
  journal = {IEEE Transactions on Computers},
  volume  = {C-17},
  number  = {8},
  pages   = {746--757},
  year    = {1968},
  doi     = {10.1109/TC.1968.229158},
}

% Tyler et al., IEEE IPCCC 1999.
% The "/sup TM/" sequences from the IEEE export were superscript trademark
% signs; they are written with a LaTeX macro here.
@inproceedings{tyler1999altivec,
  author    = {Tyler, J. and Lent, J. and Mather, A. and Nguyen, Huy},
  title     = {{AltiVec}{\texttrademark}: Bringing Vector Technology to the {PowerPC}{\texttrademark} Processor Family},
  booktitle = {1999 {IEEE} International Performance, Computing and Communications Conference},
  pages     = {437--444},
  year      = {1999},
  doi       = {10.1109/PCCC.1999.749469},
}

% LLVM Language Reference Manual (web documentation).
% The corporate author is double-braced so it is treated as one indivisible
% name instead of given name "LLVM" plus surname "Foundation".
@online{llvmLangref,
  author  = {{LLVM Foundation}},
  title   = {{LLVM} Language Reference Manual},
  year    = {2020},
  url     = {https://llvm.org/docs/LangRef.html},
  urldate = {2021-01-04},
}

% Kuzma et al., manuscript under revision.
@unpublished{kuzma2021fast,
  author = {Kuzma, Braedy and Korostelev, Ivan and de Carvalho, João P. L. and Moreira, José and Barton, Christopher and Araujo, Guido and Amaral, José Nelson},
  title  = {Fast Matrix Multiplication via Compiler-only Layered Data Reorganization and Intrinsic Lowering},
  note   = {under revision},
}

% Eisen et al., IBM J. Res. Dev. 51(6), 2007.
% NOTE(review): pages 1--21 look like per-article numbering from the export;
% the DOI suffix (rd.516.0663) suggests a different start page -- verify
% against the journal issue.
@article{eisen2007ibm,
  author  = {Eisen, L. and Ward, J. W. and Tast, H.-W. and Mading, N. and Leenstra, J. and Mueller, S. M. and Jacobi, C. and Preiss, J. and Schwarz, E. M. and Carlough, S. R.},
  title   = {{IBM} {POWER6} Accelerators: {VMX} and {DFU}},
  journal = {IBM Journal of Research and Development},
  volume  = {51},
  number  = {6},
  pages   = {1--21},
  year    = {2007},
  doi     = {10.1147/rd.516.0663},
}

% Lattner, master's thesis, UIUC, December 2002.
% The project URL lives in the url field so the style formats it, rather
% than being hard-coded with font commands inside a note.
@mastersthesis{lattner2002llvm,
  author  = {Lattner, Chris},
  title   = {{LLVM}: An Infrastructure for Multi-Stage Optimization},
  school  = {Computer Science Dept., University of Illinois at Urbana-Champaign},
  address = {Urbana, IL},
  month   = dec,
  year    = {2002},
  url     = {http://llvm.cs.uiuc.edu},
}

% Lattner and Adve, CGO 2004.
% The ampersand in the title was dropped by the publisher export and is
% restored here.
@inproceedings{lattner2004llvm,
  author    = {Lattner, Chris and Adve, Vikram},
  title     = {{LLVM}: A Compilation Framework for Lifelong Program Analysis \& Transformation},
  booktitle = {International Symposium on Code Generation and Optimization ({CGO} 2004)},
  pages     = {75--86},
  year      = {2004},
  doi       = {10.1109/CGO.2004.1281665},
}

% Grosser et al., IMPACT workshop 2011.
% NOTE(review): pages = 1 looks like a placeholder from an automated export;
% confirm or remove.
@inproceedings{grosser2011polly,
  author    = {Grosser, Tobias and Zheng, Hongbin and Aloor, Raghesh and Simb{\"u}rger, Andreas and Gr{\"o}{\ss}linger, Armin and Pouchet, Louis-No{\"e}l},
  title     = {{Polly} -- Polyhedral Optimization in {LLVM}},
  booktitle = {Proceedings of the First International Workshop on Polyhedral Compilation Techniques ({IMPACT})},
  pages     = {1},
  year      = {2011},
}

% Alves et al., SIGPLAN Notices 50(10), 2015.
@article{alves2015runtime,
  author     = {Alves, P{\'e}ricles and Gruber, Fabian and Doerfert, Johannes and Lamprineas, Alexandros and Grosser, Tobias and Rastello, Fabrice and Pereira, Fernando Magno Quint{\~a}o},
  title      = {Runtime Pointer Disambiguation},
  journal    = {SIGPLAN Not.},
  volume     = {50},
  number     = {10},
  pages      = {589--606},
  numpages   = {18},
  month      = oct,
  year       = {2015},
  issue_date = {October 2015},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {0362-1340},
  doi        = {10.1145/2858965.2814285},
  url        = {https://doi.org/10.1145/2858965.2814285},
  keywords   = {Alias analysis, dynamic guards, optimization},
}

% Sui and Xue, CC 2016.
@inproceedings{sui2016interprocedural,
  author    = {Sui, Yulei and Xue, Jingling},
  title     = {{SVF}: Interprocedural Static Value-Flow Analysis in {LLVM}},
  booktitle = {Proceedings of the 25th International Conference on Compiler Construction},
  series    = {CC 2016},
  location  = {Barcelona, Spain},
  pages     = {265--266},
  numpages  = {2},
  publisher = {Association for Computing Machinery},
  year      = {2016},
  isbn      = {9781450342414},
  doi       = {10.1145/2892208.2892235},
  url       = {https://doi.org/10.1145/2892208.2892235},
  keywords  = {Pointer Analysis, Value-Flow, SVF},
}

% Hardekopf and Lin, POPL 2009.
@inproceedings{hardekopf2009semi,
  author    = {Hardekopf, Ben and Lin, Calvin},
  title     = {Semi-Sparse Flow-Sensitive Pointer Analysis},
  booktitle = {Proceedings of the 36th Annual ACM SIGPLAN-SIGACT Symposium on Principles of Programming Languages},
  series    = {POPL '09},
  location  = {Savannah, GA, USA},
  pages     = {226--238},
  numpages  = {13},
  publisher = {Association for Computing Machinery},
  year      = {2009},
  isbn      = {9781605583792},
  doi       = {10.1145/1480881.1480911},
  url       = {https://doi.org/10.1145/1480881.1480911},
  keywords  = {pointer analysis, alias analysis},
}

% Castaneda Lozano et al., ACM TOPLAS 41(3), 2019.
% Two authors have compound surnames; the comma form keeps both words of each
% surname together.
% NOTE(review): surname splits follow the authors' usual citation form --
% confirm against the published paper.
@article{lozano2019combinatorial,
  author     = {Casta{\~n}eda Lozano, Roberto and Carlsson, Mats and Hjort Blindell, Gabriel and Schulte, Christian},
  title      = {Combinatorial Register Allocation and Instruction Scheduling},
  journal    = {ACM Trans. Program. Lang. Syst.},
  volume     = {41},
  number     = {3},
  articleno  = {17},
  numpages   = {53},
  month      = jul,
  year       = {2019},
  issue_date = {July 2019},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {0164-0925},
  doi        = {10.1145/3332373},
  url        = {https://doi.org/10.1145/3332373},
  keywords   = {register allocation, instruction scheduling, Combinatorial optimization},
}

% Quintao Pereira and Palsberg, SIGPLAN Notices 43(6), 2008.
@article{pereira2008register,
  author     = {Quint{\~a}o Pereira, Fernando Magno and Palsberg, Jens},
  title      = {Register Allocation by Puzzle Solving},
  journal    = {SIGPLAN Not.},
  volume     = {43},
  number     = {6},
  pages      = {216--226},
  numpages   = {11},
  month      = jun,
  year       = {2008},
  issue_date = {June 2008},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {0362-1340},
  doi        = {10.1145/1379022.1375609},
  url        = {https://doi.org/10.1145/1379022.1375609},
  keywords   = {register allocation, puzzle solving, register aliasing},
}

% Cytron et al., POPL 1989 (conference version of the SSA construction paper).
% Author initials carry periods so the names match cytron1991efficiently.
@inproceedings{cytron1989efficient,
  author    = {Cytron, Ron and Ferrante, Jeanne and Rosen, Barry K. and Wegman, Mark N. and Zadeck, F. Kenneth},
  title     = {An Efficient Method of Computing Static Single Assignment Form},
  booktitle = {Proceedings of the 16th {ACM SIGPLAN-SIGACT} Symposium on Principles of Programming Languages},
  pages     = {25--35},
  year      = {1989},
}

% Rosen, Wegman, and Zadeck, POPL 1988.
% Author initials carry periods so the names match cytron1991efficiently.
@inproceedings{rosen1988global,
  author    = {Rosen, Barry K. and Wegman, Mark N. and Zadeck, F. Kenneth},
  title     = {Global Value Numbers and Redundant Computations},
  booktitle = {Proceedings of the 15th {ACM SIGPLAN-SIGACT} Symposium on Principles of Programming Languages},
  pages     = {12--27},
  year      = {1988},
}

% Alpern, Wegman, and Zadeck, POPL 1988.
% Author initials carry periods so the names match cytron1991efficiently.
@inproceedings{alpern1988detecting,
  author    = {Alpern, Bowen and Wegman, Mark N. and Zadeck, F. Kenneth},
  title     = {Detecting Equality of Variables in Programs},
  booktitle = {Proceedings of the 15th {ACM SIGPLAN-SIGACT} Symposium on Principles of Programming Languages},
  pages     = {1--11},
  year      = {1988},
}

% Cytron et al., ACM TOPLAS 13(4), 1991.
@article{cytron1991efficiently,
  author     = {Cytron, Ron and Ferrante, Jeanne and Rosen, Barry K. and Wegman, Mark N. and Zadeck, F. Kenneth},
  title      = {Efficiently Computing Static Single Assignment Form and the Control Dependence Graph},
  journal    = {ACM Trans. Program. Lang. Syst.},
  volume     = {13},
  number     = {4},
  pages      = {451--490},
  numpages   = {40},
  month      = oct,
  year       = {1991},
  issue_date = {Oct. 1991},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {0164-0925},
  doi        = {10.1145/115372.115320},
  url        = {https://doi.org/10.1145/115372.115320},
  keywords   = {optimizing compilers, def-use chain, dominator, control dependence, control flow graph},
}

% Brandis and Moessenboeck, ACM TOPLAS 16(6), 1994.
@article{brandis1994single,
  author     = {Brandis, Marc M. and M{\"o}ssenb{\"o}ck, Hanspeter},
  title      = {Single-Pass Generation of Static Single-Assignment Form for Structured Languages},
  journal    = {ACM Trans. Program. Lang. Syst.},
  volume     = {16},
  number     = {6},
  pages      = {1684--1698},
  numpages   = {15},
  month      = nov,
  year       = {1994},
  issue_date = {Nov. 1994},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {0164-0925},
  doi        = {10.1145/197320.197331},
  url        = {https://doi.org/10.1145/197320.197331},
  keywords   = {static single-assignment form, structured languages, dominator tree},
}

% Cocke, Symposium on Compiler Optimization, 1970 (proceedings version).
% NOTE(review): same title, author, year and pages as cocke1970global, which
% records the SIGPLAN Notices printing; consider citing only one of them.
% The DOI-style key is kept because documents may already cite it.
@inproceedings{10.1145/800028.808480,
  author    = {Cocke, John},
  title     = {Global Common Subexpression Elimination},
  booktitle = {Proceedings of a Symposium on Compiler Optimization},
  location  = {Urbana-Champaign, Illinois},
  pages     = {20--24},
  numpages  = {5},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  year      = {1970},
  isbn      = {9781450373869},
  doi       = {10.1145/800028.808480},
  url       = {https://doi.org/10.1145/800028.808480},
  abstract  = {When considering compiler optimization, there are two questions that immediately come to mind; one, why and to what extent is optimization necessary and two, to what extent is it possible. When considering the second question, one might immediately become discouraged since it is well known that the program equivalency problem is recursively unsolvable. It is, of course, clear from this that there will never be techniques for generating a completely optimum program. These unsolvability results, however, do not preclude the possibility of ad hoc techniques for program improvement or even a partial theory which produces a class of equivalent programs optimized in varying degrees. The reasons why optimization is required seem to me to fall in two major categories. The first I will call ``local'' and the second ``global''.},
}

% Cocke, SIGPLAN Notices 5(7), 1970.
@article{cocke1970global,
  author     = {Cocke, John},
  title      = {Global Common Subexpression Elimination},
  journal    = {SIGPLAN Not.},
  volume     = {5},
  number     = {7},
  pages      = {20--24},
  numpages   = {5},
  month      = jul,
  year       = {1970},
  issue_date = {July 1970},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {0362-1340},
  doi        = {10.1145/390013.808480},
  url        = {https://doi.org/10.1145/390013.808480},
}

% Domke et al., IPDPS 2021.
% Two author names carried a stray trailing "T" (an affiliation marker from
% the export); it has been removed.
@inproceedings{domke2021matrix,
  author    = {Domke, J. and Vatai, E. and Drozd, A. and Chen, P. and Oyama, Y. and Zhang, L. and Salaria, S. and Mukunoki, D. and Podobas, A. and Wahib, M. and Matsuoka, S.},
  title     = {Matrix Engines for High Performance Computing: A Paragon of Performance or Grasping at Straws?},
  booktitle = {2021 {IEEE} International Parallel and Distributed Processing Symposium ({IPDPS})},
  pages     = {1056--1065},
  publisher = {IEEE Computer Society},
  address   = {Los Alamitos, CA, USA},
  month     = may,
  year      = {2021},
  doi       = {10.1109/IPDPS49936.2021.00114},
  url       = {https://doi.ieeecomputersociety.org/10.1109/IPDPS49936.2021.00114},
  keywords  = {deep learning;program processors;tensors;machine learning algorithms;benchmark testing;throughput;supercomputers},
}

% Wang and Kanwar, Google Cloud blog post, 2019.
% A blog post is a web resource, so it is typed as an online entry.
% NOTE(review): confirm the URL and add urldate when it is next accessed.
@online{wang2019bfloat16,
  author       = {Wang, Shibo and Kanwar, Pankaj},
  title        = {{BFloat16}: The Secret to High Performance on {Cloud TPUs}},
  organization = {Google Cloud Blog},
  year         = {2019},
  url          = {https://cloud.google.com/blog/products/ai-machine-learning/bfloat16-the-secret-to-high-performance-on-cloud-tpus},
}

% Nakasato, SIGMETRICS Perform. Eval. Rev. 38(4), 2011.
@article{nakasato2011fast,
  author     = {Nakasato, Naohito},
  title      = {A Fast {GEMM} Implementation on the {Cypress} {GPU}},
  journal    = {SIGMETRICS Perform. Eval. Rev.},
  volume     = {38},
  number     = {4},
  pages      = {50--55},
  numpages   = {6},
  month      = mar,
  year       = {2011},
  issue_date = {March 2011},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {0163-5999},
  doi        = {10.1145/1964218.1964227},
  url        = {https://doi.org/10.1145/1964218.1964227},
}

% Yu, Cai, and Li, ECCV 2020 (Springer proceedings).
@inproceedings{yu2020toward,
  author    = {Yu, Tan and Cai, Yunfeng and Li, Ping},
  editor    = {Vedaldi, Andrea and Bischof, Horst and Brox, Thomas and Frahm, Jan-Michael},
  title     = {Toward Faster and Simpler Matrix Normalization via Rank-1 Update},
  booktitle = {Computer Vision -- ECCV 2020},
  pages     = {203--219},
  publisher = {Springer International Publishing},
  address   = {Cham},
  year      = {2020},
  isbn      = {978-3-030-58529-7},
}

% Pal et al., HPCA 2018.
@inproceedings{pal2018outerspace,
  author    = {Pal, Subhankar and Beaumont, Jonathan and Park, Dong-Hyeon and Amarnath, Aporva and Feng, Siying and Chakrabarti, Chaitali and Kim, Hun-Seok and Blaauw, David and Mudge, Trevor and Dreslinski, Ronald},
  title     = {{OuterSPACE}: An Outer Product Based Sparse Matrix Multiplication Accelerator},
  booktitle = {2018 {IEEE} International Symposium on High Performance Computer Architecture ({HPCA})},
  pages     = {724--736},
  year      = {2018},
  doi       = {10.1109/HPCA.2018.00067},
}

% Srivastava et al., MICRO 2020.
@inproceedings{srivastava2020matraptor,
  author    = {Srivastava, Nitish and Jin, Hanchen and Liu, Jie and Albonesi, David and Zhang, Zhiru},
  title     = {{MatRaptor}: A Sparse-Sparse Matrix Multiplication Accelerator Based on Row-Wise Product},
  booktitle = {2020 53rd Annual {IEEE/ACM} International Symposium on Microarchitecture ({MICRO})},
  pages     = {766--780},
  year      = {2020},
  doi       = {10.1109/MICRO50266.2020.00068},
}

% Wu and Jaja, Parallel Processing Letters 26(2), 2016.
% The pages field holds the article number used by this journal.
% No eprint field: that field is for preprint-archive identifiers, and the
% DOI resolver link is already given by doi and url.
@article{wu2016achieving,
  author  = {Wu, Jing and Jaja, Joseph},
  title   = {Achieving Native {GPU} Performance for Out-of-Card Large Dense Matrix Multiplication},
  journal = {Parallel Processing Letters},
  volume  = {26},
  number  = {2},
  pages   = {1650007},
  year    = {2016},
  doi     = {10.1142/S0129626416500079},
  url     = {https://doi.org/10.1142/S0129626416500079},
}

% Waugh and McIntosh-Smith, Springer proceedings, 2020.
@inproceedings{waugh2020use,
  author    = {Waugh, Harry and McIntosh-Smith, Simon},
  editor    = {Nichols, Jeffrey and Verastegui, Becky and Maccabe, Arthur `Barney' and Hernandez, Oscar and Parete-Koon, Suzanne and Ahearn, Theresa},
  title     = {On the Use of {BLAS} Libraries in Modern Scientific Codes at Scale},
  booktitle = {Driving Scientific and Engineering Discoveries Through the Convergence of {HPC}, Big Data and {AI}},
  pages     = {67--79},
  publisher = {Springer International Publishing},
  address   = {Cham},
  year      = {2020},
  isbn      = {978-3-030-63393-6},
}

% Abadi et al., OSDI 2016.
@inproceedings{abadi2016tensorflow,
  author    = {Abadi, Mart{\'\i}n and Barham, Paul and Chen, Jianmin and Chen, Zhifeng and Davis, Andy and Dean, Jeffrey and Devin, Matthieu and Ghemawat, Sanjay and Irving, Geoffrey and Isard, Michael and Kudlur, Manjunath and Levenberg, Josh and Monga, Rajat and Moore, Sherry and Murray, Derek G. and Steiner, Benoit and Tucker, Paul and Vasudevan, Vijay and Warden, Pete and Wicke, Martin and Yu, Yuan and Zheng, Xiaoqiang},
  title     = {{TensorFlow}: A System for Large-Scale Machine Learning},
  booktitle = {12th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 16)},
  pages     = {265--283},
  publisher = {{USENIX} Association},
  address   = {Savannah, GA},
  month     = nov,
  year      = {2016},
  isbn      = {978-1-931971-33-1},
  url       = {https://www.usenix.org/conference/osdi16/technical-sessions/presentation/abadi},
}

% Lawson et al., ACM TOMS 5(3), 1979 (original BLAS paper).
@article{lawson1979basic,
  author    = {Lawson, Chuck L. and Hanson, Richard J. and Kincaid, David R. and Krogh, Fred T.},
  title     = {Basic Linear Algebra Subprograms for {Fortran} Usage},
  journal   = {ACM Transactions on Mathematical Software (TOMS)},
  volume    = {5},
  number    = {3},
  pages     = {308--323},
  year      = {1979},
  publisher = {ACM},
  address   = {New York, NY, USA},
}

% Van Zee et al., ACM TOMS 42(2), 2016.
% "Van Zee" and "van de Geijn" are written surname-first so the particles
% stay with the surname, matching the spelling in vanzee2015blis.
% The citation key is unchanged so existing citations keep working.
@article{zee2016blis,
  author     = {Van Zee, Field G. and Smith, Tyler M. and Marker, Bryan and Low, Tze Meng and van de Geijn, Robert A. and Igual, Francisco D. and Smelyanskiy, Mikhail and Zhang, Xianyi and Kistler, Michael and Austel, Vernon and Gunnels, John A. and Killough, Lee},
  title      = {The {BLIS} Framework: Experiments in Portability},
  journal    = {ACM Trans. Math. Softw.},
  volume     = {42},
  number     = {2},
  articleno  = {12},
  numpages   = {19},
  month      = jun,
  year       = {2016},
  issue_date = {June 2016},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {0098-3500},
  doi        = {10.1145/2755561},
  url        = {https://doi.org/10.1145/2755561},
  abstract   = {BLIS is a new software framework for instantiating high-performance BLAS-like dense linear algebra libraries. We demonstrate how BLIS acts as a productivity multiplier by using it to implement the level-3 BLAS on a variety of current architectures. The systems for which we demonstrate the framework include state-of-the-art general-purpose, low-power, and many-core architectures. We show, with very little effort, how the BLIS framework yields sequential and parallel implementations that are competitive with the performance of ATLAS, OpenBLAS (an effort to maintain and extend the GotoBLAS), and commercial vendor implementations such as AMD's ACML, IBM's ESSL, and Intel's MKL libraries. Although most of this article focuses on single-core implementation, we also provide compelling results that suggest the framework's leverage extends to the multithreaded domain.},
  keywords   = {Linear algebra, matrix, BLAS, multiplication, libraries, high performance},
}

% Van Zee and van de Geijn, ACM TOMS 41(3), 2015.
@article{vanzee2015blis,
  author     = {Van Zee, Field G. and van de Geijn, Robert A.},
  title      = {{BLIS}: A Framework for Rapidly Instantiating {BLAS} Functionality},
  journal    = {ACM Trans. Math. Softw.},
  volume     = {41},
  number     = {3},
  articleno  = {14},
  numpages   = {33},
  month      = jun,
  year       = {2015},
  issue_date = {June 2015},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {0098-3500},
  doi        = {10.1145/2764454},
  url        = {https://doi.org/10.1145/2764454},
  abstract   = {The BLAS-like Library Instantiation Software (BLIS) framework is a new infrastructure for rapidly instantiating Basic Linear Algebra Subprograms (BLAS) functionality. Its fundamental innovation is that virtually all computation within level-2 (matrix-vector) and level-3 (matrix-matrix) BLAS operations can be expressed and optimized in terms of very simple kernels. While others have had similar insights, BLIS reduces the necessary kernels to what we believe is the simplest set that still supports the high performance that the computational science community demands. Higher-level framework code is generalized and implemented in ISO C99 so that it can be reused and/or reparameterized for different operations (and different architectures) with little to no modification. Inserting high-performance kernels into the framework facilitates the immediate optimization of any BLAS-like operations which are cast in terms of these kernels, and thus the framework acts as a productivity multiplier. Users of BLAS-dependent applications are given a choice of using the traditional Fortran-77 BLAS interface, a generalized C interface, or any other higher level interface that builds upon this latter API. Preliminary performance of level-2 and level-3 operations is observed to be competitive with two mature open source libraries (OpenBLAS and ATLAS) as well as an established commercial product (Intel MKL).},
  keywords   = {libraries, BLAS, matrix, Linear algebra, high-performance},
}

% Low et al., ACM TOMS 43(2), 2016.
% NOTE(review): the accent on "Quintana-Orti" follows the author's usual
% spelling -- confirm against the published paper.
@article{low2016analytical,
  author    = {Low, Tze Meng and Igual, Francisco D. and Smith, Tyler M. and Quintana-Ort{\'\i}, Enrique S.},
  title     = {Analytical Modeling Is Enough for High-Performance {BLIS}},
  journal   = {ACM Transactions on Mathematical Software (TOMS)},
  volume    = {43},
  number    = {2},
  pages     = {1--18},
  year      = {2016},
  publisher = {ACM},
  address   = {New York, NY, USA},
}

% Zulehner and Wille, DATE 2019.
% The ampersands in the conference name were dropped by the export (leaving
% runs of blanks) and are restored here.
@inproceedings{zulehner2019matrix,
  author    = {Zulehner, Alwin and Wille, Robert},
  title     = {Matrix-Vector vs. Matrix-Matrix Multiplication: Potential in {DD}-based Simulation of Quantum Computations},
  booktitle = {2019 Design, Automation \& Test in Europe Conference \& Exhibition ({DATE})},
  pages     = {90--95},
  year      = {2019},
  doi       = {10.23919/DATE.2019.8714836},
}

% Krol, Zydek, and Selvaraj, Advances in Systems Science, Springer, 2014.
% NOTE(review): the acute accent on the editors' surname is restored from the
% usual Polish spelling -- confirm against the volume's front matter.
@inproceedings{krol2014matrix,
  author    = {Krol, Dawid and Zydek, Dawid and Selvaraj, Henry},
  editor    = {{\'S}wi{\k{a}}tek, Jerzy and Grzech, Adam and {\'S}wi{\k{a}}tek, Pawe{\l} and Tomczak, Jakub M.},
  title     = {Matrix Multiplication in Multiphysics Systems Using {CUDA}},
  booktitle = {Advances in Systems Science},
  pages     = {493--502},
  publisher = {Springer International Publishing},
  address   = {Cham},
  year      = {2014},
  isbn      = {978-3-319-01857-7},
}

% Akutsu, Miyano, and Kuhara, J. Comput. Biol. 7(3-4), 2000.
% No eprint field: that field is for preprint-archive identifiers, and the
% DOI resolver link is already given by doi and url.
@article{akutsu2000algorithms,
  author  = {Akutsu, Tatsuya and Miyano, Satoru and Kuhara, Satoru},
  title   = {Algorithms for Identifying {Boolean} Networks and Related Biological Networks Based on Matrix Multiplication and Fingerprint Function},
  journal = {Journal of Computational Biology},
  volume  = {7},
  number  = {3--4},
  pages   = {331--343},
  year    = {2000},
  doi     = {10.1089/106652700750050817},
  url     = {https://doi.org/10.1089/106652700750050817},
  note    = {PMID: 11108466},
}

% Weber et al., J. Chem. Theory Comput. 11(7), 2015.
% No eprint field: that field is for preprint-archive identifiers, and the
% DOI resolver link is already given by doi and url.
@article{weber2015semiempirical,
  author  = {Weber, Val{\'e}ry and Laino, Teodoro and Pozdneev, Alexander and Fedulova, Irina and Curioni, Alessandro},
  title   = {Semiempirical Molecular Dynamics ({SEMD}) {I}: Midpoint-Based Parallel Sparse Matrix--Matrix Multiplication Algorithm for Matrices with Decay},
  journal = {Journal of Chemical Theory and Computation},
  volume  = {11},
  number  = {7},
  pages   = {3145--3152},
  year    = {2015},
  doi     = {10.1021/acs.jctc.5b00382},
  url     = {https://doi.org/10.1021/acs.jctc.5b00382},
  note    = {PMID: 26575751},
}

% Stange, Griewank, and Bollhoefer, ETNA 26, 2007.
% NOTE(review): the citation key spells "strange" while the first author is
% "Stange"; the key is kept so existing citations keep working.
@article{strange2007efficient,
  author  = {Stange, Peter and Griewank, Andreas and Bollh{\"o}fer, Matthias},
  title   = {On the Efficient Update of Rectangular {LU}-Factorizations Subject to Low Rank Modifications},
  journal = {Electron. Trans. Numer. Anal.},
  volume  = {26},
  pages   = {161--177},
  year    = {2007},
}

% Eigen v3 project web site.
@online{guennebaud2021eigen,
  author  = {Guennebaud, Ga{\"e}l and Jacob, Beno{\^\i}t and others},
  title   = {{Eigen} v3},
  date    = {2021-06-19},
  url     = {http://eigen.tuxfamily.org},
  urldate = {2021-06-22},
}

% Zhang, Wang, and Zhang, ICPADS 2012.
% The export had family and given names swapped; "Zhang, Xianyi" matches the
% spelling used in zee2016blis.
% NOTE(review): confirm the other two names against the paper.
% The citation key is unchanged so existing citations keep working.
@inproceedings{xianyi2012model,
  author    = {Zhang, Xianyi and Wang, Qian and Zhang, Yunquan},
  title     = {Model-driven Level 3 {BLAS} Performance Optimization on {Loongson 3A} Processor},
  booktitle = {2012 {IEEE} 18th International Conference on Parallel and Distributed Systems},
  pages     = {684--691},
  month     = dec,
  year      = {2012},
  issn      = {1521-9097},
  doi       = {10.1109/ICPADS.2012.97},
}

% Wang et al., book chapter on Intel MKL, Springer, 2014.
% Trademark signs use LaTeX macros throughout so the field does not mix raw
% Unicode symbols with macros.
@inbook{wang2014intel,
  author    = {Wang, Endong and Zhang, Qing and Shen, Bo and Zhang, Guangyong and Lu, Xiaowei and Wu, Qing and Wang, Yajuan},
  title     = {{Intel} {Math Kernel Library}},
  booktitle = {High-Performance Computing on the {Intel}{\textregistered} {Xeon Phi}{\texttrademark}: How to Fully Exploit {MIC} Architectures},
  pages     = {167--188},
  publisher = {Springer International Publishing},
  address   = {Cham},
  year      = {2014},
  isbn      = {978-3-319-06486-4},
  doi       = {10.1007/978-3-319-06486-4_7},
  url       = {https://doi.org/10.1007/978-3-319-06486-4_7},
}

% Intel oneAPI Math Kernel Library documentation (web page).
@online{intel2021accelerate,
  author  = {Intel},
  title   = {Accelerate Fast Math with {Intel}{\textregistered} {oneAPI} {Math Kernel Library}},
  year    = {2021},
  url     = {https://software.intel.com/content/www/us/en/develop/documentation/onemkl-developer-reference-c/top.html},
  urldate = {2021-06-23},
}

% IBM ESSL 6.3 documentation (web page).
% NOTE(review): the key says 2021 while the year field says 2020 -- confirm.
@online{ibm2021engineering,
  author  = {IBM},
  title   = {Engineering and Scientific Subroutine Library 6.3},
  month   = jun,
  year    = 2020,
  url     = {https://www.ibm.com/docs/en/essl/6.3},
  urldate = {2021-06-23},
}

% NVIDIA cuBLAS documentation (web page).
@online{nvidia2021cublas,
  author  = {NVIDIA},
  title   = {cuBLAS :: CUDA Toolkit Documentation},
  url     = {https://docs.nvidia.com/cuda/cublas/index.html},
  date    = {2021-05-20},
  urldate = {2021-06-23},
}

% Gareev, Grosser, and Kruse, ACM TACO 15(3), 2018.
% NOTE(review): issue_date says October while month says sep -- confirm.
@article{gareev2018high,
  author     = {Gareev, Roman and Grosser, Tobias and Kruse, Michael},
  title      = {High-Performance Generalized Tensor Operations: A Compiler-Oriented Approach},
  journal    = {ACM Trans. Archit. Code Optim.},
  volume     = {15},
  number     = {3},
  articleno  = {34},
  numpages   = {27},
  month      = sep,
  year       = {2018},
  issue_date = {October 2018},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {1544-3566},
  doi        = {10.1145/3235029},
  url        = {https://doi.org/10.1145/3235029},
  keywords   = {high-performance computing, Tensor contractions, matrix-matrix multiplication},
}

% Grosser, Groesslinger, and Lengauer, Parallel Processing Letters 22(4), 2012.
% The title is stored in title case instead of all capitals, and the second
% author's surname is spelled as in grosser2011polly.
% The pages field holds the article number used by this journal.
@article{grosser2012polly,
  author  = {Grosser, Tobias and Gr{\"o}{\ss}linger, Armin and Lengauer, Christian},
  title   = {{Polly} --- Performing Polyhedral Optimizations on a Low-Level Intermediate Representation},
  journal = {Parallel Processing Letters},
  volume  = {22},
  number  = {4},
  pages   = {1250010},
  year    = {2012},
  doi     = {10.1142/S0129626412500107},
  url     = {https://doi.org/10.1142/S0129626412500107},
}

% Bondhugula, arXiv preprint 2003.00532 (record exported from dblp).
@article{bondhugula2020high,
  author        = {Uday Bondhugula},
  title         = {High Performance Code Generation in {MLIR:} An Early Case Study with {GEMM}},
  journal       = {CoRR},
  volume        = {abs/2003.00532},
  year          = {2020},
  archivePrefix = {arXiv},
  eprint        = {2003.00532},
  url           = {https://arxiv.org/abs/2003.00532},
  timestamp     = {Tue, 10 Mar 2020 13:33:48 +0100},
  biburl        = {https://dblp.org/rec/journals/corr/abs-2003-00532.bib},
  bibsource     = {dblp computer science bibliography, https://dblp.org},
}

% de Carvalho et al., ACM TACO, 2021 (to appear).
% The publication status is in a real note field; the empty OPT-prefixed
% template placeholders are gone because styles ignore them.
@article{carvalho2021kernelfarer,
  author  = {de Carvalho, João P. L. and Kuzma, Braedy and Korostelev, Ivan and Amaral, José Nelson and Barton, Christopher and Moreira, José and Araujo, Guido},
  title   = {{KernelFaRer}: Replacing Native-Code Idioms with High-Performance Library Calls},
  journal = {ACM Transactions on Architecture and Code Optimization ({TACO})},
  year    = {2021},
  note    = {To appear},
}

% Power ISA Version 3.1 specification.
% Manual entries name the issuing body in the organization field.
@manual{PowerISA,
  author       = {IBM},
  title        = {{Power}{\textregistered} {ISA} Version 3.1},
  organization = {IBM},
  month        = may,
  year         = {2020},
  url          = {https://ibm.ent.box.com/s/hhjfw0x0lrbtyzmiaffnbxh2fuo0fog0},
}

% Intel ISA extensions programming reference.
% Case protection wraps whole words rather than single letters.
@manual{IntelISA,
  title        = {{Intel}{\textregistered} {Architecture Instruction Set Extensions} and {Future Features} Programming Reference},
  organization = {Intel Corporation},
  month        = feb,
  year         = {2021},
  url          = {https://software.intel.com/content/dam/develop/external/us/en/documents-tps/architecture-instruction-set-extensions-programming-reference.pdf},
}

% Arm Architecture Reference Manual for Armv8-A.
% Case protection wraps whole words rather than single letters.
@manual{ArmISA,
  title        = {{Arm}{\textregistered} {Architecture Reference Manual} {Armv8}, for {Armv8-A} {Architecture Profile}},
  organization = {Arm Limited},
  month        = jan,
  year         = {2021},
  url          = {https://developer.arm.com/documentation/ddi0487/latest/},
}

% Kuck, IEEE Trans. Comput. C-17(8), 1968.
% Volume is written "C-17" to agree with barnes1968illiac, which cites the
% same journal and year.
@article{kuck1968illiac,
  author    = {Kuck, D.},
  title     = {{ILLIAC IV} Software and Application Programming},
  journal   = {IEEE Transactions on Computers},
  volume    = {C-17},
  number    = {8},
  pages     = {758--770},
  month     = aug,
  year      = {1968},
  publisher = {IEEE Computer Society},
  address   = {Los Alamitos, CA, USA},
  issn      = {1557-9956},
  doi       = {10.1109/TC.1968.229159},
  keywords  = {applications of array computer; array computer; array language; compiler; operating system},
}

% Hassan, Hemeida, and Mahmoud, Microprocessors and Microsystems 47, 2016.
% The doi field holds the bare DOI without a resolver prefix.
@article{hassan20161performance,
  author   = {Hassan, Somaia Awad and Hemeida, A. M. and Mahmoud, Mountasser M. M.},
  title    = {Performance Evaluation of Matrix-Matrix Multiplications Using {Intel}'s {Advanced Vector Extensions} ({AVX})},
  journal  = {Microprocessors and Microsystems},
  volume   = {47},
  pages    = {369--374},
  year     = {2016},
  issn     = {0141-9331},
  doi      = {10.1016/j.micpro.2016.10.002},
  url      = {https://www.sciencedirect.com/science/article/pii/S0141933116302502},
  keywords = {Advanced vector extension (AVX), Matrix-matrix multiplications, Intrinsic functions, Inline assembly, Intel C++ compiler, Microsoft VC++ compiler},
}

% Hemeida et al., Ain Shams Engineering Journal 11(4), 2020.
% The doi field holds the bare DOI without a resolver prefix.
@article{hemeida2020optimizing,
  author   = {Hemeida, A. M. and Hassan, S. A. and Alkhalaf, Salem and Mahmoud, M. M. M. and Saber, M. A. and Bahaa Eldin, Ayman M. and Senjyu, Tomonobu and Alayed, Abdullah H.},
  title    = {Optimizing Matrix-Matrix Multiplication on {Intel}'s {Advanced Vector Extensions} Multicore Processor},
  journal  = {Ain Shams Engineering Journal},
  volume   = {11},
  number   = {4},
  pages    = {1179--1190},
  year     = {2020},
  issn     = {2090-4479},
  doi      = {10.1016/j.asej.2020.01.003},
  url      = {https://www.sciencedirect.com/science/article/pii/S2090447920300058},
  keywords = {Intel's AVX, Intel MKL SGEMM, Matrix-matrix multiplications, Optimization, Multicore},
}

% Alappat et al., "High Performance Computing" proceedings, June 2020.
% NOTE(review): address holds what looks like the conference venue rather
% than a publisher city, and no publisher is given -- confirm.
@inproceedings{alappat2020understanding,
  author    = {Alappat, Christie L. and Hofmann, Johannes and Hager, Georg and Fehske, Holger and Bishop, Alan R. and Wellein, Gerhard},
  editor    = {Sadayappan, Ponnuswamy and Chamberlain, Bradford L. and Juckeland, Guido and Ltaief, Hatem},
  title     = {Understanding {HPC} Benchmark Performance on {Intel} {Broadwell} and {Cascade Lake} Processors},
  booktitle = {High Performance Computing},
  address   = {Frankfurt am Main, Germany},
  pages     = {412--433},
  month     = jun,
  year      = {2020},
  isbn      = {978-3-030-50743-5},
}

% Poenaru and McIntosh-Smith, Euro-Par 2020 (Springer proceedings).
@inproceedings{poenaru2020evaluating,
  author    = {Poenaru, Andrei and McIntosh-Smith, Simon},
  editor    = {Malawski, Maciej and Rzadca, Krzysztof},
  title     = {Evaluating the Effectiveness of a Vector-Length-Agnostic Instruction Set},
  booktitle = {Euro-Par 2020: Parallel Processing},
  pages     = {98--114},
  publisher = {Springer International Publishing},
  address   = {Cham},
  year      = {2020},
  isbn      = {978-3-030-57675-2},
}

% Larsen and McAllister, SC 2001.
@inproceedings{larsen2001fast,
  author    = {Larsen, E. Scott and McAllister, David},
  title     = {Fast Matrix Multiplies Using Graphics Hardware},
  booktitle = {Proceedings of the 2001 ACM/IEEE Conference on Supercomputing},
  series    = {SC '01},
  location  = {Denver, Colorado},
  pages     = {55},
  numpages  = {1},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  year      = {2001},
  isbn      = {158113293X},
  doi       = {10.1145/582034.582089},
  url       = {https://doi.org/10.1145/582034.582089},
  keywords  = {graphics hardware, matrix multiplication},
}

% Fatahalian, Sugerman, and Hanrahan, Graphics Hardware 2004.
@inproceedings{fatahalian2004understanding,
  author    = {Fatahalian, K. and Sugerman, J. and Hanrahan, P.},
  title     = {Understanding the Efficiency of {GPU} Algorithms for Matrix-Matrix Multiplication},
  booktitle = {Proceedings of the {ACM SIGGRAPH/EUROGRAPHICS} Conference on Graphics Hardware},
  series    = {HWWS '04},
  location  = {Grenoble, France},
  pages     = {133--137},
  numpages  = {5},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  year      = {2004},
  isbn      = {3905673150},
  doi       = {10.1145/1058129.1058148},
  url       = {https://doi.org/10.1145/1058129.1058148},
}

% Li, Ranka, and Sahni, ICPADS 2011.
@inproceedings{li2011strassens,
  author    = {Li, Junjie and Ranka, Sanjay and Sahni, Sartaj},
  title     = {{Strassen}'s Matrix Multiplication on {GPUs}},
  booktitle = {2011 {IEEE} 17th International Conference on Parallel and Distributed Systems},
  pages     = {157--164},
  year      = {2011},
  doi       = {10.1109/ICPADS.2011.130},
}

% Nath, Tomov, and Dongarra, VECPAR 2010 (Springer proceedings, 2011).
@inproceedings{nath2011accelerating,
  author    = {Nath, Rajib and Tomov, Stanimire and Dongarra, Jack},
  editor    = {Palma, Jos{\'e} M. Laginha M. and Dayd{\'e}, Michel and Marques, Osni and Lopes, Jo{\~a}o Correia},
  title     = {Accelerating {GPU} Kernels for Dense Linear Algebra},
  booktitle = {High Performance Computing for Computational Science -- {VECPAR} 2010},
  pages     = {83--92},
  publisher = {Springer Berlin Heidelberg},
  address   = {Berlin, Heidelberg},
  year      = {2011},
  isbn      = {978-3-642-19328-6},
}

@inproceedings{han2019distme,
  author    = {Han, Donghyoung and Nam, Yoon-Min and Lee, Jihye and Park, Kyongseok and Kim, Hyunwoo and Kim, Min-Soo},
  title     = {{DistME}: A Fast and Elastic Distributed Matrix Computation Engine Using {GPUs}},
  booktitle = {Proceedings of the 2019 International Conference on Management of Data},
  series    = {SIGMOD '19},
  location  = {Amsterdam, Netherlands},
  pages     = {759--774},
  numpages  = {16},
  year      = {2019},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  isbn      = {9781450356435},
  doi       = {10.1145/3299869.3319865},
  url       = {https://doi.org/10.1145/3299869.3319865},
  keywords  = {matrix multiplication, distributed data-parallel system, gpu computation}
}

@article{jouppi2017datacenter,
author = {Jouppi, Norman P. and Young, Cliff and Patil, Nishant and Patterson, David and Agrawal, Gaurav and Bajwa, Raminder and Bates, Sarah and Bhatia, Suresh and Boden, Nan and Borchers, Al and Boyle, Rick and Cantin, Pierre-luc and Chao, Clifford and Clark, Chris and Coriell, Jeremy and Daley, Mike and Dau, Matt and Dean, Jeffrey and Gelb, Ben and Ghaemmaghami, Tara Vazir and Gottipati, Rajendra and Gulland, William and Hagmann, Robert and Ho, C. Richard and Hogberg, Doug and Hu, John and Hundt, Robert and Hurt, Dan and Ibarz, Julian and Jaffey, Aaron and Jaworski, Alek and Kaplan, Alexander and Khaitan, Harshit and Killebrew, Daniel and Koch, Andy and Kumar, Naveen and Lacy, Steve and Laudon, James and Law, James and Le, Diemthu and Leary, Chris and Liu, Zhuyuan and Lucke, Kyle and Lundin, Alan and MacKean, Gordon and Maggiore, Adriana and Mahony, Maire and Miller, Kieran and Nagarajan, Rahul and Narayanaswami, Ravi and Ni, Ray and Nix, Kathy and Norrie, Thomas and Omernick, Mark and Penukonda, Narayana and Phelps, Andy and Ross, Jonathan and Ross, Matt and Salek, Amir and Samadiani, Emad and Severn, Chris and Sizikov, Gregory and Snelham, Matthew and Souter, Jed and Steinberg, Dan and Swing, Andy and Tan, Mercedes and Thorson, Gregory and Tian, Bo and Toma, Horia and Tuttle, Erick and Vasudevan, Vijay and Walter, Richard and Wang, Walter and Wilcox, Eric and Yoon, Doe Hyun},
title = {In-Datacenter Performance Analysis of a Tensor Processing Unit},
year = {2017},
issue_date = {May 2017},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {45},
number = {2},
issn = {0163-5964},
url = {https://doi.org/10.1145/3140659.3080246},
doi = {10.1145/3140659.3080246},
journal = {SIGARCH Comput. Archit. News},
month = jun,
pages = {1--12},
numpages = {12},
keywords = {neural network, TensorFlow, CNN, accelerator, GPU, DNN, domain-specific architecture, TPU, RNN, MLP, deep learning, LSTM}
}

@inproceedings{markidis2018nvidia,
  author    = {Markidis, Stefano and Chien, Steven Wei Der and Laure, Erwin and Peng, Ivy Bo and Vetter, Jeffrey S.},
  title     = {{NVIDIA} Tensor Core Programmability, Performance \& Precision},
  booktitle = {2018 IEEE International Parallel and Distributed Processing Symposium Workshops (IPDPSW)},
  year      = {2018},
  pages     = {522--531},
  doi       = {10.1109/IPDPSW.2018.00091}
}

@inproceedings{liao2019davinci,
  author       = {Liao, Heng and Tu, Jiajin and Xia, Jing and Zhou, Xiping},
  title        = {{DaVinci}: A scalable architecture for neural network computing},
  booktitle    = {2019 IEEE Hot Chips 31 Symposium (HCS)},
  pages        = {1--44},
  year         = {2019},
  organization = {IEEE Computer Society}
}

@article{wang2019benchmarking,
  author = {Yu Wang and Gu{-}Yeon Wei and David Brooks},
  title = {Benchmarking {TPU}, {GPU}, and {CPU} Platforms for Deep Learning},
  journal = {CoRR},
  volume = {abs/1907.10701},
  year = {2019},
  url = {http://arxiv.org/abs/1907.10701},
  archivePrefix = {arXiv},
  eprint = {1907.10701},
  timestamp = {Thu, 01 Aug 2019 08:59:33 +0200},
  biburl = {https://dblp.org/rec/journals/corr/abs-1907-10701.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{gu2020bandwidth,
author = {Gu, Zhixiang and Moreira, Jose and Edelsohn, David and Azad, Ariful},
title = {Bandwidth Optimized Parallel Algorithms for Sparse Matrix-Matrix Multiplication Using Propagation Blocking},
year = {2020},
isbn = {9781450369350},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3350755.3400216},
doi = {10.1145/3350755.3400216},
abstract = {Sparse matrix-matrix multiplication (SpGEMM) is a widely used kernel in various graph, scientific computing and machine learning algorithms. It is well known that SpGEMM is a memory-bound operation, and its peak performance is expected to be bound by the memory bandwidth. Yet, existing algorithms fail to saturate the memory bandwidth, resulting in suboptimal performance under the Roofline model. In this paper, we characterize existing SpGEMM algorithms based on their memory access patterns and develop practical lower and upper bounds for SpGEMM performance. We then develop an SpGEMM algorithm based on the outer product. The newly developed algorithm called PB-SpGEMM saturates memory bandwidth by using the propagation blocking technique and by performing in-cache sorting and merging. For many practical matrices, PB-SpGEMM runs 20\%-50\% faster than the state-of-the-art heap and hash SpGEMM algorithms on modern multicore processors. Most importantly, PB-SpGEMM attains performance predicted by the Roofline model, and its performance remains stable with respect to matrix size and sparsity.},
booktitle = {Proceedings of the 32nd ACM Symposium on Parallelism in Algorithms and Architectures},
pages = {293--303},
numpages = {11},
keywords = {SpGEMM, parallel algorithm},
location = {Virtual Event, USA},
series = {SPAA '20}
}

@online{googlebench,
  title        = {google/benchmark: A microbenchmark support library},
  organization = {Google},
  url          = {https://github.com/google/benchmark},
  date         = {2021-07-21},
  urldate      = {2021-07-18}
}

@article{moreira2021matrix,
  author    = {Jos{\'{e}} E. Moreira and
               Kit Barton and
               Steven Battle and
               Peter Bergner and
               Ramon Bertran and
               Puneeth Bhat and
               Pedro Caldeira and
               David Edelsohn and
               Gordon Fossum and
               Brad Frey and
               Nemanja Ivanovic and
               Chip Kerchner and
               Vincent Lim and
               Shakti Kapoor and
               Tulio Machado Filho and
               Silvia Melitta Mueller and
               Brett Olsson and
               Satish Sadasivam and
               Baptiste Saleil and
               Bill Schmidt and
               Rajalakshmi Srinivasaraghavan and
               Shricharan Srivatsan and
               Brian W. Thompto and
               Andreas Wagner and
               Nelson Wu},
  title     = {A matrix math facility for {Power} {ISA(TM)} processors},
  journal   = {CoRR},
  volume    = {abs/2104.03142},
  year      = {2021},
  url       = {https://arxiv.org/abs/2104.03142},
  archivePrefix = {arXiv},
  eprint    = {2104.03142},
  timestamp = {Tue, 13 Apr 2021 16:46:17 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/abs-2104-03142.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{velkoski2014performance,
  author    = {Velkoski, Goran and Gusev, Marjan and Ristov, Sasko},
  title     = {The performance impact analysis of loop unrolling},
  booktitle = {2014 37th International Convention on Information and Communication Technology, Electronics and Microelectronics (MIPRO)},
  year      = {2014},
  pages     = {307--312},
  doi       = {10.1109/MIPRO.2014.6859582}
}

@inproceedings{allen1984automatic,
  author    = {Allen, John R. and Kennedy, Ken},
  title     = {Automatic Loop Interchange},
  booktitle = {Proceedings of the 1984 SIGPLAN Symposium on Compiler Construction},
  series    = {SIGPLAN '84},
  location  = {Montreal, Canada},
  pages     = {233--246},
  numpages  = {14},
  year      = {1984},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  isbn      = {0897911393},
  doi       = {10.1145/502874.502897},
  url       = {https://doi.org/10.1145/502874.502897}
}

@article{fabeiro2016writing,
  author   = {Jorge {F. Fabeiro} and Diego Andrade and Basilio B. Fraguela},
  title    = {Writing a performance-portable matrix multiplication},
  journal  = {Parallel Computing},
  volume   = {52},
  pages    = {65--77},
  year     = {2016},
  issn     = {0167-8191},
  doi      = {10.1016/j.parco.2015.12.005},
  url      = {https://www.sciencedirect.com/science/article/pii/S0167819115001611},
  keywords = {GPGPU, Heterogeneous systems, OpenCL, Performance portability, Embedded languages}
}

@techreport{schreiber1990automatic,
  title       = {Automatic blocking of nested loops},
  author      = {Schreiber, Robert and Dongarra, Jack J. and others},
  year        = {1990},
  institution = {Research Institute for Advanced Computer Science, NASA Ames Research Center}
}