This repository has been archived by the owner on Nov 30, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrefs.bib
1064 lines (986 loc) · 40.2 KB
/
refs.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
@article{goto2008anatomy,
  author     = {Goto, Kazushige and van de Geijn, Robert A.},
  title      = {Anatomy of High-Performance Matrix Multiplication},
  journal    = {ACM Trans. Math. Softw.},
  volume     = {34},
  number     = {3},
  articleno  = {12},
  numpages   = {25},
  year       = {2008},
  month      = may,
  issue_date = {May 2008},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {0098-3500},
  doi        = {10.1145/1356052.1356053},
  url        = {https://doi.org/10.1145/1356052.1356053},
  keywords   = {matrix multiplication, Linear algebra, basic linear algebra subprograms}
}
@online{ibmarchive,
  title        = {{IBM - Archives - History of IBM - United States}},
  organization = {International Business Machines},
  url          = {https://www.ibm.com/ibm/history/history/history_intro.html},
  date         = {2003-01-23},
  urldate      = {2020-09-15}
}
@article{montoye1990design,
  author  = {Montoye, R. K. and Hokenek, E. and Runyon, S. L.},
  title   = {Design of the {IBM} {RISC} {System/6000} floating-point execution unit},
  journal = {IBM Journal of Research and Development},
  volume  = {34},
  number  = {1},
  pages   = {59--70},
  year    = {1990}
}
@article{tomasulo1967efficient,
  author  = {Tomasulo, R. M.},
  title   = {An Efficient Algorithm for Exploiting Multiple Arithmetic Units},
  journal = {IBM Journal of Research and Development},
  volume  = {11},
  number  = {1},
  pages   = {25--33},
  year    = {1967}
}
@inproceedings{blue1992training,
  author       = {Blue, James L. and Grother, Patrick J.},
  title        = {Training feed-forward neural networks using conjugate gradients},
  booktitle    = {Machine Vision Applications in Character Recognition and Industrial Inspection},
  editor       = {D'Amato, Donald P. and Blanz, Wolf-Ekkehard and Dom, Byron E. and Srihari, Sargur N.},
  volume       = {1661},
  pages        = {179--190},
  year         = {1992},
  organization = {International Society for Optics and Photonics},
  publisher    = {SPIE},
  doi          = {10.1117/12.130286},
  url          = {https://doi.org/10.1117/12.130286}
}
@book{rojas1996neural,
  author    = {Rojas, Ra{\'u}l},
  title     = {Neural Networks: A Systematic Introduction},
  publisher = {Springer Berlin Heidelberg},
  address   = {Berlin, Heidelberg},
  year      = {1996},
  isbn      = {978-3-642-61068-4},
  doi       = {10.1007/978-3-642-61068-4},
  url       = {https://doi.org/10.1007/978-3-642-61068-4}
}
@article{flynn1972some,
  author  = {Flynn, M. J.},
  title   = {Some Computer Organizations and Their Effectiveness},
  journal = {IEEE Transactions on Computers},
  volume  = {C-21},
  number  = {9},
  pages   = {948--960},
  year    = {1972},
  doi     = {10.1109/TC.1972.5009071}
}
@article{barnes1968illiac,
  author  = {Barnes, G. H. and Brown, R. M. and Kato, M. and Kuck, D. J. and Slotnick, D. L. and Stokes, R. A.},
  title   = {The {ILLIAC IV} Computer},
  journal = {IEEE Transactions on Computers},
  volume  = {C-17},
  number  = {8},
  pages   = {746--757},
  year    = {1968},
  doi     = {10.1109/TC.1968.229158}
}
@inproceedings{tyler1999altivec,
  author    = {Tyler, J. and Lent, J. and Mather, A. and Nguyen, Huy},
  title     = {{AltiVec\texttrademark}: bringing vector technology to the {PowerPC\texttrademark} processor family},
  booktitle = {1999 IEEE International Performance, Computing and Communications Conference (Cat. No.99CH36305)},
  pages     = {437--444},
  year      = {1999},
  doi       = {10.1109/PCCC.1999.749469}
}
@online{llvmLangref,
  author  = {{LLVM Foundation}},
  title   = {{LLVM} Language Reference Manual},
  year    = {2020},
  url     = {https://llvm.org/docs/LangRef.html},
  urldate = {2021-01-04}
}
@unpublished{kuzma2021fast,
  author = {Kuzma, Braedy and Korostelev, Ivan and de Carvalho, João P. L. and Moreira, José and Barton, Christopher and Araujo, Guido and Amaral, José Nelson},
  title  = {Fast Matrix Multiplication via Compiler-only Layered Data Reorganization and Intrinsic Lowering},
  note   = {under revision}
}
@article{eisen2007ibm,
  author  = {Eisen, L. and Ward, J. W. and Tast, H.-W. and Mading, N. and Leenstra, J. and Mueller, S. M. and Jacobi, C. and Preiss, J. and Schwarz, E. M. and Carlough, S. R.},
  title   = {{IBM} {POWER6} accelerators: {VMX} and {DFU}},
  journal = {IBM Journal of Research and Development},
  volume  = {51},
  number  = {6},
  pages   = {1--21},
  year    = {2007},
  doi     = {10.1147/rd.516.0663}
}
@mastersthesis{lattner2002llvm,
  author  = {Lattner, Chris},
  title   = {{LLVM}: An Infrastructure for Multi-Stage Optimization},
  school  = {Computer Science Dept., University of Illinois at Urbana-Champaign},
  address = {Urbana, IL},
  year    = {2002},
  month   = dec,
  url     = {http://llvm.cs.uiuc.edu}
}
@inproceedings{lattner2004llvm,
  author    = {Lattner, C. and Adve, V.},
  title     = {{LLVM}: A Compilation Framework for Lifelong Program Analysis \& Transformation},
  booktitle = {International Symposium on Code Generation and Optimization, 2004. CGO 2004.},
  pages     = {75--86},
  year      = {2004},
  doi       = {10.1109/CGO.2004.1281665}
}
@inproceedings{grosser2011polly,
  author    = {Grosser, Tobias and Zheng, Hongbin and Aloor, Raghesh and Simb{\"u}rger, Andreas and Gr{\"o}{\ss}linger, Armin and Pouchet, Louis-No{\"e}l},
  title     = {{Polly} -- Polyhedral optimization in {LLVM}},
  booktitle = {Proceedings of the First International Workshop on Polyhedral Compilation Techniques (IMPACT)},
  pages     = {1},
  year      = {2011}
}
@article{alves2015runtime,
  author     = {Alves, P{\'e}ricles and Gruber, Fabian and Doerfert, Johannes and Lamprineas, Alexandros and Grosser, Tobias and Rastello, Fabrice and Pereira, Fernando Magno Quint{\~a}o},
  title      = {Runtime Pointer Disambiguation},
  journal    = {SIGPLAN Not.},
  volume     = {50},
  number     = {10},
  pages      = {589--606},
  numpages   = {18},
  year       = {2015},
  month      = oct,
  issue_date = {October 2015},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {0362-1340},
  doi        = {10.1145/2858965.2814285},
  url        = {https://doi.org/10.1145/2858965.2814285},
  keywords   = {Alias analysis, dynamic guards, optimization}
}
@inproceedings{sui2016interprocedural,
  author    = {Sui, Yulei and Xue, Jingling},
  title     = {{SVF}: Interprocedural Static Value-Flow Analysis in {LLVM}},
  booktitle = {Proceedings of the 25th International Conference on Compiler Construction},
  series    = {CC 2016},
  location  = {Barcelona, Spain},
  pages     = {265--266},
  numpages  = {2},
  year      = {2016},
  publisher = {Association for Computing Machinery},
  isbn      = {9781450342414},
  doi       = {10.1145/2892208.2892235},
  url       = {https://doi.org/10.1145/2892208.2892235},
  keywords  = {Pointer Analysis, Value-Flow, SVF}
}
@inproceedings{hardekopf2009semi,
  author    = {Hardekopf, Ben and Lin, Calvin},
  title     = {Semi-Sparse Flow-Sensitive Pointer Analysis},
  booktitle = {Proceedings of the 36th Annual ACM SIGPLAN-SIGACT Symposium on Principles of Programming Languages},
  series    = {POPL '09},
  location  = {Savannah, GA, USA},
  pages     = {226--238},
  numpages  = {13},
  year      = {2009},
  publisher = {Association for Computing Machinery},
  isbn      = {9781605583792},
  doi       = {10.1145/1480881.1480911},
  url       = {https://doi.org/10.1145/1480881.1480911},
  keywords  = {pointer analysis, alias analysis}
}
@article{lozano2019combinatorial,
  author     = {Casta{\~n}eda Lozano, Roberto and Carlsson, Mats and Hjort Blindell, Gabriel and Schulte, Christian},
  title      = {Combinatorial Register Allocation and Instruction Scheduling},
  journal    = {ACM Trans. Program. Lang. Syst.},
  volume     = {41},
  number     = {3},
  articleno  = {17},
  numpages   = {53},
  year       = {2019},
  month      = jul,
  issue_date = {July 2019},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {0164-0925},
  doi        = {10.1145/3332373},
  url        = {https://doi.org/10.1145/3332373},
  keywords   = {register allocation, instruction scheduling, Combinatorial optimization}
}
@article{pereira2008register,
  author     = {Quint{\~a}o Pereira, Fernando Magno and Palsberg, Jens},
  title      = {Register Allocation by Puzzle Solving},
  journal    = {SIGPLAN Not.},
  volume     = {43},
  number     = {6},
  pages      = {216--226},
  numpages   = {11},
  year       = {2008},
  month      = jun,
  issue_date = {June 2008},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {0362-1340},
  doi        = {10.1145/1379022.1375609},
  url        = {https://doi.org/10.1145/1379022.1375609},
  keywords   = {register allocation, puzzle solving, register aliasing}
}
@inproceedings{cytron1989efficient,
  author    = {Cytron, Ron and Ferrante, Jeanne and Rosen, Barry K and Wegman, Mark N and Zadeck, F Kenneth},
  title     = {An efficient method of computing static single assignment form},
  booktitle = {Proceedings of the 16th ACM SIGPLAN-SIGACT symposium on Principles of programming languages},
  year      = {1989},
  pages     = {25--35}
}
@inproceedings{rosen1988global,
  author    = {Rosen, Barry K and Wegman, Mark N and Zadeck, F Kenneth},
  title     = {Global value numbers and redundant computations},
  booktitle = {Proceedings of the 15th ACM SIGPLAN-SIGACT symposium on Principles of programming languages},
  year      = {1988},
  pages     = {12--27}
}
@inproceedings{alpern1988detecting,
  author    = {Alpern, Bowen and Wegman, Mark N and Zadeck, F Kenneth},
  title     = {Detecting equality of variables in programs},
  booktitle = {Proceedings of the 15th ACM SIGPLAN-SIGACT symposium on Principles of programming languages},
  year      = {1988},
  pages     = {1--11}
}
@article{cytron1991efficiently,
  author     = {Cytron, Ron and Ferrante, Jeanne and Rosen, Barry K. and Wegman, Mark N. and Zadeck, F. Kenneth},
  title      = {Efficiently Computing Static Single Assignment Form and the Control Dependence Graph},
  journal    = {ACM Trans. Program. Lang. Syst.},
  volume     = {13},
  number     = {4},
  pages      = {451--490},
  numpages   = {40},
  year       = {1991},
  month      = oct,
  issue_date = {Oct. 1991},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {0164-0925},
  doi        = {10.1145/115372.115320},
  url        = {https://doi.org/10.1145/115372.115320},
  keywords   = {optimizing compilers, def-use chain, dominator, control dependence, control flow graph}
}
@article{brandis1994single,
  author     = {Brandis, Marc M. and M{\"o}ssenb{\"o}ck, Hanspeter},
  title      = {Single-Pass Generation of Static Single-Assignment Form for Structured Languages},
  journal    = {ACM Trans. Program. Lang. Syst.},
  volume     = {16},
  number     = {6},
  pages      = {1684--1698},
  numpages   = {15},
  year       = {1994},
  month      = nov,
  issue_date = {Nov. 1994},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {0164-0925},
  doi        = {10.1145/197320.197331},
  url        = {https://doi.org/10.1145/197320.197331},
  keywords   = {static single-assignment form, structured languages, dominator tree}
}
@inproceedings{10.1145/800028.808480,
  author    = {Cocke, John},
  title     = {Global Common Subexpression Elimination},
  booktitle = {Proceedings of a Symposium on Compiler Optimization},
  location  = {Urbana-Champaign, Illinois},
  pages     = {20--24},
  numpages  = {5},
  year      = {1970},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  isbn      = {9781450373869},
  doi       = {10.1145/800028.808480},
  url       = {https://doi.org/10.1145/800028.808480},
  abstract  = {When considering compiler optimization, there are two questions that immediately come to mind; one, why and to what extent is optimization necessary and two, to what extent is it possible. When considering the second question, one might immediately become discouraged since it is well known that the program equivalency problem is recursively unsolvable. It is, of course, clear from this that there will never be techniques for generating a completely optimum program. These unsolvability results, however, do not preclude the possibility of ad hoc techniques for program improvement or even a partial theory which produces a class of equivalent programs optimized in varying degrees. The reasons why optimization is required seem to me to fall in two major categories. The first I will call “local” and the second “global”.}
}
@article{cocke1970global,
  author     = {Cocke, John},
  title      = {Global Common Subexpression Elimination},
  journal    = {SIGPLAN Not.},
  volume     = {5},
  number     = {7},
  pages      = {20--24},
  numpages   = {5},
  year       = {1970},
  month      = jul,
  issue_date = {July 1970},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {0362-1340},
  doi        = {10.1145/390013.808480},
  url        = {https://doi.org/10.1145/390013.808480}
}
@inproceedings{domke2021matrix,
  author    = {Domke, J. and Vatai, E. and Drozd, A. and Chen, P. and Oyama, Y. and Zhang, L. and Salaria, S. and Mukunoki, D. and Podobas, A. and Wahib, M. and Matsuoka, S.},
  title     = {Matrix Engines for High Performance Computing: A Paragon of Performance or Grasping at Straws?},
  booktitle = {2021 IEEE International Parallel and Distributed Processing Symposium (IPDPS)},
  pages     = {1056--1065},
  year      = {2021},
  month     = may,
  publisher = {IEEE Computer Society},
  address   = {Los Alamitos, CA, USA},
  doi       = {10.1109/IPDPS49936.2021.00114},
  url       = {https://doi.ieeecomputersociety.org/10.1109/IPDPS49936.2021.00114},
  keywords  = {deep learning;program processors;tensors;machine learning algorithms;benchmark testing;throughput;supercomputers}
}
@article{wang2019bfloat16,
  author  = {Wang, Shibo and Kanwar, Pankaj},
  title   = {{BFloat16}: The secret to high performance on {Cloud TPUs}},
  journal = {Google Cloud Blog},
  year    = {2019}
}
@article{nakasato2011fast,
  author     = {Nakasato, Naohito},
  title      = {A Fast {GEMM} Implementation on the {Cypress} {GPU}},
  journal    = {SIGMETRICS Perform. Eval. Rev.},
  volume     = {38},
  number     = {4},
  pages      = {50--55},
  numpages   = {6},
  year       = {2011},
  month      = mar,
  issue_date = {March 2011},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {0163-5999},
  doi        = {10.1145/1964218.1964227},
  url        = {https://doi.org/10.1145/1964218.1964227}
}
@inproceedings{yu2020toward,
  author    = {Yu, Tan and Cai, Yunfeng and Li, Ping},
  title     = {Toward Faster and Simpler Matrix Normalization via Rank-1 Update},
  editor    = {Vedaldi, Andrea and Bischof, Horst and Brox, Thomas and Frahm, Jan-Michael},
  booktitle = {Computer Vision -- ECCV 2020},
  pages     = {203--219},
  year      = {2020},
  publisher = {Springer International Publishing},
  address   = {Cham},
  isbn      = {978-3-030-58529-7}
}
@inproceedings{pal2018outerspace,
  author    = {Pal, Subhankar and Beaumont, Jonathan and Park, Dong-Hyeon and Amarnath, Aporva and Feng, Siying and Chakrabarti, Chaitali and Kim, Hun-Seok and Blaauw, David and Mudge, Trevor and Dreslinski, Ronald},
  title     = {{OuterSPACE}: An Outer Product Based Sparse Matrix Multiplication Accelerator},
  booktitle = {2018 IEEE International Symposium on High Performance Computer Architecture (HPCA)},
  pages     = {724--736},
  year      = {2018},
  doi       = {10.1109/HPCA.2018.00067}
}
@inproceedings{srivastava2020matraptor,
  author    = {Srivastava, Nitish and Jin, Hanchen and Liu, Jie and Albonesi, David and Zhang, Zhiru},
  title     = {{MatRaptor}: A Sparse-Sparse Matrix Multiplication Accelerator Based on Row-Wise Product},
  booktitle = {2020 53rd Annual IEEE/ACM International Symposium on Microarchitecture (MICRO)},
  pages     = {766--780},
  year      = {2020},
  doi       = {10.1109/MICRO50266.2020.00068}
}
@article{wu2016achieving,
  author  = {Wu, Jing and Jaja, Joseph},
  title   = {Achieving Native {GPU} Performance for Out-of-Card Large Dense Matrix Multiplication},
  journal = {Parallel Processing Letters},
  volume  = {26},
  number  = {02},
  pages   = {1650007},
  year    = {2016},
  doi     = {10.1142/S0129626416500079},
  url     = {https://doi.org/10.1142/S0129626416500079}
}
@inproceedings{waugh2020use,
  author    = {Waugh, Harry and McIntosh-Smith, Simon},
  title     = {On the Use of {BLAS} Libraries in Modern Scientific Codes at Scale},
  editor    = {Nichols, Jeffrey and Verastegui, Becky and Maccabe, Arthur `Barney' and Hernandez, Oscar and Parete-Koon, Suzanne and Ahearn, Theresa},
  booktitle = {Driving Scientific and Engineering Discoveries Through the Convergence of HPC, Big Data and AI},
  pages     = {67--79},
  year      = {2020},
  publisher = {Springer International Publishing},
  address   = {Cham},
  isbn      = {978-3-030-63393-6}
}
@inproceedings{abadi2016tensorflow,
  author    = {Abadi, Mart{\'\i}n and Barham, Paul and Chen, Jianmin and Chen, Zhifeng and Davis, Andy and Dean, Jeffrey and Devin, Matthieu and Ghemawat, Sanjay and Irving, Geoffrey and Isard, Michael and Kudlur, Manjunath and Levenberg, Josh and Monga, Rajat and Moore, Sherry and Murray, Derek G. and Steiner, Benoit and Tucker, Paul and Vasudevan, Vijay and Warden, Pete and Wicke, Martin and Yu, Yuan and Zheng, Xiaoqiang},
  title     = {{TensorFlow}: A System for Large-Scale Machine Learning},
  booktitle = {12th {USENIX} Symposium on Operating Systems Design and Implementation ({OSDI} 16)},
  pages     = {265--283},
  year      = {2016},
  month     = nov,
  publisher = {{USENIX} Association},
  address   = {Savannah, GA},
  isbn      = {978-1-931971-33-1},
  url       = {https://www.usenix.org/conference/osdi16/technical-sessions/presentation/abadi}
}
@article{lawson1979basic,
  author    = {Lawson, Chuck L. and Hanson, Richard J. and Kincaid, David R. and Krogh, Fred T.},
  title     = {Basic linear algebra subprograms for {Fortran} usage},
  journal   = {ACM Transactions on Mathematical Software (TOMS)},
  volume    = {5},
  number    = {3},
  pages     = {308--323},
  year      = {1979},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA}
}
@article{zee2016blis,
  author     = {Van Zee, Field G. and Smith, Tyler M. and Marker, Bryan and Low, Tze Meng and van de Geijn, Robert A. and Igual, Francisco D. and Smelyanskiy, Mikhail and Zhang, Xianyi and Kistler, Michael and Austel, Vernon and Gunnels, John A. and Killough, Lee},
  title      = {The {BLIS} Framework: Experiments in Portability},
  journal    = {ACM Trans. Math. Softw.},
  volume     = {42},
  number     = {2},
  articleno  = {12},
  numpages   = {19},
  year       = {2016},
  month      = jun,
  issue_date = {June 2016},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {0098-3500},
  doi        = {10.1145/2755561},
  url        = {https://doi.org/10.1145/2755561},
  abstract   = {BLIS is a new software framework for instantiating high-performance BLAS-like dense linear algebra libraries. We demonstrate how BLIS acts as a productivity multiplier by using it to implement the level-3 BLAS on a variety of current architectures. The systems for which we demonstrate the framework include state-of-the-art general-purpose, low-power, and many-core architectures. We show, with very little effort, how the BLIS framework yields sequential and parallel implementations that are competitive with the performance of ATLAS, OpenBLAS (an effort to maintain and extend the GotoBLAS), and commercial vendor implementations such as AMD’s ACML, IBM’s ESSL, and Intel’s MKL libraries. Although most of this article focuses on single-core implementation, we also provide compelling results that suggest the framework’s leverage extends to the multithreaded domain.},
  keywords   = {Linear algebra, matrix, BLAS, multiplication, libraries, high performance}
}
@article{vanzee2015blis,
  author     = {Van Zee, Field G. and van de Geijn, Robert A.},
  title      = {{BLIS}: A Framework for Rapidly Instantiating {BLAS} Functionality},
  journal    = {ACM Trans. Math. Softw.},
  volume     = {41},
  number     = {3},
  articleno  = {14},
  numpages   = {33},
  year       = {2015},
  month      = jun,
  issue_date = {June 2015},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {0098-3500},
  doi        = {10.1145/2764454},
  url        = {https://doi.org/10.1145/2764454},
  abstract   = {The BLAS-like Library Instantiation Software (BLIS) framework is a new infrastructure for rapidly instantiating Basic Linear Algebra Subprograms (BLAS) functionality. Its fundamental innovation is that virtually all computation within level-2 (matrix-vector) and level-3 (matrix-matrix) BLAS operations can be expressed and optimized in terms of very simple kernels. While others have had similar insights, BLIS reduces the necessary kernels to what we believe is the simplest set that still supports the high performance that the computational science community demands. Higher-level framework code is generalized and implemented in ISO C99 so that it can be reused and/or reparameterized for different operations (and different architectures) with little to no modification. Inserting high-performance kernels into the framework facilitates the immediate optimization of any BLAS-like operations which are cast in terms of these kernels, and thus the framework acts as a productivity multiplier. Users of BLAS-dependent applications are given a choice of using the traditional Fortran-77 BLAS interface, a generalized C interface, or any other higher level interface that builds upon this latter API. Preliminary performance of level-2 and level-3 operations is observed to be competitive with two mature open source libraries (OpenBLAS and ATLAS) as well as an established commercial product (Intel MKL).},
  keywords   = {libraries, BLAS, matrix, Linear algebra, high-performance}
}
@article{low2016analytical,
  author    = {Low, Tze Meng and Igual, Francisco D. and Smith, Tyler M. and Quintana-Ort{\'\i}, Enrique S.},
  title     = {Analytical modeling is enough for high-performance {BLIS}},
  journal   = {ACM Transactions on Mathematical Software (TOMS)},
  volume    = {43},
  number    = {2},
  pages     = {1--18},
  year      = {2016},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA}
}
@inproceedings{zulehner2019matrix,
  author    = {Zulehner, Alwin and Wille, Robert},
  title     = {Matrix-Vector vs. Matrix-Matrix Multiplication: Potential in {DD}-based Simulation of Quantum Computations},
  booktitle = {2019 Design, Automation \& Test in Europe Conference \& Exhibition (DATE)},
  pages     = {90--95},
  year      = {2019},
  doi       = {10.23919/DATE.2019.8714836}
}
@inproceedings{krol2014matrix,
  author    = {Krol, Dawid and Zydek, Dawid and Selvaraj, Henry},
  title     = {Matrix Multiplication in Multiphysics Systems Using {CUDA}},
  editor    = {Swi{\k{a}}tek, Jerzy and Grzech, Adam and Swi{\k{a}}tek, Pawe{\l} and Tomczak, Jakub M.},
  booktitle = {Advances in Systems Science},
  pages     = {493--502},
  year      = {2014},
  publisher = {Springer International Publishing},
  address   = {Cham},
  isbn      = {978-3-319-01857-7}
}
@article{akutsu2000algorithms,
  author  = {Akutsu, Tatsuya and Miyano, Satoru and Kuhara, Satoru},
  title   = {Algorithms for Identifying {Boolean} Networks and Related Biological Networks Based on Matrix Multiplication and Fingerprint Function},
  journal = {Journal of Computational Biology},
  volume  = {7},
  number  = {3-4},
  pages   = {331--343},
  year    = {2000},
  doi     = {10.1089/106652700750050817},
  url     = {https://doi.org/10.1089/106652700750050817},
  note    = {PMID: 11108466}
}
@article{weber2015semiempirical,
  author  = {Weber, Valéry and Laino, Teodoro and Pozdneev, Alexander and Fedulova, Irina and Curioni, Alessandro},
  title   = {Semiempirical Molecular Dynamics ({SEMD}) {I}: Midpoint-Based Parallel Sparse Matrix–Matrix Multiplication Algorithm for Matrices with Decay},
  journal = {Journal of Chemical Theory and Computation},
  volume  = {11},
  number  = {7},
  pages   = {3145--3152},
  year    = {2015},
  doi     = {10.1021/acs.jctc.5b00382},
  url     = {https://doi.org/10.1021/acs.jctc.5b00382},
  note    = {PMID: 26575751}
}
@article{strange2007efficient,
  author  = {Stange, Peter and Griewank, Andreas and Bollh{\"o}fer, Matthias},
  title   = {On the efficient update of rectangular {LU}-factorizations subject to low rank modifications},
  journal = {Electron. Trans. Numer. Anal.},
  volume  = {26},
  pages   = {161--177},
  year    = {2007}
}
@online{guennebaud2021eigen,
  author  = {Guennebaud, Ga{\"e}l and Jacob, Beno{\^\i}t and others},
  title   = {Eigen v3},
  date    = {2021-06-19},
  url     = {http://eigen.tuxfamily.org},
  urldate = {2021-06-22}
}
@inproceedings{xianyi2012model,
  author    = {Zhang, Xianyi and Wang, Qian and Zhang, Yunquan},
  title     = {Model-driven Level 3 {BLAS} Performance Optimization on {Loongson 3A} Processor},
  booktitle = {2012 IEEE 18th International Conference on Parallel and Distributed Systems},
  pages     = {684--691},
  year      = {2012},
  month     = dec,
  issn      = {1521-9097},
  doi       = {10.1109/ICPADS.2012.97}
}
@inbook{wang2014intel,
  author    = {Wang, Endong and Zhang, Qing and Shen, Bo and Zhang, Guangyong and Lu, Xiaowei and Wu, Qing and Wang, Yajuan},
  title     = {Intel Math Kernel Library},
  booktitle = {High-Performance Computing on the Intel® Xeon Phi{\texttrademark}: How to Fully Exploit MIC Architectures},
  pages     = {167--188},
  year      = {2014},
  publisher = {Springer International Publishing},
  address   = {Cham},
  isbn      = {978-3-319-06486-4},
  doi       = {10.1007/978-3-319-06486-4_7},
  url       = {https://doi.org/10.1007/978-3-319-06486-4_7}
}
@online{intel2021accelerate,
  author  = {Intel},
  title   = {Accelerate Fast Math with Intel® oneAPI Math Kernel Library},
  year    = {2021},
  url     = {https://software.intel.com/content/www/us/en/develop/documentation/onemkl-developer-reference-c/top.html},
  urldate = {2021-06-23}
}
@online{ibm2021engineering,
  author  = {IBM},
  title   = {Engineering and Scientific Subroutine Library 6.3},
  month   = jun,
  year    = {2020},
  url     = {https://www.ibm.com/docs/en/essl/6.3},
  urldate = {2021-06-23}
}
@online{nvidia2021cublas,
  author  = {NVIDIA},
  title   = {cuBLAS :: CUDA Toolkit Documentation},
  url     = {https://docs.nvidia.com/cuda/cublas/index.html},
  date    = {2021-05-20},
  urldate = {2021-06-23}
}
@article{gareev2018high,
  author     = {Gareev, Roman and Grosser, Tobias and Kruse, Michael},
  title      = {High-Performance Generalized Tensor Operations: A Compiler-Oriented Approach},
  journal    = {ACM Trans. Archit. Code Optim.},
  volume     = {15},
  number     = {3},
  articleno  = {34},
  numpages   = {27},
  month      = sep,
  year       = {2018},
  issue_date = {October 2018},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  issn       = {1544-3566},
  doi        = {10.1145/3235029},
  url        = {https://doi.org/10.1145/3235029},
  keywords   = {high-performance computing, Tensor contractions, matrix-matrix multiplication}
}
@article{grosser2012polly,
  author  = {Grosser, Tobias and Gr{\"o}{\ss}linger, Armin and Lengauer, Christian},
  title   = {{Polly} --- Performing Polyhedral Optimizations on a Low-Level Intermediate Representation},
  journal = {Parallel Processing Letters},
  volume  = {22},
  number  = {04},
  pages   = {1250010},
  year    = {2012},
  doi     = {10.1142/S0129626412500107},
  url     = {https://doi.org/10.1142/S0129626412500107}
}
@article{bondhugula2020high,
  author        = {Bondhugula, Uday},
  title         = {High Performance Code Generation in {MLIR:} An Early Case Study with {GEMM}},
  journal       = {CoRR},
  volume        = {abs/2003.00532},
  year          = {2020},
  archivePrefix = {arXiv},
  eprint        = {2003.00532},
  url           = {https://arxiv.org/abs/2003.00532},
  timestamp     = {Tue, 10 Mar 2020 13:33:48 +0100},
  biburl        = {https://dblp.org/rec/journals/corr/abs-2003-00532.bib},
  bibsource     = {dblp computer science bibliography, https://dblp.org}
}
@article{carvalho2021kernelfarer,
  author  = {de Carvalho, João P. L. and Kuzma, Braedy and Korostelev, Ivan and Amaral, José Nelson and Barton, Christopher and Moreira, José and Araujo, Guido},
  title   = {{KernelFaRer}: Replacing Native-Code Idioms with High-Performance Library Calls},
  journal = {ACM Transactions on Architecture and Code Optimization ({TACO})},
  year    = {2021},
  note    = {To Appear}
}
@manual{PowerISA,
  author       = {IBM},
  title        = {Power® {ISA} Version 3.1},
  organization = {IBM},
  year         = {2020},
  month        = may,
  url          = {https://ibm.ent.box.com/s/hhjfw0x0lrbtyzmiaffnbxh2fuo0fog0}
}
@manual{IntelISA,
  title        = {Intel® {Architecture} {Instruction} {Set} {Extensions} and {Future} {Features} Programming Reference},
  organization = {Intel Corporation},
  year         = {2021},
  month        = feb,
  url          = {https://software.intel.com/content/dam/develop/external/us/en/documents-tps/architecture-instruction-set-extensions-programming-reference.pdf}
}
@manual{ArmISA,
  title        = {Arm® {Architecture} {Reference} {Manual} {Armv8}, for {Armv8-A} {Architecture} {Profile}},
  organization = {Arm Limited},
  year         = {2021},
  month        = jan,
  url          = {https://developer.arm.com/documentation/ddi0487/latest/}
}
@article{kuck1968illiac,
  author    = {Kuck, D.},
  title     = {{ILLIAC IV} Software and Application Programming},
  journal   = {IEEE Transactions on Computers},
  volume    = {C-17},
  number    = {8},
  pages     = {758--770},
  year      = {1968},
  month     = aug,
  publisher = {IEEE Computer Society},
  address   = {Los Alamitos, CA, USA},
  issn      = {1557-9956},
  doi       = {10.1109/TC.1968.229159},
  keywords  = {applications of array computer;array computer;array language;compiler;operating system}
}
@article{hassan20161performance,
  author   = {Hassan, Somaia Awad and Hemeida, A. M. and Mahmoud, Mountasser M. M.},
  title    = {Performance Evaluation of Matrix-Matrix Multiplications Using {Intel's} {Advanced Vector Extensions} ({AVX})},
  journal  = {Microprocessors and Microsystems},
  year     = {2016},
  volume   = {47},
  pages    = {369--374},
  issn     = {0141-9331},
  doi      = {10.1016/j.micpro.2016.10.002},
  url      = {https://www.sciencedirect.com/science/article/pii/S0141933116302502},
  keywords = {Advanced vector extension (AVX), Matrix-matrix multiplications, Intrinsic functions, Inline assembly, Intel C++ compiler, Microsoft VC++ compiler}
}
@article{hemeida2020optimizing,
  author   = {Hemeida, A. M. and Hassan, S. A. and Alkhalaf, Salem and Mahmoud, M. M. M. and Saber, M. A. and {Bahaa Eldin}, Ayman M. and Senjyu, Tomonobu and Alayed, Abdullah H.},
  title    = {Optimizing Matrix-Matrix Multiplication on {Intel's} {Advanced Vector Extensions} Multicore Processor},
  journal  = {Ain Shams Engineering Journal},
  year     = {2020},
  volume   = {11},
  number   = {4},
  pages    = {1179--1190},
  issn     = {2090-4479},
  doi      = {10.1016/j.asej.2020.01.003},
  url      = {https://www.sciencedirect.com/science/article/pii/S2090447920300058},
  keywords = {Intel's AVX, Intel MKL SGEMM, Matrix-matrix multiplications, Optimization, Multicore}
}
@inproceedings{alappat2020understanding,
  author    = {Alappat, Christie L. and Hofmann, Johannes and Hager, Georg and Fehske, Holger and Bishop, Alan R. and Wellein, Gerhard},
  editor    = {Sadayappan, Ponnuswamy and Chamberlain, Bradford L. and Juckeland, Guido and Ltaief, Hatem},
  title     = {Understanding {HPC} Benchmark Performance on {Intel} {Broadwell} and {Cascade Lake} Processors},
  booktitle = {High Performance Computing},
  address   = {Frankfurt am Main, Germany},
  month     = jun,
  year      = {2020},
  pages     = {412--433},
  isbn      = {978-3-030-50743-5}
}
@inproceedings{poenaru2020evaluating,
  title     = {Evaluating the Effectiveness of a Vector-Length-Agnostic Instruction Set},
  author    = {Poenaru, Andrei and McIntosh-Smith, Simon},
  editor    = {Malawski, Maciej and Rzadca, Krzysztof},
  booktitle = {Euro-Par 2020: Parallel Processing},
  publisher = {Springer International Publishing},
  address   = {Cham},
  year      = {2020},
  pages     = {98--114},
  isbn      = {978-3-030-57675-2}
}
@inproceedings{larsen2001fast,
  title     = {Fast Matrix Multiplies Using Graphics Hardware},
  author    = {Larsen, E. Scott and McAllister, David},
  booktitle = {Proceedings of the 2001 ACM/IEEE Conference on Supercomputing},
  series    = {SC '01},
  location  = {Denver, Colorado},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  year      = {2001},
  pages     = {55},
  numpages  = {1},
  isbn      = {158113293X},
  doi       = {10.1145/582034.582089},
  url       = {https://doi.org/10.1145/582034.582089},
  keywords  = {graphics hardware, matrix multiplication}
}
@inproceedings{fatahalian2004understanding,
  author    = {Fatahalian, K. and Sugerman, J. and Hanrahan, P.},
  title     = {Understanding the Efficiency of {GPU} Algorithms for Matrix-Matrix Multiplication},
  booktitle = {Proceedings of the ACM SIGGRAPH/EUROGRAPHICS Conference on Graphics Hardware},
  series    = {HWWS '04},
  location  = {Grenoble, France},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  year      = {2004},
  pages     = {133--137},
  numpages  = {5},
  isbn      = {3905673150},
  doi       = {10.1145/1058129.1058148},
  url       = {https://doi.org/10.1145/1058129.1058148}
}
@inproceedings{li2011strassens,
  author    = {Li, Junjie and Ranka, Sanjay and Sahni, Sartaj},
  title     = {{Strassen's} Matrix Multiplication on {GPUs}},
  booktitle = {2011 {IEEE} 17th International Conference on Parallel and Distributed Systems},
  year      = {2011},
  pages     = {157--164},
  doi       = {10.1109/ICPADS.2011.130}
}
@inproceedings{nath2011accelerating,
  author    = {Nath, Rajib and Tomov, Stanimire and Dongarra, Jack},
  editor    = {Palma, Jos{\'e} M. Laginha M. and Dayd{\'e}, Michel and Marques, Osni and Lopes, Jo{\~a}o Correia},
  title     = {Accelerating {GPU} Kernels for Dense Linear Algebra},
  booktitle = {High Performance Computing for Computational Science -- VECPAR 2010},
  publisher = {Springer Berlin Heidelberg},
  address   = {Berlin, Heidelberg},
  year      = {2011},
  pages     = {83--92},
  isbn      = {978-3-642-19328-6}
}
@inproceedings{han2019distme,
  author    = {Han, Donghyoung and Nam, Yoon-Min and Lee, Jihye and Park, Kyongseok and Kim, Hyunwoo and Kim, Min-Soo},
  title     = {{DistME}: A Fast and Elastic Distributed Matrix Computation Engine Using {GPUs}},
  booktitle = {Proceedings of the 2019 International Conference on Management of Data},
  series    = {SIGMOD '19},
  location  = {Amsterdam, Netherlands},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  year      = {2019},
  pages     = {759--774},
  numpages  = {16},
  isbn      = {9781450356435},
  doi       = {10.1145/3299869.3319865},
  url       = {https://doi.org/10.1145/3299869.3319865},
  keywords  = {matrix multiplication, distributed data-parallel system, gpu computation}
}
@article{jouppi2017datacenter,
  author     = {Jouppi, Norman P. and Young, Cliff and Patil, Nishant and Patterson, David and Agrawal, Gaurav and Bajwa, Raminder and Bates, Sarah and Bhatia, Suresh and Boden, Nan and Borchers, Al and Boyle, Rick and Cantin, Pierre-luc and Chao, Clifford and Clark, Chris and Coriell, Jeremy and Daley, Mike and Dau, Matt and Dean, Jeffrey and Gelb, Ben and Ghaemmaghami, Tara Vazir and Gottipati, Rajendra and Gulland, William and Hagmann, Robert and Ho, C. Richard and Hogberg, Doug and Hu, John and Hundt, Robert and Hurt, Dan and Ibarz, Julian and Jaffey, Aaron and Jaworski, Alek and Kaplan, Alexander and Khaitan, Harshit and Killebrew, Daniel and Koch, Andy and Kumar, Naveen and Lacy, Steve and Laudon, James and Law, James and Le, Diemthu and Leary, Chris and Liu, Zhuyuan and Lucke, Kyle and Lundin, Alan and MacKean, Gordon and Maggiore, Adriana and Mahony, Maire and Miller, Kieran and Nagarajan, Rahul and Narayanaswami, Ravi and Ni, Ray and Nix, Kathy and Norrie, Thomas and Omernick, Mark and Penukonda, Narayana and Phelps, Andy and Ross, Jonathan and Ross, Matt and Salek, Amir and Samadiani, Emad and Severn, Chris and Sizikov, Gregory and Snelham, Matthew and Souter, Jed and Steinberg, Dan and Swing, Andy and Tan, Mercedes and Thorson, Gregory and Tian, Bo and Toma, Horia and Tuttle, Erick and Vasudevan, Vijay and Walter, Richard and Wang, Walter and Wilcox, Eric and Yoon, Doe Hyun},
  title      = {In-Datacenter Performance Analysis of a {Tensor Processing Unit}},
  journal    = {SIGARCH Comput. Archit. News},
  year       = {2017},
  month      = jun,
  issue_date = {May 2017},
  volume     = {45},
  number     = {2},
  pages      = {1--12},
  numpages   = {12},
  issn       = {0163-5964},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  doi        = {10.1145/3140659.3080246},
  url        = {https://doi.org/10.1145/3140659.3080246},
  keywords   = {neural network, TensorFlow, CNN, accelerator, GPU, DNN, domain-specific architecture, TPU, RNN, MLP, deep learning, LSTM}
}
% The exported title had lost its ampersand ("Performance Precision");
% it is restored below as an escaped \& so it typesets correctly.
@inproceedings{markidis2018nvidia,
  author    = {Markidis, Stefano and Chien, Steven Wei Der and Laure, Erwin and Peng, Ivy Bo and Vetter, Jeffrey S.},
  title     = {{NVIDIA} {Tensor Core} Programmability, Performance \& Precision},
  booktitle = {2018 {IEEE} International Parallel and Distributed Processing Symposium Workshops ({IPDPSW})},
  year      = {2018},
  pages     = {522--531},
  doi       = {10.1109/IPDPSW.2018.00091}
}
@inproceedings{liao2019davinci,
  author       = {Liao, Heng and Tu, Jiajin and Xia, Jing and Zhou, Xiping},
  title        = {{DaVinci}: A Scalable Architecture for Neural Network Computing},
  booktitle    = {2019 {IEEE} Hot Chips 31 Symposium ({HCS})},
  year         = {2019},
  pages        = {1--44},
  organization = {IEEE Computer Society}
}
@article{wang2019benchmarking,
  author        = {Yu Wang and Gu{-}Yeon Wei and David Brooks},
  title         = {Benchmarking {TPU}, {GPU}, and {CPU} Platforms for Deep Learning},
  journal       = {CoRR},
  volume        = {abs/1907.10701},
  year          = {2019},
  url           = {https://arxiv.org/abs/1907.10701},
  archivePrefix = {arXiv},
  eprint        = {1907.10701},
  timestamp     = {Thu, 01 Aug 2019 08:59:33 +0200},
  biburl        = {https://dblp.org/rec/journals/corr/abs-1907-10701.bib},
  bibsource     = {dblp computer science bibliography, https://dblp.org}
}
% The url points at the public DOI resolver rather than an institution-specific
% library proxy, and the percent signs in the abstract are escaped so the field
% stays valid LaTeX if a toolchain copies it into generated output.
@inproceedings{gu2020bandwidth,
  author    = {Gu, Zhixiang and Moreira, Jose and Edelsohn, David and Azad, Ariful},
  title     = {Bandwidth Optimized Parallel Algorithms for Sparse Matrix-Matrix Multiplication Using Propagation Blocking},
  booktitle = {Proceedings of the 32nd ACM Symposium on Parallelism in Algorithms and Architectures},
  series    = {SPAA '20},
  location  = {Virtual Event, USA},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  year      = {2020},
  pages     = {293--303},
  numpages  = {11},
  isbn      = {9781450369350},
  doi       = {10.1145/3350755.3400216},
  url       = {https://doi.org/10.1145/3350755.3400216},
  keywords  = {SpGEMM, parallel algorithm},
  abstract  = {Sparse matrix-matrix multiplication (SpGEMM) is a widely used kernel in various graph, scientific computing and machine learning algorithms. It is well known that SpGEMM is a memory-bound operation, and its peak performance is expected to be bound by the memory bandwidth. Yet, existing algorithms fail to saturate the memory bandwidth, resulting in suboptimal performance under the Roofline model. In this paper, we characterize existing SpGEMM algorithms based on their memory access patterns and develop practical lower and upper bounds for SpGEMM performance. We then develop an SpGEMM algorithm based on the outer product. The newly developed algorithm called PB-SpGEMM saturates memory bandwidth by using the propagation blocking technique and by performing in-cache sorting and merging. For many practical matrices, PB-SpGEMM runs 20\%-50\% faster than the state-of-the-art heap and hash SpGEMM algorithms on modern multicore processors. Most importantly, PB-SpGEMM attains performance predicted by the Roofline model, and its performance remains stable with respect to matrix size and sparsity.}
}
@online{googlebench,
  organization = {Google},
  title        = {google/benchmark: A microbenchmark support library},
  url          = {https://github.com/google/benchmark},
  urldate      = {2021-07-18},
  date         = {2021-07-21}
}
@article{moreira2021matrix,
author = {Jos{\'{e}} E. Moreira and
Kit Barton and
Steven Battle and
Peter Bergner and
Ramon Bertran and
Puneeth Bhat and
Pedro Caldeira and
David Edelsohn and
Gordon Fossum and
Brad Frey and
Nemanja Ivanovic and
Chip Kerchner and
Vincent Lim and
Shakti Kapoor and
Tulio Machado Filho and
Silvia Melitta Mueller and
Brett Olsson and
Satish Sadasivam and
Baptiste Saleil and