-
Notifications
You must be signed in to change notification settings - Fork 0
/
01.tex
726 lines (589 loc) · 19.3 KB
/
01.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
\documentclass[10pt,mathserif]{beamer}
\usepackage[english]{babel}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{cmbright}
\usepackage{tikz}
\usepackage{listings}
\usepackage{relsize}
\usetikzlibrary{arrows}
\usetikzlibrary{backgrounds}
\usetikzlibrary{chains}
\usetikzlibrary{fit}
\usetikzlibrary{positioning}
\usetikzlibrary{scopes}
\usetikzlibrary{trees}
\usetheme{Warsaw}
\useoutertheme{essential}
\setbeamertemplate{navigation symbols}{}
\author{Stefano Cherubin}
\institute{Politecnico di Milano}
\date{03-05-2019}
\title{Introduction to LLVM compiler framework}
\newcommand{\customdata}{Stefano Cherubin <[email protected]>}
\renewcommand{\ttdefault}{pxtt}
\lstset{basicstyle=\ttfamily\scriptsize}
\newcommand{\cinput}[1]{\lstinputlisting[language=C]{#1}}
\newcommand{\cinline}[1]{\lstinline[language=C]!#1!}
\newcommand{\llvminput}[1]{\lstinputlisting[language=LLVM]{#1}}
\newcommand{\llvminline}[1]{\lstinline[language=LLVM]!#1!}
\lstdefinelanguage{LLVM}%
{morekeywords={define,declare,global,constant,internal,external,private,%
linkonce,linkonce_odr,weak,weak_odr,appending,common,extern_weak,%
thread_local,dllimport,dllexport,hidden,protected,default,except,deplibs,%
volatile,fastcc,coldcc,cc,ccc,x86_stdcallcc,x86_fastcallcc,ptx_kernel,%
ptx_device,signext,zeroext,inreg,sret,nounwind,noreturn,nocapture,byval,%
nest,readnone,readonly,noalias,uwtable,inlinehint,noinline,alwaysinline,%
optsize,ssp,sspreq,noredzone,noimplicitfloat,naked,alignstack,module,asm,%
align,tail,to,addrspace,section,alias,sideeffect,c,gc,target,datalayout,%
triple,blockaddress},%
morekeywords=[2]{add,fadd,sub,fsub,mul,fmul,sdiv,udiv,fdiv,srem,urem,frem,%
and,or,xor,icmp,fcmp,eq,ne,ugt,uge,ult,ule,sgt,sge,slt,sle,oeq,ogt,oge,%
olt,ole,one,ord,ueq,ugt,uge,ult,ule,une,uno,nuw,nsw,exact,inbounds,phi,%
call,select,shl,lshr,ashr,va_arg,trunc,zext,sext,fptrunc,fpext,fptoui,%
fptosi,uitofp,sitofp,ptrtoint,inttoptr,bitcast,ret,br,indirectbr,switch,%
invoke,unwind,unreachable,malloc,alloca,free,load,store,getelementptr,%
extractelement,insertelement,shufflevector,extractvalue,insertvalue},%
sensitive=t,%
morestring=[b]",%
morecomment=[l];%
}[keywords,comments,strings]
\AtBeginSection[]
{
\begin{frame}{Contents}
\tableofcontents[currentsection]
\end{frame}
}
\begin{document}
\begin{frame}
\maketitle
\begin{center}
\itshape\scriptsize This material is strongly based on material produced by
Michele Scandale and Ettore Speziale for the course
``Code Optimizations and Transformations''.
\end{center}
\end{frame}
\section{Introduction}
\begin{frame}{Compilers and compilers}
When approaching compilers, we need to understand the difference between a
\emph{toy-compiler} and \emph{production-quality compiler}.
\begin{columns}[t]
\column{.40\textwidth}
\begin{block}{Toy Compiler}
\begin{itemize}
\item small code-base
\item easy to make tiny edits
\item impossible to make normal/big edits
\end{itemize}
\end{block}
\column{.50\textwidth}
\begin{block}{Production-Quality Compiler}
\begin{itemize}
\item huge code-base
\item difficult to perform any kind of edit
\item compiler-code extremely optimized
\end{itemize}
\end{block}
\end{columns}
\vfill
Key concepts:
\begin{itemize}
\item working with a production-quality compiler is \emph{initially} \alert{hard},
but \ldots
\item \ldots a huge set of tools for analyzing/transforming/testing code
is provided -- toy compilers \alert{miss these things}!
\end{itemize}
\end{frame}
\begin{frame}{LLVM: Low Level Virtual Machine}
Initially started as a research project at Urbana-Champaign:
\begin{itemize}
\item now intensively used for \alert{research} involving compilers
\item key technology for \alert{leading industries} -- AMD, Apple, Intel,
NVIDIA
\end{itemize}
\vfill
If you are there, then it is \alert{your key-technology}:
\begin{itemize}
\item open-source compilers: GCC~\cite{LOCAL:www/gcc}, LLVM~\cite{LOCAL:www/llvm}
\item LLVM is relatively \alert{young} -- GCC performance may be better -- \ldots
\item \ldots LLVM is more modular, well written, kept \emph{clean} by developers.
\end{itemize}
\end{frame}
\section{Compiler organization}
\begin{frame}{Compiler pipeline}
Typically a compiler is a \alert{pipeline}:
\begin{figure}
\centering
\input{img/01/compiler-structure.tex}
\end{figure}
\vfill
There are three main components:
\begin{description}
\item[Front-end] \alert{translate} a source file into the intermediate representation
\item[Middle-end] \alert{analyze} intermediate representation, \alert{optimize}
it
\item[Back-end] \alert{generate} target machine assembly from the intermediate
representation
\end{description}
\end{frame}
\begin{frame}{Compiler pipeline}{Internal pipelines}
Each component is composed internally by pipelines:
\begin{itemize}
\item simple model -- read something, produce something
\item specify only how to transform input data into output data
\end{itemize}
\vfill
Complexity lies in \alert{chaining} together stages.
\vfill
\end{frame}
\begin{frame}{Compiler pipeline}
We will consider only the \emph{middle-end}: same concepts are
valid also for \{front,back\}-end.
\vfill
Technical terms:
\begin{description}
\item[Pass] a pipeline stage
\item[IR] (a.k.a. Intermediate Representation) is the language used in the
middle-end.
\end{description}
\vfill
The \alert{pass manager} manages a set of passes:
\begin{itemize}
\item build the compilation pipeline: \alert{schedule} passes together
according to \alert{dependencies}.
\end{itemize}
\vfill
Dependencies are \alert{hints} used by the pass manager in order to schedule passes.
\end{frame}
\begin{frame}{First insights}
A compiler is \alert{complex}:
\begin{itemize}
\item passes are the \alert{elementary unit of work}
\item pass manager must be \alert{advised} about pass chaining
\item pipeline shapes are \alert{not fixed} -- it can change from one compiler
execution to another\footnote{e.g.\ optimized/not optimized builds, compiler options, \ldots}
\end{itemize}
\vfill
Moreover, compilers must be \alert{conservative}:
\begin{itemize}
\item apply a transformation only if program \alert{semantics are preserved}
\end{itemize}
\vfill
Compiler algorithms are designed differently w.r.t. standard algorithms!
\end{frame}
\section{Algorithm design}
\begin{frame}{Classical Algorithm Design}
Dealing with algorithm design, a good approach is the following:
\begin{enumerate}
\item study the problem
\item make some examples
\item identify the \alert{common case}
\item derive the algorithm for the common case
\item add handling for \alert{corner cases}
\item improve performance by \alert{optimizing the common case}
\end{enumerate}
\vfill
Weakness of the approach:
\begin{itemize}
\item \alert{corner cases} -- a \emph{correct} algorithm \textbf{must} consider \emph{all the corner cases}!
\end{itemize}
\end{frame}
\begin{frame}{Compiler Algorithm Design}{Be Conservative}
Corner cases are difficult to handle:
\begin{itemize}
\item compiler algorithms must be \alert{proved} to preserve program semantics
\item having a common methodology helps on that
\end{itemize}
\vfill
Compiler algorithms are built combining three kind of \alert{passes}:
\begin{itemize}
\item analysis
\item optimization
\item (normalization)
\end{itemize}
\pause
\vfill
We now consider a simple example: \emph{loop hoisting}.
\end{frame}
\begin{frame}{Loop Hoisting}
It is a transformation that:
\begin{itemize}
\item looks for statements (inside a loop) not depending on the loop state
\item moves them outside the loop body
\end{itemize}
\begin{columns}[t]
\column{.45\textwidth}
\begin{block}{Loop Hoisting -- Before}
\centering
\cinput{snippet/01/loop-hoisting-before.c}
\end{block}
\column{.45\textwidth}
\begin{block}{Loop Hoisting -- After}
\centering
\cinput{snippet/01/loop-hoisting-after.c}
\end{block}
\end{columns}
\end{frame}
\begin{frame}{Loop Hoisting}{Focus on the Transformation}
\begin{block}{Transformation}
The transformation is trivial:
\begin{itemize}
\item move ``good'' statements outside of the loop
\end{itemize}
\end{block}
This is the \alert{optimization pass}. It needs to know:
\begin{itemize}
\item which pieces of code are loops
\item which statements are ``good'' statements
\end{itemize}
They are \alert{analysis}, which have to be implemented by other passes:
\begin{itemize}
\item detecting loops in the program
\item detecting loop-independent statements
\end{itemize}
When registering loop hoisting, also declare needed analysis:
\begin{itemize}
\item pipeline automatically built: \alert{analysis $\rightarrow$ optimization}
\end{itemize}
\end{frame}
\begin{frame}{Loop Hoisting}{Proving Program Semantic Preservation}
The \alert{proof} is trivial:
\begin{itemize}
\item transformation is correct if analyses are correct, but \ldots
\item \ldots usually analyses are built starting from other analyses already
implemented inside the compiler
\end{itemize}
\vfill
You have to prove that combining all analysis information gives you a
correct view of the code:
\begin{itemize}
\item analysis information cannot induce optimization passes to apply a
transformation that does not preserve program semantics
\end{itemize}
\end{frame}
\begin{frame}{Loop Hoisting}{More Loops}
We have spoken about loops, but which kind of loop?
\begin{itemize}
\item \cinline{do-while} loops?
\item \cinline{while} loop?
\item \cinline{for} loops?
\end{itemize}
We have seen loop hoisting on:
\begin{itemize}
\item \cinline{do-while} loops
\end{itemize}
What about other kinds of loops?
\begin{itemize}
\item they must be normalized -- i.e. transformed to \cinline{do-while} loops
\end{itemize}
\alert{Normalization passes} do that:
\begin{itemize}
\item before running loop hoisting, you must tell the pass manager that loop
normalization must be run first
\end{itemize}
This allows recognizing more loops, thus potentially
\alert{improving optimization impact}!
\end{frame}
\begin{frame}{Compiler Algorithm Design}{A methodology}
You have to:
\begin{enumerate}
\item analyze the problem
\item make some examples
\item detect the common case
\item declare the \alert{input format}
\item declare \alert{analysis} you need
\item design an \alert{optimization} pass
\item prove its \alert{correctness}
\item improve algorithm performance by acting on the common case -- the only
considered up to now. Please notice that corner cases are not considered
-- just do not try to optimize the corner cases
\item improve the effectiveness of the algorithm by adding
\alert{normalization passes}
\end{enumerate}
\end{frame}
\section{Inside LLVM}
\begin{frame}{Terminology}{Speaking About LLVM IR}
LLVM IR comes with 3 different flavours:
\begin{description}
\item[assembly] human-readable format
\item[bitcode] binary on-disk machine-oriented format
\item[in-memory] binary in-memory format, used during compilation process
\end{description}
All formats have the same expressiveness!
\vfill
File extensions:
\begin{description}
\item[.ll] for assembly files
\item[.bc] for bitcode files
\end{description}
\end{frame}
\begin{frame}{Tools}{C Language Family Front-end}
Writing LLVM assembly by hand is unfeasible:
\begin{itemize}
\item different front-ends available for LLVM
\item use Clang~\cite{LOCAL:www/clang} for the C family
\end{itemize}
The clang driver is compatible with GCC:
\begin{itemize}
\item $\approx$ same command line options
\end{itemize}
\vfill
To generate LLVM IR:
\begin{description}
\item[assembly] \texttt{\smaller clang -emit-llvm -S -o out.ll in.c}
\item[bitcode] \texttt{\smaller clang -emit-llvm -o out.bc in.c}
\end{description}
It can also generate native code starting from LLVM assembly or LLVM bitcode --
like compiling an assembly file with GCC
\end{frame}
\begin{frame}{Tools}{Playing with LLVM Passes}
LLVM IR can be manipulated using \texttt{\smaller opt}:
\begin{itemize}
\item read an input file
\item run specified LLVM passes on it
\item respecting user-provided order
\end{itemize}
\vfill
Useful passes:
\begin{itemize}
\item print CFG with \texttt{\smaller opt -view-cfg input.ll}
\item print dominator tree with \texttt{\smaller opt -view-dom input.ll}
\item \ldots
\end{itemize}
Pass chaining:
\begin{itemize}
\item run \emph{mem2reg}, then view the CFG with \\
\texttt{\smaller opt -mem2reg -view-cfg input.ll}
\item potentially different results using different option order (\alert{phase/stage ordering})
\end{itemize}
\end{frame}
\begin{frame}{Pass Hierarchy}
LLVM provides a lot of passes:
\begin{itemize}
\item try \texttt{\smaller opt -help}
\end{itemize}
\vfill
For performance reasons there are different kind of passes:
\begin{block}{LLVM Passes}
\input{img/01/llvm-passes.tex}
\centering
\end{block}
\end{frame}
\begin{frame}{LLVM Passes}
Each pass kind visits particular elements of a module:
\begin{description}[align=left, labelwidth=1cm]
\item[ImmutablePass] compiler configuration -- never run
\item[CallGraphSCCPass] post-order visit of CallGraph SCCs
\item[ModulePass] visit the whole module
\item[FunctionPass] visit functions
\item[LoopPass] post-order visit of loop nests
%\item[BasicBlockPass] visit basic blocks % DEPRECATED AND REMOVED
\item[RegionPass] visit a custom-defined region of code
\end{description}
\vfill
Specializations come with restrictions:
\begin{itemize}
\item e.g. a \alert{FunctionPass} cannot add or delete functions
\item refer to ``Writing an LLVM Pass''~\cite{LOCAL:www/llvmWritingAPass}
for accurate description of features and limitations of each kind of pass
\end{itemize}
\end{frame}
% \begin{frame}{Examples}
% Now we will see very simple passes:
%
% \begin{itemize}
% \item some of them are meaningless
% \item goal is to show you the LLVM API
% \end{itemize}
%
% \vfill
%
% The passes are:
% \begin{description}
% \item[instruction-count] simple instruction counting analysis
% \item[hello-llvm] optimization pass building an hello-world program
% \item[function-eraser] optimization pass removing ``small'' functions
% \end{description}
%
% \vfill
% Hint: take the LLVM pass writing tutorial~\cite{LOCAL:www/llvmWritingAPass}
% \end{frame}
\begin{frame}{What is Available Inside LLVM?}
LLVM provides passes performing basic transformations:
\begin{itemize}
\item variables promotion
\item loops canonicalization
\item \ldots
\end{itemize}
\vfill
They can be used to \alert{normalize/canonicalize} the input
\begin{itemize}
\item transform into a form analyzable for further passes
\item it is essential because it keeps pass implementations manageable
\end{itemize}
\end{frame}
\section{LLVM-IR language}
\begin{frame}{LLVM IR}
LLVM IR~\cite{LOCAL:www/llvmLanguageRef} language is RISC-based:
\begin{itemize}
\item instructions operate on \alert{variables}\footnote{Virtual registers}
\item only \llvminline{load} and \llvminline{store} access memory
\item \llvminline{alloca} used to reserve memory on function stacks
\end{itemize}
\vfill
There are also few \alert{high level instructions}:
\begin{itemize}
\item function call -- \llvminline{call}
\item pointer arithmetics -- \llvminline{getelementptr}
\item \ldots
\end{itemize}
\end{frame}
\begin{frame}{LLVM IR}{Types \& Variables}
LLVM IR is \alert{strongly typed}:
\begin{itemize}
\item e.g. you cannot assign a floating point value to an integer variable
without an explicit cast
\end{itemize}
\alert{Almost everything} is \alert{typed} -- e.g.:
\begin{description}
\item[functions] \llvminline{@fact} -- \llvminline{i32 (i32)}
\item[statements] \llvminline{\%3 = icmp eq i32 \%2, 0} -- \llvminline{i1}
\end{description}
A variable can be:
\begin{description}
\item[global] \llvminline{@var = common global i32 0, align 4}
\item[function parameter] \llvminline{define i32 @fact(i32 \%n)}
\item[local] \llvminline{\%2 = load i32, i32* \%1, align 4}
\end{description}
Local variables are defined by statements
\end{frame}
\begin{frame}{LLVM IR}{Example: factorial}
\begin{center}
\llvminput{snippet/01/fact.ll}
\end{center}
\end{frame}
\begin{frame}{LLVM IR Language}{Static Single Assignment}
LLVM IR is SSA-based:
\begin{itemize}
\item every variable is \alert{statically assigned} exactly \alert{once}
\end{itemize}
Statically means that:
\begin{itemize}
\item inside each function
\item for each variable \llvminline{\%foo}
\item there is only one statement in the form \llvminline{\%foo = ...}
\end{itemize}
Static is different from dynamic:
\begin{itemize}
\item a static assignment can be executed more than once
\end{itemize}
\end{frame}
\begin{frame}{Static Single Assignment}{Examples}
\begin{block}{Scalar SAXPY}
\centering
\cinput{snippet/02/scalar-saxpy.c}
\end{block}
\begin{block}{Scalar LLVM SAXPY}
\centering
\llvminput{snippet/02/scalar-saxpy.ll}
\end{block}
Temporary \llvminline{\%1} not reused! \llvminline{\%2} is used for the second
assignment!
\end{frame}
\begin{frame}{Static Single Assignment}{Examples}
\begin{block}{Array SAXPY}
\centering
\cinput{snippet/02/array-saxpy.c}
\end{block}
\begin{block}{Array LLVM SAXPY}
\centering
\llvminput{snippet/02/array-saxpy.ll}
\end{block}
One assignment for loop counter \llvminline{\%i.0}
\end{frame}
\begin{frame}{Static Single Assignment}{Handling Multiple Assignments}
\begin{block}{Max}
\centering
\cinput{snippet/02/max.c}
\end{block}
\begin{block}{LLVM Max -- Bad}
\centering
\llvminput{snippet/02/bad-max.ll}
\end{block}
Why is it bad?
\end{frame}
\begin{frame}{Static Single Assignment}{Use \llvminline{phi} to Avoid Troubles}
The \llvminline{\%2} variable must be statically set once
\begin{block}{LLVM Max}
\centering
\llvminput{snippet/02/good-max.ll}
\end{block}
The \llvminline{phi} instruction is a \emph{conditional move}:
\begin{itemize}
\item it takes $(variable_i, label_i)$ pairs
\item if coming from predecessor identified by $label_i$, its value is $variable_i$
\end{itemize}
\end{frame}
\begin{frame}{Static Single Assignment}{Definition and Uses}
Each SSA variable is set only once:
\begin{itemize}
\item variable \alert{definition}
\end{itemize}
\vfill
Each SSA variable can be used by multiple instructions:
\begin{itemize}
\item variable \alert{uses}
\end{itemize}
\vfill
Algorithms and technical language abuse these terms:
\vfill
\emph{
Let \llvminline{\%foo} be a variable. If \llvminline{\%foo} definition has no
side effects, and no uses, dead-code elimination can be efficiently performed
by erasing \llvminline{\%foo} definition from the CFG.
}
\end{frame}
\begin{frame}{Static Single Assignment}{Rationale}
Old compilers are not SSA-based:
\begin{itemize}
\item putting input into SSA-form is expensive
\item cost must be amortized
\end{itemize}
\vfill
New compilers are SSA-based:
\begin{itemize}
\item SSA easier to work with
\item SSA-based analysis/optimizations faster
\end{itemize}
\vfill
%All modern compilers are SSA-based:
%
%\begin{itemize}
%\item exception are old version of the HotSpot Client compiler
%\end{itemize}
\end{frame}
\section{Conclusions}
\begin{frame}{Conclusions}
LLVM is a \alert{production-quality} compiler framework:
\begin{itemize}
\item[$\Rightarrow$] impossible to know all details
\end{itemize}
\vfill
But:
\begin{itemize}
\item it is well organized
\item given you know compiler theory, it is relatively easy to find what you need inside its sources
\end{itemize}
\vfill
Please take into account C++:
\begin{itemize}
\item basic skills required
\end{itemize}
\end{frame}
\section{Bibliography}
\begin{frame}[allowframebreaks]{Bibliography}
\nocite{*}
\bibliographystyle{unsrt}
\bibliography{bibliography}
\end{frame}
\end{document}