Skip to content

Commit 9d7883c

Browse files
committed
Improve cmm peephole optimizations
This will be used by a future PR to simplify casting without impacting performance as much.
1 parent 8580d64 commit 9d7883c

File tree

5 files changed

+728
-124
lines changed

5 files changed

+728
-124
lines changed

backend/amd64/cfg_selection.ml

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ class selector =
194194
method! select_operation op args dbg ~label_after =
195195
match op with
196196
(* Recognize the LEA instruction *)
197-
| Caddi | Caddv | Cadda | Csubi -> (
197+
| Caddi | Caddv | Cadda | Csubi | Cor -> (
198198
match self#select_addressing Word_int (Cop (op, args, dbg)) with
199199
| Iindexed _, _ | Iindexed2 0, _ ->
200200
super#select_operation op args dbg ~label_after
@@ -252,13 +252,18 @@ class selector =
252252
| Cbswap { bitwidth } ->
253253
let bitwidth = select_bitwidth bitwidth in
254254
specific (Ibswap { bitwidth }), args
255-
(* Recognize sign extension *)
256255
| Casr -> (
256+
(* Recognize sign extension *)
257257
match args with
258258
| [Cop (Clsl, [k; Cconst_int (32, _)], _); Cconst_int (32, _)] ->
259259
specific Isextend32, [k]
260260
| _ -> super#select_operation op args dbg ~label_after)
261261
(* Recognize zero extension *)
262+
| Clsr -> (
263+
match args with
264+
| [Cop (Clsl, [k; Cconst_int (32, _)], _); Cconst_int (32, _)] ->
265+
specific Izextend32, [k]
266+
| _ -> super#select_operation op args dbg ~label_after)
262267
| Cand -> (
263268
match args with
264269
| [arg; Cconst_int (0xffff_ffff, _)]

backend/amd64/selection.ml

Lines changed: 352 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,352 @@
1+
(**************************************************************************)
2+
(* *)
3+
(* OCaml *)
4+
(* *)
5+
(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *)
6+
(* *)
7+
(* Copyright 2000 Institut National de Recherche en Informatique et *)
8+
(* en Automatique. *)
9+
(* *)
10+
(* All rights reserved. This file is distributed under the terms of *)
11+
(* the GNU Lesser General Public License version 2.1, with the *)
12+
(* special exception on linking described in the file LICENSE. *)
13+
(* *)
14+
(**************************************************************************)
15+
16+
(* Instruction selection for the AMD64 *)
17+
18+
[@@@ocaml.warning "+a-4-9-40-41-42"]
19+
20+
(* note: no `open! Int_replace_polymorphic_compare` as the module is about to be
21+
deleted. *)
22+
23+
open Arch
24+
open Selection_utils
25+
26+
let pseudoregs_for_operation op arg res =
27+
match (op : Mach.operation) with
28+
(* Two-address binary operations: arg.(0) and res.(0) must be the same *)
29+
| Iintop (Iadd | Isub | Imul | Iand | Ior | Ixor)
30+
| Ifloatop ((Float32 | Float64), (Iaddf | Isubf | Imulf | Idivf)) ->
31+
[| res.(0); arg.(1) |], res
32+
| Iintop_atomic { op = Compare_set; size = _; addr = _ } ->
33+
(* first arg must be rax *)
34+
let arg = Array.copy arg in
35+
arg.(0) <- rax;
36+
arg, res
37+
| Iintop_atomic { op = Compare_exchange; size = _; addr = _ } ->
38+
(* first arg must be rax, res.(0) must be rax. *)
39+
let arg = Array.copy arg in
40+
arg.(0) <- rax;
41+
arg, [| rax |]
42+
| Iintop_atomic { op = Exchange | Fetch_and_add; size = _; addr = _ } ->
43+
(* first arg must be the same as res.(0) *)
44+
let arg = Array.copy arg in
45+
arg.(0) <- res.(0);
46+
arg, res
47+
(* One-address unary operations: arg.(0) and res.(0) must be the same *)
48+
| Iintop_imm ((Iadd | Isub | Imul | Iand | Ior | Ixor | Ilsl | Ilsr | Iasr), _)
49+
| Ifloatop ((Float64 | Float32), (Iabsf | Inegf))
50+
| Ispecific (Ibswap { bitwidth = Thirtytwo | Sixtyfour }) ->
51+
res, res
52+
(* For xchg, args must be a register allowing access to high 8 bit register
53+
(rax, rbx, rcx or rdx). Keep it simple, just force the argument in rax. *)
54+
| Ispecific (Ibswap { bitwidth = Sixteen }) -> [| rax |], [| rax |]
55+
(* For imulh, first arg must be in rax, rax is clobbered, and result is in
56+
rdx. *)
57+
| Iintop (Imulh _) -> [| rax; arg.(1) |], [| rdx |]
58+
| Ispecific (Ifloatarithmem (_, _, _)) ->
59+
let arg' = Array.copy arg in
60+
arg'.(0) <- res.(0);
61+
arg', res
62+
(* For shifts with variable shift count, second arg must be in rcx *)
63+
| Iintop (Ilsl | Ilsr | Iasr) -> [| res.(0); rcx |], res
64+
(* For div and mod, first arg must be in rax, rdx is clobbered, and result is
65+
in rax or rdx respectively. Keep it simple, just force second argument in
66+
rcx. *)
67+
| Iintop Idiv -> [| rax; rcx |], [| rax |]
68+
| Iintop Imod -> [| rax; rcx |], [| rdx |]
69+
| Ifloatop (Float64, Icompf cond) ->
70+
(* CR gyorsh: make this optimization as a separate PR. *)
71+
(* We need to temporarily store the result of the comparison in a float
72+
register, but we don't want to clobber any of the inputs if they would
73+
still be live after this operation -- so we add a fresh register as both
74+
an input and output. We don't use [destroyed_at_oper], because that
75+
forces us to choose a fixed register, which makes it more likely an extra
76+
mov would be added to transfer the argument to the fixed register. *)
77+
let treg = Reg.create Float in
78+
let _, is_swapped = float_cond_and_need_swap cond in
79+
( (if is_swapped then [| arg.(0); treg |] else [| treg; arg.(1) |]),
80+
[| res.(0); treg |] )
81+
| Ifloatop (Float32, Icompf cond) ->
82+
let treg = Reg.create Float32 in
83+
let _, is_swapped = float_cond_and_need_swap cond in
84+
( (if is_swapped then [| arg.(0); treg |] else [| treg; arg.(1) |]),
85+
[| res.(0); treg |] )
86+
| Ispecific Irdpmc ->
87+
(* For rdpmc instruction, the argument must be in ecx and the result is in
88+
edx (high) and eax (low). Make it simple and force the argument in rcx,
89+
and treat rax and rdx as clobbered *)
90+
[| rcx |], res
91+
| Ispecific (Isimd op) ->
92+
Simd_selection.pseudoregs_for_operation
93+
(Simd_proc.register_behavior op)
94+
arg res
95+
| Ispecific (Isimd_mem (op, _addr)) ->
96+
Simd_selection.pseudoregs_for_operation
97+
(Simd_proc.Mem.register_behavior op)
98+
arg res
99+
| Icsel _ ->
100+
(* last arg must be the same as res.(0) *)
101+
let len = Array.length arg in
102+
let arg = Array.copy arg in
103+
arg.(len - 1) <- res.(0);
104+
arg, res
105+
(* Other instructions are regular *)
106+
| Iintop_atomic { op = Add | Sub | Land | Lor | Lxor; _ }
107+
| Iintop (Ipopcnt | Iclz _ | Ictz _ | Icomp _)
108+
| Iintop_imm ((Imulh _ | Idiv | Imod | Icomp _ | Ipopcnt | Iclz _ | Ictz _), _)
109+
| Ispecific
110+
( Isextend32 | Izextend32 | Ilea _
111+
| Istore_int (_, _, _)
112+
| Ipause | Ilfence | Isfence | Imfence
113+
| Ioffset_loc (_, _)
114+
| Irdtsc | Icldemote _ | Iprefetch _ )
115+
| Imove | Ispill | Ireload | Ireinterpret_cast _ | Istatic_cast _
116+
| Iconst_int _ | Iconst_float32 _ | Iconst_float _ | Iconst_vec128 _
117+
| Iconst_symbol _ | Icall_ind | Icall_imm _ | Itailcall_ind | Itailcall_imm _
118+
| Iextcall _ | Istackoffset _ | Iload _
119+
| Istore (_, _, _)
120+
| Ialloc _ | Iname_for_debugger _ | Iprobe _ | Iprobe_is_enabled _ | Iopaque
121+
| Ibeginregion | Iendregion | Ipoll _ | Idls_get ->
122+
raise Use_default
123+
124+
(* The selector class *)
125+
126+
class selector =
127+
object (self)
128+
inherit Selectgen.selector_generic as super
129+
130+
method! is_immediate op n =
131+
match op with
132+
| Iadd | Isub | Imul | Iand | Ior | Ixor | Icomp _ -> is_immediate n
133+
| _ -> super#is_immediate op n
134+
135+
method is_immediate_test _cmp n = is_immediate n
136+
137+
method! is_simple_expr e =
138+
match e with
139+
| Cop (Cextcall { func = fn }, args, _) when List.mem fn inline_ops ->
140+
(* inlined ops are simple if their arguments are *)
141+
List.for_all self#is_simple_expr args
142+
| _ -> super#is_simple_expr e
143+
144+
method! effects_of e =
145+
match e with
146+
| Cop (Cextcall { func = fn }, args, _) when List.mem fn inline_ops ->
147+
Select_utils.Effect_and_coeffect.join_list_map args self#effects_of
148+
| _ -> super#effects_of e
149+
150+
method select_addressing _chunk exp =
151+
let a, d = select_addr exp in
152+
(* PR#4625: displacement must be a signed 32-bit immediate *)
153+
if not (is_immediate d)
154+
then Iindexed 0, exp
155+
else
156+
match a with
157+
| Asymbol s ->
158+
let glob : Arch.sym_global =
159+
match s.sym_global with Global -> Global | Local -> Local
160+
in
161+
Ibased (s.sym_name, glob, d), Ctuple []
162+
| Alinear e -> Iindexed d, e
163+
| Aadd (e1, e2) -> Iindexed2 d, Ctuple [e1; e2]
164+
| Ascale (e, scale) -> Iscaled (scale, d), e
165+
| Ascaledadd (e1, e2, scale) ->
166+
Iindexed2scaled (scale, d), Ctuple [e1; e2]
167+
168+
method! select_store is_assign addr exp =
169+
match exp with
170+
| Cconst_int (n, _dbg) when is_immediate n ->
171+
Ispecific (Istore_int (Nativeint.of_int n, addr, is_assign)), Ctuple []
172+
| Cconst_natint (n, _dbg) when is_immediate_natint n ->
173+
Ispecific (Istore_int (n, addr, is_assign)), Ctuple []
174+
| Cconst_int _ | Cconst_vec128 _
175+
| Cconst_natint (_, _)
176+
| Cconst_float32 (_, _)
177+
| Cconst_float (_, _)
178+
| Cconst_symbol (_, _)
179+
| Cvar _
180+
| Clet (_, _, _)
181+
| Clet_mut (_, _, _, _)
182+
| Cphantom_let (_, _, _)
183+
| Cassign (_, _)
184+
| Ctuple _
185+
| Cop (_, _, _)
186+
| Csequence (_, _)
187+
| Cifthenelse (_, _, _, _, _, _, _)
188+
| Cswitch (_, _, _, _, _)
189+
| Ccatch (_, _, _, _)
190+
| Cexit (_, _, _)
191+
| Ctrywith (_, _, _, _, _, _) ->
192+
super#select_store is_assign addr exp
193+
194+
method! select_operation op args dbg =
195+
match op with
196+
(* Recognize the LEA instruction *)
197+
| Caddi | Caddv | Cadda | Csubi | Cor -> (
198+
match self#select_addressing Word_int (Cop (op, args, dbg)) with
199+
| Iindexed _, _ | Iindexed2 0, _ -> super#select_operation op args dbg
200+
| ( ((Iindexed2 _ | Iscaled _ | Iindexed2scaled _ | Ibased _) as addr),
201+
arg ) ->
202+
Ispecific (Ilea addr), [arg])
203+
(* Recognize float arithmetic with memory. *)
204+
| Caddf width ->
205+
self#select_floatarith true width Simple_operation.Iaddf Arch.Ifloatadd
206+
args
207+
| Csubf width ->
208+
self#select_floatarith false width Simple_operation.Isubf Arch.Ifloatsub
209+
args
210+
| Cmulf width ->
211+
self#select_floatarith true width Simple_operation.Imulf Arch.Ifloatmul
212+
args
213+
| Cdivf width ->
214+
self#select_floatarith false width Simple_operation.Idivf Arch.Ifloatdiv
215+
args
216+
| Cpackf32 ->
217+
(* We must operate on registers. This is because if the second argument
218+
was a float stack slot, the resulting UNPCKLPS instruction would
219+
enforce the validity of loading it as a 128-bit memory location, even
220+
though it only loads 64 bits. *)
221+
Ispecific (Isimd (SSE Interleave_low_32_regs)), args
222+
(* Special cases overriding C implementations (regardless of
223+
[@@builtin]). *)
224+
| Cextcall { func = "sqrt" as func; _ }
225+
(* x86 intrinsics ([@@builtin]) *)
226+
| Cextcall { func; builtin = true; _ } -> (
227+
match func with
228+
| "caml_rdtsc_unboxed" -> Ispecific Irdtsc, args
229+
| "caml_rdpmc_unboxed" -> Ispecific Irdpmc, args
230+
| "caml_pause_hint" -> Ispecific Ipause, args
231+
| "caml_load_fence" -> Ispecific Ilfence, args
232+
| "caml_store_fence" -> Ispecific Isfence, args
233+
| "caml_memory_fence" -> Ispecific Imfence, args
234+
| "caml_cldemote" ->
235+
let addr, eloc =
236+
self#select_addressing Word_int (one_arg "cldemote" args)
237+
in
238+
Ispecific (Icldemote addr), [eloc]
239+
| _ -> (
240+
match Simd_selection.select_operation func args with
241+
| Some (op, args) -> op, args
242+
| None -> super#select_operation op args dbg))
243+
(* Recognize store instructions *)
244+
| Cstore (((Word_int | Word_val) as chunk), _init) -> (
245+
match args with
246+
| [loc; Cop (Caddi, [Cop (Cload _, [loc'], _); Cconst_int (n, _dbg)], _)]
247+
when loc = loc' && is_immediate n ->
248+
let addr, arg = self#select_addressing chunk loc in
249+
Ispecific (Ioffset_loc (n, addr)), [arg]
250+
| _ -> super#select_operation op args dbg)
251+
| Cbswap { bitwidth } ->
252+
let bitwidth = select_bitwidth bitwidth in
253+
Ispecific (Ibswap { bitwidth }), args
254+
(* Recognize sign extension *)
255+
| Casr -> (
256+
match args with
257+
| [Cop (Clsl, [k; Cconst_int (32, _)], _); Cconst_int (32, _)] ->
258+
Ispecific Isextend32, [k]
259+
| _ -> super#select_operation op args dbg)
260+
(* Recognize zero extension *)
261+
| Clsr -> (
262+
match args with
263+
| [Cop (Clsl, [k; Cconst_int (32, _)], _); Cconst_int (32, _)] ->
264+
Ispecific Izextend32, [k]
265+
| _ -> super#select_operation op args dbg)
266+
(* Recognize zero extension again *)
267+
| Cand -> (
268+
match args with
269+
| [arg; Cconst_int (0xffff_ffff, _)]
270+
| [arg; Cconst_natint (0xffff_ffffn, _)]
271+
| [Cconst_int (0xffff_ffff, _); arg]
272+
| [Cconst_natint (0xffff_ffffn, _); arg] ->
273+
Ispecific Izextend32, [arg]
274+
| _ -> super#select_operation op args dbg)
275+
| Ccsel _ -> (
276+
match args with
277+
| [cond; ifso; ifnot] -> (
278+
let cond, earg = self#select_condition cond in
279+
match cond with
280+
| Ifloattest (w, CFeq) ->
281+
(* CFeq cannot be represented as cmov without a jump. CFneq emits
282+
cmov for "unordered" and "not equal" cases. Use CFneq and swap the
283+
arguments. *)
284+
Icsel (Ifloattest (w, CFneq)), [earg; ifnot; ifso]
285+
| _ -> Icsel cond, [earg; ifso; ifnot])
286+
| _ -> super#select_operation op args dbg)
287+
| Cprefetch { is_write; locality } ->
288+
(* Emit prefetch for read hint when prefetchw is not supported. Matches
289+
the behavior of gcc's __builtin_prefetch *)
290+
let is_write =
291+
if is_write && not (Arch.Extension.enabled PREFETCHW)
292+
then false
293+
else is_write
294+
in
295+
let locality : Arch.prefetch_temporal_locality_hint =
296+
match select_locality locality with
297+
| Moderate when is_write && not (Arch.Extension.enabled PREFETCHWT1)
298+
->
299+
High
300+
| l -> l
301+
in
302+
let addr, eloc =
303+
self#select_addressing Word_int (one_arg "prefetch" args)
304+
in
305+
Ispecific (Iprefetch { is_write; addr; locality }), [eloc]
306+
| _ -> super#select_operation op args dbg
307+
308+
(* Recognize float arithmetic with mem *)
309+
310+
method select_floatarith commutative width regular_op mem_op args =
311+
let open Cmm in
312+
match width, args with
313+
| ( Float64,
314+
[arg1; Cop (Cload { memory_chunk = Double as chunk; _ }, [loc2], _)] )
315+
| ( Float32,
316+
[ arg1;
317+
Cop
318+
( Cload { memory_chunk = Single { reg = Float32 } as chunk; _ },
319+
[loc2],
320+
_ ) ] ) ->
321+
let addr, arg2 = self#select_addressing chunk loc2 in
322+
Mach.Ispecific (Ifloatarithmem (width, mem_op, addr)), [arg1; arg2]
323+
| ( Float64,
324+
[Cop (Cload { memory_chunk = Double as chunk; _ }, [loc1], _); arg2] )
325+
| ( Float32,
326+
[ Cop
327+
( Cload { memory_chunk = Single { reg = Float32 } as chunk; _ },
328+
[loc1],
329+
_ );
330+
arg2 ] )
331+
when commutative ->
332+
let addr, arg1 = self#select_addressing chunk loc1 in
333+
Mach.Ispecific (Ifloatarithmem (width, mem_op, addr)), [arg2; arg1]
334+
| _, [arg1; arg2] -> Mach.Ifloatop (width, regular_op), [arg1; arg2]
335+
| _ -> assert false
336+
337+
method! mark_c_tailcall = contains_calls := true
338+
339+
(* Deal with register constraints *)
340+
341+
method! insert_op_debug env op dbg rs rd =
342+
try
343+
let rsrc, rdst = pseudoregs_for_operation op rs rd in
344+
self#insert_moves env rs rsrc;
345+
self#insert_debug env (Iop op) dbg rsrc rdst;
346+
self#insert_moves env rdst rd;
347+
rd
348+
with Use_default -> super#insert_op_debug env op dbg rs rd
349+
end
350+
351+
let fundecl ~future_funcnames f =
352+
(new selector)#emit_fundecl ~future_funcnames f

backend/amd64/selection_utils.ml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,13 @@ let rec select_addr exp =
7676
| ( ((Asymbol _ | Aadd (_, _) | Ascaledadd (_, _, _)), _),
7777
((Asymbol _ | Alinear _ | Aadd (_, _) | Ascaledadd (_, _, _)), _) ) ->
7878
Aadd (arg1, arg2), 0)
79+
| Cmm.Cop (Cor, [arg; Cconst_int (1, _)], _)
80+
| Cmm.Cop (Cor, [Cconst_int (1, _); arg], _) -> (
81+
(* optimize tagging integers *)
82+
match select_addr arg with
83+
| Ascale (e, scale), off when scale mod 2 = 0 ->
84+
Ascale (e, scale), off lor 1
85+
| _ -> default)
7986
| _ -> default
8087

8188
(* Special constraints on operand and result registers *)

0 commit comments

Comments
 (0)