|
| 1 | +(**************************************************************************) |
| 2 | +(* *) |
| 3 | +(* OCaml *) |
| 4 | +(* *) |
| 5 | +(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) |
| 6 | +(* *) |
| 7 | +(* Copyright 2000 Institut National de Recherche en Informatique et *) |
| 8 | +(* en Automatique. *) |
| 9 | +(* *) |
| 10 | +(* All rights reserved. This file is distributed under the terms of *) |
| 11 | +(* the GNU Lesser General Public License version 2.1, with the *) |
| 12 | +(* special exception on linking described in the file LICENSE. *) |
| 13 | +(* *) |
| 14 | +(**************************************************************************) |
| 15 | + |
| 16 | +(* Instruction selection for the AMD64 *) |
| 17 | + |
| 18 | +[@@@ocaml.warning "+a-4-9-40-41-42"] |
| 19 | + |
| 20 | +(* note: no `open! Int_replace_polymorphic_compare` as the module is about to be |
| 21 | + deleted. *) |
| 22 | + |
| 23 | +open Arch |
| 24 | +open Selection_utils |
| 25 | + |
(* [pseudoregs_for_operation op arg res] returns the pair [(srcs, dsts)] of
   register arrays that the instruction selected for [op] must read and write
   in place of its nominal arguments [arg] and results [res]. Neither input
   array is mutated: constrained argument arrays are built from copies.

   Raises [Use_default] when [op] needs no special register constraint. *)
let pseudoregs_for_operation op arg res =
  (* A copy of [arg] in which slot [idx] has been replaced by [reg]. *)
  let arg_with idx reg =
    let constrained = Array.copy arg in
    constrained.(idx) <- reg;
    constrained
  in
  (* Constraints shared by the float comparisons of both widths. The result of
     the comparison is temporarily held in a float register, and the inputs
     must not be clobbered if they are still live afterwards, so the fresh
     register [scratch] is threaded through as both an input and an output.
     [destroyed_at_oper] is not used because it would force a fixed register,
     making an extra mov into that register more likely. *)
  let float_compare scratch cond =
    let _, is_swapped = float_cond_and_need_swap cond in
    let srcs =
      if is_swapped then [| arg.(0); scratch |] else [| scratch; arg.(1) |]
    in
    srcs, [| res.(0); scratch |]
  in
  match (op : Mach.operation) with
  (* Two-address binary operations: the first source is also the
     destination. *)
  | Iintop (Iadd | Isub | Imul | Iand | Ior | Ixor)
  | Ifloatop ((Float32 | Float64), (Iaddf | Isubf | Imulf | Idivf)) ->
    [| res.(0); arg.(1) |], res
  (* Compare-and-set: the first argument must live in rax. *)
  | Iintop_atomic { op = Compare_set; size = _; addr = _ } ->
    arg_with 0 rax, res
  (* Compare-and-exchange: the first argument and the result both live in
     rax. *)
  | Iintop_atomic { op = Compare_exchange; size = _; addr = _ } ->
    arg_with 0 rax, [| rax |]
  (* Exchange and fetch-and-add: the first argument shares a register with the
     result. *)
  | Iintop_atomic { op = Exchange | Fetch_and_add; size = _; addr = _ } ->
    arg_with 0 res.(0), res
  (* One-address unary operations: source and destination coincide. *)
  | Iintop_imm ((Iadd | Isub | Imul | Iand | Ior | Ixor | Ilsl | Ilsr | Iasr), _)
  | Ifloatop ((Float64 | Float32), (Iabsf | Inegf))
  | Ispecific (Ibswap { bitwidth = Thirtytwo | Sixtyfour }) ->
    res, res
  (* The 16-bit byte swap goes through xchg, whose operand must be a register
     giving access to its high 8-bit half (rax, rbx, rcx or rdx). Keep it
     simple and pin the operand to rax. *)
  | Ispecific (Ibswap { bitwidth = Sixteen }) -> [| rax |], [| rax |]
  (* imulh: first argument in rax, rax is clobbered, result delivered in
     rdx. *)
  | Iintop (Imulh _) -> [| rax; arg.(1) |], [| rdx |]
  (* Float arithmetic with a memory operand: the first argument shares a
     register with the result. *)
  | Ispecific (Ifloatarithmem (_, _, _)) -> arg_with 0 res.(0), res
  (* Shifts by a variable amount: the count must be in rcx. *)
  | Iintop (Ilsl | Ilsr | Iasr) -> [| res.(0); rcx |], res
  (* div and mod: dividend in rax, rdx is clobbered, and the result comes back
     in rax (div) or rdx (mod). Keep it simple and force the divisor into
     rcx. *)
  | Iintop Idiv -> [| rax; rcx |], [| rax |]
  | Iintop Imod -> [| rax; rcx |], [| rdx |]
  (* CR gyorsh: make this optimization as a separate PR. *)
  | Ifloatop (Float64, Icompf cond) -> float_compare (Reg.create Float) cond
  | Ifloatop (Float32, Icompf cond) -> float_compare (Reg.create Float32) cond
  (* rdpmc: the argument must be in ecx and the result is produced in edx
     (high) and eax (low). Keep it simple: force the argument into rcx, with
     rax and rdx clobbered. *)
  | Ispecific Irdpmc -> [| rcx |], res
  (* SIMD operations delegate to the SIMD selection module, driven by the
     register behavior reported for the instruction. *)
  | Ispecific (Isimd op) ->
    Simd_selection.pseudoregs_for_operation
      (Simd_proc.register_behavior op)
      arg res
  | Ispecific (Isimd_mem (op, _addr)) ->
    Simd_selection.pseudoregs_for_operation
      (Simd_proc.Mem.register_behavior op)
      arg res
  (* Conditional select: the last argument shares a register with the
     result. *)
  | Icsel _ -> arg_with (Array.length arg - 1) res.(0), res
  (* Every remaining instruction is regular. The cases are spelled out rather
     than collapsed into a wildcard. *)
  | Iintop_atomic { op = Add | Sub | Land | Lor | Lxor; _ }
  | Iintop (Ipopcnt | Iclz _ | Ictz _ | Icomp _)
  | Iintop_imm ((Imulh _ | Idiv | Imod | Icomp _ | Ipopcnt | Iclz _ | Ictz _), _)
  | Ispecific
      ( Isextend32 | Izextend32 | Ilea _
      | Istore_int (_, _, _)
      | Ipause | Ilfence | Isfence | Imfence
      | Ioffset_loc (_, _)
      | Irdtsc | Icldemote _ | Iprefetch _ )
  | Imove | Ispill | Ireload | Ireinterpret_cast _ | Istatic_cast _
  | Iconst_int _ | Iconst_float32 _ | Iconst_float _ | Iconst_vec128 _
  | Iconst_symbol _ | Icall_ind | Icall_imm _ | Itailcall_ind | Itailcall_imm _
  | Iextcall _ | Istackoffset _ | Iload _
  | Istore (_, _, _)
  | Ialloc _ | Iname_for_debugger _ | Iprobe _ | Iprobe_is_enabled _ | Iopaque
  | Ibeginregion | Iendregion | Ipoll _ | Idls_get ->
    raise Use_default
| 123 | + |
| 124 | +(* The selector class *) |
| 125 | + |
class selector =
  object (self)
    inherit Selectgen.selector_generic as super

    (* The integer operations listed here accept an immediate operand whenever
       the constant passes [is_immediate]; all other operations are left to
       the generic selector. *)
    method! is_immediate op imm =
      match op with
      | Iadd | Isub | Imul | Iand | Ior | Ixor | Icomp _ -> is_immediate imm
      | _ -> super#is_immediate op imm

    (* The comparison is ignored: only the constant decides. *)
    method is_immediate_test _cmp imm = is_immediate imm

    method! is_simple_expr expr =
      match expr with
      | Cop (Cextcall { func; _ }, args, _) when List.mem func inline_ops ->
        (* An inlined external is simple exactly when all its arguments
           are. *)
        List.for_all (fun arg -> self#is_simple_expr arg) args
      | _ -> super#is_simple_expr expr

    method! effects_of expr =
      match expr with
      | Cop (Cextcall { func; _ }, args, _) when List.mem func inline_ops ->
        (* The effects of an inlined external are the join of the effects of
           its arguments. *)
        Select_utils.Effect_and_coeffect.join_list_map args (fun arg ->
            self#effects_of arg)
      | _ -> super#effects_of expr

    (* Pick an addressing mode for [exp], returning the mode together with the
       expression that remains to be evaluated. The chunk is ignored. *)
    method select_addressing _chunk exp =
      let shape, displ = select_addr exp in
      if is_immediate displ
      then (
        match shape with
        | Asymbol s ->
          let glob : Arch.sym_global =
            match s.sym_global with Global -> Global | Local -> Local
          in
          Ibased (s.sym_name, glob, displ), Ctuple []
        | Alinear e -> Iindexed displ, e
        | Aadd (e1, e2) -> Iindexed2 displ, Ctuple [e1; e2]
        | Ascale (e, scale) -> Iscaled (scale, displ), e
        | Ascaledadd (e1, e2, scale) ->
          Iindexed2scaled (scale, displ), Ctuple [e1; e2])
      else
        (* PR#4625: the displacement must be a signed 32-bit immediate.
           Otherwise evaluate the whole expression and address it with a zero
           offset. *)
        (Iindexed 0, exp)

    (* Stores of a constant that passes the immediate test become
       [Istore_int], which leaves no value operand to evaluate. Everything
       else is handled by the generic selector. *)
    method! select_store is_assign addr exp =
      match exp with
      | Cconst_int (imm, _dbg) when is_immediate imm ->
        Ispecific (Istore_int (Nativeint.of_int imm, addr, is_assign)), Ctuple []
      | Cconst_natint (imm, _dbg) when is_immediate_natint imm ->
        Ispecific (Istore_int (imm, addr, is_assign)), Ctuple []
      | Cconst_int _ | Cconst_vec128 _
      | Cconst_natint (_, _)
      | Cconst_float32 (_, _)
      | Cconst_float (_, _)
      | Cconst_symbol (_, _)
      | Cvar _
      | Clet (_, _, _)
      | Clet_mut (_, _, _, _)
      | Cphantom_let (_, _, _)
      | Cassign (_, _)
      | Ctuple _
      | Cop (_, _, _)
      | Csequence (_, _)
      | Cifthenelse (_, _, _, _, _, _, _)
      | Cswitch (_, _, _, _, _)
      | Ccatch (_, _, _, _)
      | Cexit (_, _, _)
      | Ctrywith (_, _, _, _, _, _) ->
        super#select_store is_assign addr exp

    (* AMD64-specific operation selection. Each arm recognizes one pattern;
       whatever is not recognized falls back to the generic selector. *)
    method! select_operation op args dbg =
      match op with
      (* LEA: try to view the whole operation as an address computation. The
         plain indexed modes and a two-register sum with zero displacement are
         left to the generic selector. *)
      | Caddi | Caddv | Cadda | Csubi | Cor -> (
        match self#select_addressing Word_int (Cop (op, args, dbg)) with
        | Iindexed _, _ | Iindexed2 0, _ -> super#select_operation op args dbg
        | ( ((Iindexed2 _ | Iscaled _ | Iindexed2scaled _ | Ibased _) as mode),
            operand ) ->
          Ispecific (Ilea mode), [operand])
      (* Float arithmetic, possibly with a memory operand. The boolean tells
         [select_floatarith] whether the operation is commutative. *)
      | Caddf width ->
        self#select_floatarith true width Simple_operation.Iaddf Arch.Ifloatadd
          args
      | Csubf width ->
        self#select_floatarith false width Simple_operation.Isubf Arch.Ifloatsub
          args
      | Cmulf width ->
        self#select_floatarith true width Simple_operation.Imulf Arch.Ifloatmul
          args
      | Cdivf width ->
        self#select_floatarith false width Simple_operation.Idivf Arch.Ifloatdiv
          args
      | Cpackf32 ->
        (* Both operands must be registers: if the second one were a float
           stack slot, the resulting UNPCKLPS instruction would enforce the
           validity of loading it as a 128-bit memory location, even though
           it only loads 64 bits. *)
        Ispecific (Isimd (SSE Interleave_low_32_regs)), args
      (* [sqrt] overrides its C implementation regardless of [@@builtin]; the
         second alternative covers the x86 intrinsics marked [@@builtin]. *)
      | Cextcall { func = "sqrt" as func; _ }
      | Cextcall { func; builtin = true; _ } -> (
        match func with
        | "caml_rdtsc_unboxed" -> Ispecific Irdtsc, args
        | "caml_rdpmc_unboxed" -> Ispecific Irdpmc, args
        | "caml_pause_hint" -> Ispecific Ipause, args
        | "caml_load_fence" -> Ispecific Ilfence, args
        | "caml_store_fence" -> Ispecific Isfence, args
        | "caml_memory_fence" -> Ispecific Imfence, args
        | "caml_cldemote" ->
          let mode, location =
            self#select_addressing Word_int (one_arg "cldemote" args)
          in
          Ispecific (Icldemote mode), [location]
        | _ -> (
          (* Not one of the names above: give the SIMD selector a chance
             before falling back to the generic selector. *)
          match Simd_selection.select_operation func args with
          | Some (simd_op, simd_args) -> simd_op, simd_args
          | None -> super#select_operation op args dbg))
      (* A word-sized store of [load loc + n] back into the same [loc], with
         [n] an immediate, becomes an in-place [Ioffset_loc]. *)
      | Cstore (((Word_int | Word_val) as chunk), _init) -> (
        match args with
        | [ loc;
            Cop (Caddi, [Cop (Cload _, [loc'], _); Cconst_int (delta, _dbg)], _)
          ]
          when loc = loc' && is_immediate delta ->
          let mode, location = self#select_addressing chunk loc in
          Ispecific (Ioffset_loc (delta, mode)), [location]
        | _ -> super#select_operation op args dbg)
      | Cbswap { bitwidth } ->
        let bitwidth = select_bitwidth bitwidth in
        Ispecific (Ibswap { bitwidth }), args
      (* Sign extension: left shift by 32 followed by arithmetic right shift
         by 32. *)
      | Casr -> (
        match args with
        | [Cop (Clsl, [operand; Cconst_int (32, _)], _); Cconst_int (32, _)] ->
          Ispecific Isextend32, [operand]
        | _ -> super#select_operation op args dbg)
      (* Zero extension: left shift by 32 followed by logical right shift by
         32. *)
      | Clsr -> (
        match args with
        | [Cop (Clsl, [operand; Cconst_int (32, _)], _); Cconst_int (32, _)] ->
          Ispecific Izextend32, [operand]
        | _ -> super#select_operation op args dbg)
      (* Zero extension again: masking with 0xffff_ffff on either side. *)
      | Cand -> (
        match args with
        | [operand; Cconst_int (0xffff_ffff, _)]
        | [operand; Cconst_natint (0xffff_ffffn, _)]
        | [Cconst_int (0xffff_ffff, _); operand]
        | [Cconst_natint (0xffff_ffffn, _); operand] ->
          Ispecific Izextend32, [operand]
        | _ -> super#select_operation op args dbg)
      | Ccsel _ -> (
        match args with
        | [cond; ifso; ifnot] -> (
          let test, tested = self#select_condition cond in
          match test with
          | Ifloattest (w, CFeq) ->
            (* CFeq cannot be represented as cmov without a jump. CFneq emits
               cmov for the "unordered" and "not equal" cases, so test CFneq
               instead and swap the two branches. *)
            Icsel (Ifloattest (w, CFneq)), [tested; ifnot; ifso]
          | _ -> Icsel test, [tested; ifso; ifnot])
        | _ -> super#select_operation op args dbg)
      | Cprefetch { is_write; locality } ->
        (* Emit a prefetch-for-read hint when prefetchw is not supported.
           Matches the behavior of gcc's __builtin_prefetch. *)
        let is_write = is_write && Arch.Extension.enabled PREFETCHW in
        let locality : Arch.prefetch_temporal_locality_hint =
          match select_locality locality with
          | Moderate when is_write && not (Arch.Extension.enabled PREFETCHWT1)
            ->
            High
          | hint -> hint
        in
        let addr, location =
          self#select_addressing Word_int (one_arg "prefetch" args)
        in
        Ispecific (Iprefetch { is_write; addr; locality }), [location]
      | _ -> super#select_operation op args dbg

    (* Float arithmetic with a memory operand. When the second argument is a
       load of the matching width, the load is folded into the instruction.
       When the first argument is such a load and the operation is
       [commutative], the operands are swapped so the load can still be
       folded. Otherwise the plain register operation [regular_op] is used.
       Exactly two arguments are expected. *)
    method select_floatarith commutative width regular_op mem_op args =
      let open Cmm in
      match width, args with
      | ( Float64,
          [other; Cop (Cload { memory_chunk = Double as chunk; _ }, [loc], _)] )
      | ( Float32,
          [ other;
            Cop
              ( Cload { memory_chunk = Single { reg = Float32 } as chunk; _ },
                [loc],
                _ ) ] ) ->
        let mode, location = self#select_addressing chunk loc in
        Mach.Ispecific (Ifloatarithmem (width, mem_op, mode)), [other; location]
      | ( Float64,
          [Cop (Cload { memory_chunk = Double as chunk; _ }, [loc], _); other] )
      | ( Float32,
          [ Cop
              ( Cload { memory_chunk = Single { reg = Float32 } as chunk; _ },
                [loc],
                _ );
            other ] )
        when commutative ->
        let mode, location = self#select_addressing chunk loc in
        Mach.Ispecific (Ifloatarithmem (width, mem_op, mode)), [other; location]
      | _, [lhs; rhs] -> Mach.Ifloatop (width, regular_op), [lhs; rhs]
      | _ -> assert false

    method! mark_c_tailcall = contains_calls := true

    (* Deal with register constraints: move the sources into the registers
       required by [pseudoregs_for_operation], emit the operation on those
       registers, then move the results back out. Operations with no special
       constraint signal [Use_default] and go through the generic path. The
       handler covers the whole sequence, not only the constraint lookup. *)
    method! insert_op_debug env op dbg rs rd =
      try
        let srcs, dsts = pseudoregs_for_operation op rs rd in
        self#insert_moves env rs srcs;
        self#insert_debug env (Iop op) dbg srcs dsts;
        self#insert_moves env dsts rd;
        rd
      with Use_default -> super#insert_op_debug env op dbg rs rd
  end
| 350 | + |
(* Run instruction selection on the function declaration [f], using a fresh
   [selector] object for each call. *)
let fundecl ~future_funcnames f =
  let sel = new selector in
  sel#emit_fundecl ~future_funcnames f
0 commit comments