diff --git a/backend/amd64/cfg_selection.ml b/backend/amd64/cfg_selection.ml index 776b648f406..6b3252a2cea 100644 --- a/backend/amd64/cfg_selection.ml +++ b/backend/amd64/cfg_selection.ml @@ -194,7 +194,7 @@ class selector = method! select_operation op args dbg ~label_after = match op with (* Recognize the LEA instruction *) - | Caddi | Caddv | Cadda | Csubi -> ( + | Caddi | Caddv | Cadda | Csubi | Cor -> ( match self#select_addressing Word_int (Cop (op, args, dbg)) with | Iindexed _, _ | Iindexed2 0, _ -> super#select_operation op args dbg ~label_after @@ -252,13 +252,18 @@ class selector = | Cbswap { bitwidth } -> let bitwidth = select_bitwidth bitwidth in specific (Ibswap { bitwidth }), args - (* Recognize sign extension *) | Casr -> ( + (* Recognize sign extension *) match args with | [Cop (Clsl, [k; Cconst_int (32, _)], _); Cconst_int (32, _)] -> specific Isextend32, [k] | _ -> super#select_operation op args dbg ~label_after) (* Recognize zero extension *) + | Clsr -> ( + match args with + | [Cop (Clsl, [k; Cconst_int (32, _)], _); Cconst_int (32, _)] -> + specific Izextend32, [k] + | _ -> super#select_operation op args dbg ~label_after) | Cand -> ( match args with | [arg; Cconst_int (0xffff_ffff, _)] diff --git a/backend/amd64/selection.ml b/backend/amd64/selection.ml new file mode 100644 index 00000000000..fcf75f5b527 --- /dev/null +++ b/backend/amd64/selection.ml @@ -0,0 +1,352 @@ +(**************************************************************************) +(* *) +(* OCaml *) +(* *) +(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) +(* *) +(* Copyright 2000 Institut National de Recherche en Informatique et *) +(* en Automatique. *) +(* *) +(* All rights reserved. This file is distributed under the terms of *) +(* the GNU Lesser General Public License version 2.1, with the *) +(* special exception on linking described in the file LICENSE. *) +(* *) +(**************************************************************************) + +(* Instruction selection for the AMD64 *) + +[@@@ocaml.warning "+a-4-9-40-41-42"] + +(* note: no `open! Int_replace_polymorphic_compare` as the module is about to be + deleted. *) + +open Arch +open Selection_utils + +let pseudoregs_for_operation op arg res = + match (op : Mach.operation) with + (* Two-address binary operations: arg.(0) and res.(0) must be the same *) + | Iintop (Iadd | Isub | Imul | Iand | Ior | Ixor) + | Ifloatop ((Float32 | Float64), (Iaddf | Isubf | Imulf | Idivf)) -> + [| res.(0); arg.(1) |], res + | Iintop_atomic { op = Compare_set; size = _; addr = _ } -> + (* first arg must be rax *) + let arg = Array.copy arg in + arg.(0) <- rax; + arg, res + | Iintop_atomic { op = Compare_exchange; size = _; addr = _ } -> + (* first arg must be rax, res.(0) must be rax. *) + let arg = Array.copy arg in + arg.(0) <- rax; + arg, [| rax |] + | Iintop_atomic { op = Exchange | Fetch_and_add; size = _; addr = _ } -> + (* first arg must be the same as res.(0) *) + let arg = Array.copy arg in + arg.(0) <- res.(0); + arg, res + (* One-address unary operations: arg.(0) and res.(0) must be the same *) + | Iintop_imm ((Iadd | Isub | Imul | Iand | Ior | Ixor | Ilsl | Ilsr | Iasr), _) + | Ifloatop ((Float64 | Float32), (Iabsf | Inegf)) + | Ispecific (Ibswap { bitwidth = Thirtytwo | Sixtyfour }) -> + res, res + (* For xchg, args must be a register allowing access to high 8 bit register + (rax, rbx, rcx or rdx). Keep it simple, just force the argument in rax. 
*) + | Ispecific (Ibswap { bitwidth = Sixteen }) -> [| rax |], [| rax |] + (* For imulh, first arg must be in rax, rax is clobbered, and result is in + rdx. *) + | Iintop (Imulh _) -> [| rax; arg.(1) |], [| rdx |] + | Ispecific (Ifloatarithmem (_, _, _)) -> + let arg' = Array.copy arg in + arg'.(0) <- res.(0); + arg', res + (* For shifts with variable shift count, second arg must be in rcx *) + | Iintop (Ilsl | Ilsr | Iasr) -> [| res.(0); rcx |], res + (* For div and mod, first arg must be in rax, rdx is clobbered, and result is + in rax or rdx respectively. Keep it simple, just force second argument in + rcx. *) + | Iintop Idiv -> [| rax; rcx |], [| rax |] + | Iintop Imod -> [| rax; rcx |], [| rdx |] + | Ifloatop (Float64, Icompf cond) -> + (* CR gyorsh: make this optimization as a separate PR. *) + (* We need to temporarily store the result of the comparison in a float + register, but we don't want to clobber any of the inputs if they would + still be live after this operation -- so we add a fresh register as both + an input and output. We don't use [destroyed_at_oper], because that + forces us to choose a fixed register, which makes it more likely an extra + mov would be added to transfer the argument to the fixed register. *) + let treg = Reg.create Float in + let _, is_swapped = float_cond_and_need_swap cond in + ( (if is_swapped then [| arg.(0); treg |] else [| treg; arg.(1) |]), + [| res.(0); treg |] ) + | Ifloatop (Float32, Icompf cond) -> + let treg = Reg.create Float32 in + let _, is_swapped = float_cond_and_need_swap cond in + ( (if is_swapped then [| arg.(0); treg |] else [| treg; arg.(1) |]), + [| res.(0); treg |] ) + | Ispecific Irdpmc -> + (* For rdpmc instruction, the argument must be in ecx and the result is in + edx (high) and eax (low). Make it simple and force the argument in rcx, + and rax and rdx clobbered *) + [| rcx |], res + | Ispecific (Isimd op) -> + Simd_selection.pseudoregs_for_operation + (Simd_proc.register_behavior op) + arg res + | Ispecific (Isimd_mem (op, _addr)) -> + Simd_selection.pseudoregs_for_operation + (Simd_proc.Mem.register_behavior op) + arg res + | Icsel _ -> + (* last arg must be the same as res.(0) *) + let len = Array.length arg in + let arg = Array.copy arg in + arg.(len - 1) <- res.(0); + arg, res + (* Other instructions are regular *) + | Iintop_atomic { op = Add | Sub | Land | Lor | Lxor; _ } + | Iintop (Ipopcnt | Iclz _ | Ictz _ | Icomp _) + | Iintop_imm ((Imulh _ | Idiv | Imod | Icomp _ | Ipopcnt | Iclz _ | Ictz _), _) + | Ispecific + ( Isextend32 | Izextend32 | Ilea _ + | Istore_int (_, _, _) + | Ipause | Ilfence | Isfence | Imfence + | Ioffset_loc (_, _) + | Irdtsc | Icldemote _ | Iprefetch _ ) + | Imove | Ispill | Ireload | Ireinterpret_cast _ | Istatic_cast _ + | Iconst_int _ | Iconst_float32 _ | Iconst_float _ | Iconst_vec128 _ + | Iconst_symbol _ | Icall_ind | Icall_imm _ | Itailcall_ind | Itailcall_imm _ + | Iextcall _ | Istackoffset _ | Iload _ + | Istore (_, _, _) + | Ialloc _ | Iname_for_debugger _ | Iprobe _ | Iprobe_is_enabled _ | Iopaque + | Ibeginregion | Iendregion | Ipoll _ | Idls_get -> + raise Use_default + +(* The selector class *) + +class selector = + object (self) + inherit Selectgen.selector_generic as super + + method! is_immediate op n = + match op with + | Iadd | Isub | Imul | Iand | Ior | Ixor | Icomp _ -> is_immediate n + | _ -> super#is_immediate op n + + method is_immediate_test _cmp n = is_immediate n + + method! 
is_simple_expr e = + match e with + | Cop (Cextcall { func = fn }, args, _) when List.mem fn inline_ops -> + (* inlined ops are simple if their arguments are *) + List.for_all self#is_simple_expr args + | _ -> super#is_simple_expr e + + method! effects_of e = + match e with + | Cop (Cextcall { func = fn }, args, _) when List.mem fn inline_ops -> + Select_utils.Effect_and_coeffect.join_list_map args self#effects_of + | _ -> super#effects_of e + + method select_addressing _chunk exp = + let a, d = select_addr exp in + (* PR#4625: displacement must be a signed 32-bit immediate *) + if not (is_immediate d) + then Iindexed 0, exp + else + match a with + | Asymbol s -> + let glob : Arch.sym_global = + match s.sym_global with Global -> Global | Local -> Local + in + Ibased (s.sym_name, glob, d), Ctuple [] + | Alinear e -> Iindexed d, e + | Aadd (e1, e2) -> Iindexed2 d, Ctuple [e1; e2] + | Ascale (e, scale) -> Iscaled (scale, d), e + | Ascaledadd (e1, e2, scale) -> + Iindexed2scaled (scale, d), Ctuple [e1; e2] + + method! select_store is_assign addr exp = + match exp with + | Cconst_int (n, _dbg) when is_immediate n -> + Ispecific (Istore_int (Nativeint.of_int n, addr, is_assign)), Ctuple [] + | Cconst_natint (n, _dbg) when is_immediate_natint n -> + Ispecific (Istore_int (n, addr, is_assign)), Ctuple [] + | Cconst_int _ | Cconst_vec128 _ + | Cconst_natint (_, _) + | Cconst_float32 (_, _) + | Cconst_float (_, _) + | Cconst_symbol (_, _) + | Cvar _ + | Clet (_, _, _) + | Clet_mut (_, _, _, _) + | Cphantom_let (_, _, _) + | Cassign (_, _) + | Ctuple _ + | Cop (_, _, _) + | Csequence (_, _) + | Cifthenelse (_, _, _, _, _, _, _) + | Cswitch (_, _, _, _, _) + | Ccatch (_, _, _, _) + | Cexit (_, _, _) + | Ctrywith (_, _, _, _, _, _) -> + super#select_store is_assign addr exp + + method! select_operation op args dbg = + match op with + (* Recognize the LEA instruction *) + | Caddi | Caddv | Cadda | Csubi | Cor -> ( + match self#select_addressing Word_int (Cop (op, args, dbg)) with + | Iindexed _, _ | Iindexed2 0, _ -> super#select_operation op args dbg + | ( ((Iindexed2 _ | Iscaled _ | Iindexed2scaled _ | Ibased _) as addr), + arg ) -> + Ispecific (Ilea addr), [arg]) + (* Recognize float arithmetic with memory. *) + | Caddf width -> + self#select_floatarith true width Simple_operation.Iaddf Arch.Ifloatadd + args + | Csubf width -> + self#select_floatarith false width Simple_operation.Isubf Arch.Ifloatsub + args + | Cmulf width -> + self#select_floatarith true width Simple_operation.Imulf Arch.Ifloatmul + args + | Cdivf width -> + self#select_floatarith false width Simple_operation.Idivf Arch.Ifloatdiv + args + | Cpackf32 -> + (* We must operate on registers. This is because if the second argument + was a float stack slot, the resulting UNPCKLPS instruction would + enforce the validity of loading it as a 128-bit memory location, even + though it only loads 64 bits. *) + Ispecific (Isimd (SSE Interleave_low_32_regs)), args + (* Special cases overriding C implementations (regardless of + [@@builtin]). 
*) + | Cextcall { func = "sqrt" as func; _ } + (* x86 intrinsics ([@@builtin]) *) + | Cextcall { func; builtin = true; _ } -> ( + match func with + | "caml_rdtsc_unboxed" -> Ispecific Irdtsc, args + | "caml_rdpmc_unboxed" -> Ispecific Irdpmc, args + | "caml_pause_hint" -> Ispecific Ipause, args + | "caml_load_fence" -> Ispecific Ilfence, args + | "caml_store_fence" -> Ispecific Isfence, args + | "caml_memory_fence" -> Ispecific Imfence, args + | "caml_cldemote" -> + let addr, eloc = + self#select_addressing Word_int (one_arg "cldemote" args) + in + Ispecific (Icldemote addr), [eloc] + | _ -> ( + match Simd_selection.select_operation func args with + | Some (op, args) -> op, args + | None -> super#select_operation op args dbg)) + (* Recognize store instructions *) + | Cstore (((Word_int | Word_val) as chunk), _init) -> ( + match args with + | [loc; Cop (Caddi, [Cop (Cload _, [loc'], _); Cconst_int (n, _dbg)], _)] + when loc = loc' && is_immediate n -> + let addr, arg = self#select_addressing chunk loc in + Ispecific (Ioffset_loc (n, addr)), [arg] + | _ -> super#select_operation op args dbg) + | Cbswap { bitwidth } -> + let bitwidth = select_bitwidth bitwidth in + Ispecific (Ibswap { bitwidth }), args + (* Recognize sign extension *) + | Casr -> ( + match args with + | [Cop (Clsl, [k; Cconst_int (32, _)], _); Cconst_int (32, _)] -> + Ispecific Isextend32, [k] + | _ -> super#select_operation op args dbg) + (* Recognize zero extension *) + | Clsr -> ( + match args with + | [Cop (Clsl, [k; Cconst_int (32, _)], _); Cconst_int (32, _)] -> + Ispecific Izextend32, [k] + | _ -> super#select_operation op args dbg) + (* Recognize zero extension again *) + | Cand -> ( + match args with + | [arg; Cconst_int (0xffff_ffff, _)] + | [arg; Cconst_natint (0xffff_ffffn, _)] + | [Cconst_int (0xffff_ffff, _); arg] + | [Cconst_natint (0xffff_ffffn, _); arg] -> + Ispecific Izextend32, [arg] + | _ -> super#select_operation op args dbg) + | Ccsel _ -> ( + match args with + | [cond; ifso; ifnot] -> ( + let cond, earg = self#select_condition cond in + match cond with + | Ifloattest (w, CFeq) -> + (* CFeq cannot be represented as cmov without a jump. CFneq emits + cmov for "unordered" and "not equal" cases. Use Cneq and swap the + arguments. *) + Icsel (Ifloattest (w, CFneq)), [earg; ifnot; ifso] + | _ -> Icsel cond, [earg; ifso; ifnot]) + | _ -> super#select_operation op args dbg) + | Cprefetch { is_write; locality } -> + (* Emit prefetch for read hint when prefetchw is not supported. 
Matches + the behavior of gcc's __builtin_prefetch *) + let is_write = + if is_write && not (Arch.Extension.enabled PREFETCHW) + then false + else is_write + in + let locality : Arch.prefetch_temporal_locality_hint = + match select_locality locality with + | Moderate when is_write && not (Arch.Extension.enabled PREFETCHWT1) + -> + High + | l -> l + in + let addr, eloc = + self#select_addressing Word_int (one_arg "prefetch" args) + in + Ispecific (Iprefetch { is_write; addr; locality }), [eloc] + | _ -> super#select_operation op args dbg + + (* Recognize float arithmetic with mem *) + + method select_floatarith commutative width regular_op mem_op args = + let open Cmm in + match width, args with + | ( Float64, + [arg1; Cop (Cload { memory_chunk = Double as chunk; _ }, [loc2], _)] ) + | ( Float32, + [ arg1; + Cop + ( Cload { memory_chunk = Single { reg = Float32 } as chunk; _ }, + [loc2], + _ ) ] ) -> + let addr, arg2 = self#select_addressing chunk loc2 in + Mach.Ispecific (Ifloatarithmem (width, mem_op, addr)), [arg1; arg2] + | ( Float64, + [Cop (Cload { memory_chunk = Double as chunk; _ }, [loc1], _); arg2] ) + | ( Float32, + [ Cop + ( Cload { memory_chunk = Single { reg = Float32 } as chunk; _ }, + [loc1], + _ ); + arg2 ] ) + when commutative -> + let addr, arg1 = self#select_addressing chunk loc1 in + Mach.Ispecific (Ifloatarithmem (width, mem_op, addr)), [arg2; arg1] + | _, [arg1; arg2] -> Mach.Ifloatop (width, regular_op), [arg1; arg2] + | _ -> assert false + + method! mark_c_tailcall = contains_calls := true + + (* Deal with register constraints *) + + method! insert_op_debug env op dbg rs rd = + try + let rsrc, rdst = pseudoregs_for_operation op rs rd in + self#insert_moves env rs rsrc; + self#insert_debug env (Iop op) dbg rsrc rdst; + self#insert_moves env rdst rd; + rd + with Use_default -> super#insert_op_debug env op dbg rs rd + end + +let fundecl ~future_funcnames f = + (new selector)#emit_fundecl ~future_funcnames f diff --git a/backend/amd64/selection_utils.ml b/backend/amd64/selection_utils.ml index 8162e3c89aa..2365fd487b6 100644 --- a/backend/amd64/selection_utils.ml +++ b/backend/amd64/selection_utils.ml @@ -76,6 +76,13 @@ let rec select_addr exp = | ( ((Asymbol _ | Aadd (_, _) | Ascaledadd (_, _, _)), _), ((Asymbol _ | Alinear _ | Aadd (_, _) | Ascaledadd (_, _, _)), _) ) -> Aadd (arg1, arg2), 0) + | Cmm.Cop (Cor, [arg; Cconst_int (1, _)], _) + | Cmm.Cop (Cor, [Cconst_int (1, _); arg], _) -> ( + (* optimize tagging integers *) + match select_addr arg with + | Ascale (e, scale), off when scale mod 2 = 0 -> + Ascale (e, scale), off lor 1 + | _ -> default) | _ -> default (* Special constraints on operand and result registers *) diff --git a/backend/cmm_helpers.ml b/backend/cmm_helpers.ml index 82da9ce5c00..7a8e5caafc6 100644 --- a/backend/cmm_helpers.ml +++ b/backend/cmm_helpers.ml @@ -320,58 +320,344 @@ let add_no_overflow n x c dbg = let d = n + x in if d = 0 then c else Cop (Caddi, [c; Cconst_int (d, dbg)], dbg) +let is_defined_shift n = 0 <= n && n < arch_bits + +(** returns true only if [e + n] is definitely the same as [e | n] *) +let[@inline] can_interchange_add_with_or e n = + match e with + | Cop (Clsl, [_; Cconst_int (x, _)], _) -> is_defined_shift x && n asr x = 0 + | _ -> false + +let[@inline] prefer_add = function + | Cop (Cor, [e; (Cconst_int (n, _) as n')], dbg) + when can_interchange_add_with_or e n -> + Cop (Caddi, [e; n'], dbg) + | e -> e + +let[@inline] prefer_or = function + | Cop (Caddi, [e; (Cconst_int (n, _) as n')], dbg) + when 
can_interchange_add_with_or e n -> + Cop (Cor, [e; n'], dbg) + | e -> e + +let rec map_tail1 e ~f = + match e with + | Clet (id, exp, body) -> Clet (id, exp, map_tail1 body ~f) + | Cphantom_let (id, exp, body) -> Cphantom_let (id, exp, map_tail1 body ~f) + | Csequence (e1, e2) -> Csequence (e1, map_tail1 e2 ~f) + | Cconst_int _ | Cconst_natint _ | Cconst_float32 _ | Cconst_float _ + | Cconst_vec128 _ | Cconst_symbol _ | Cvar _ | Ctuple _ | Cop _ + | Cifthenelse _ | Cexit _ | Ccatch _ | Ctrywith _ | Cswitch _ -> + f e + +let map_tail2 x y ~f = map_tail1 y ~f:(fun y -> map_tail1 x ~f:(fun x -> f x y)) + +let[@inline] is_constant = function + | Cconst_int _ | Cconst_natint _ -> true + | _ -> false + let rec add_const c n dbg = if n = 0 then c else - match c with - | Cconst_int (x, _) when Misc.no_overflow_add x n -> Cconst_int (x + n, dbg) - | Cop (Caddi, [Cconst_int (x, _); c], _) when Misc.no_overflow_add n x -> - add_no_overflow n x c dbg - | Cop (Caddi, [c; Cconst_int (x, _)], _) when Misc.no_overflow_add n x -> - add_no_overflow n x c dbg - | Cop (Csubi, [Cconst_int (x, _); c], _) when Misc.no_overflow_add n x -> - Cop (Csubi, [Cconst_int (n + x, dbg); c], dbg) - | Cop (Csubi, [c; Cconst_int (x, _)], _) when Misc.no_overflow_sub n x -> - add_const c (n - x) dbg - | c -> Cop (Caddi, [c; Cconst_int (n, dbg)], dbg) + map_tail1 c ~f:(fun c -> + match prefer_add c with + | Cconst_int (x, _) when Misc.no_overflow_add x n -> + Cconst_int (x + n, dbg) + | Cop (Caddi, [Cconst_int (x, _); c], _) when Misc.no_overflow_add n x + -> + add_no_overflow n x c dbg + | Cop (Caddi, [c; Cconst_int (x, _)], _) when Misc.no_overflow_add n x + -> + add_no_overflow n x c dbg + | Cop (Csubi, [Cconst_int (x, _); c], _) when Misc.no_overflow_add n x + -> + Cop (Csubi, [Cconst_int (n + x, dbg); c], dbg) + | Cop (Csubi, [c; Cconst_int (x, _)], _) when Misc.no_overflow_sub n x + -> + add_const c (n - x) dbg + | _ -> Cop (Caddi, [c; Cconst_int (n, dbg)], dbg)) let incr_int c dbg = add_const c 1 dbg let decr_int c dbg = add_const c (-1) dbg let rec add_int c1 c2 dbg = - match c1, c2 with - | Cconst_int (n, _), c | c, Cconst_int (n, _) -> add_const c n dbg - | Cop (Caddi, [c1; Cconst_int (n1, _)], _), c2 -> - add_const (add_int c1 c2 dbg) n1 dbg - | c1, Cop (Caddi, [c2; Cconst_int (n2, _)], _) -> - add_const (add_int c1 c2 dbg) n2 dbg - | _, _ -> Cop (Caddi, [c1; c2], dbg) + map_tail2 c1 c2 ~f:(fun c1 c2 -> + match prefer_add c1, prefer_add c2 with + | Cconst_int (n, _), c | c, Cconst_int (n, _) -> add_const c n dbg + | Cop (Caddi, [c1; Cconst_int (n1, _)], _), c2 -> + add_const (add_int c1 c2 dbg) n1 dbg + | c1, Cop (Caddi, [c2; Cconst_int (n2, _)], _) -> + add_const (add_int c1 c2 dbg) n2 dbg + | _, _ -> Cop (Caddi, [c1; c2], dbg)) let rec sub_int c1 c2 dbg = - match c1, c2 with - | c1, Cconst_int (n2, _) when n2 <> min_int -> add_const c1 (-n2) dbg - | c1, Cop (Caddi, [c2; Cconst_int (n2, _)], _) when n2 <> min_int -> - add_const (sub_int c1 c2 dbg) (-n2) dbg - | Cop (Caddi, [c1; Cconst_int (n1, _)], _), c2 -> - add_const (sub_int c1 c2 dbg) n1 dbg - | c1, c2 -> Cop (Csubi, [c1; c2], dbg) + map_tail2 c1 c2 ~f:(fun c1 c2 -> + match c1, c2 with + | c1, Cconst_int (n2, _) when n2 <> min_int -> add_const c1 (-n2) dbg + | c1, Cop (Caddi, [c2; Cconst_int (n2, _)], _) when n2 <> min_int -> + add_const (sub_int c1 c2 dbg) (-n2) dbg + | Cop (Caddi, [c1; Cconst_int (n1, _)], _), c2 -> + add_const (sub_int c1 c2 dbg) n1 dbg + | c1, c2 -> Cop (Csubi, [c1; c2], dbg)) let neg_int c dbg = sub_int (Cconst_int (0, dbg)) c dbg -let 
rec lsl_int c1 c2 dbg = - match c1, c2 with - | c1, Cconst_int (0, _) -> c1 - | Cop (Clsl, [c; Cconst_int (n1, _)], _), Cconst_int (n2, _) - when n1 > 0 && n2 > 0 && n1 + n2 < size_int * 8 -> - Cop (Clsl, [c; Cconst_int (n1 + n2, dbg)], dbg) - | Cop (Caddi, [c1; Cconst_int (n1, _)], _), Cconst_int (n2, _) - when Misc.no_overflow_lsl n1 n2 -> - add_const (lsl_int c1 c2 dbg) (n1 lsl n2) dbg - | _, _ -> Cop (Clsl, [c1; c2], dbg) +(** This function conservatively approximates the number of significant bits in its signed + argument. That is, it computes the number of bits required to represent the absolute + value of its argument. *) +let rec max_signed_bit_length e = + match prefer_or e with + | Cop ((Ccmpi _ | Ccmpf _), _, _) -> + (* integer/float comparisons return either [1] or [0]. *) + 1 + | Cop (Cand, [_; Cconst_int (n, _)], _) when n > 0 -> 1 + Misc.log2 n + | Cop (Clsl, [c; Cconst_int (n, _)], _) when is_defined_shift n -> + Int.min arch_bits (max_signed_bit_length c + n) + | Cop (Casr, [c; Cconst_int (n, _)], _) when is_defined_shift n -> + Int.max 0 (max_signed_bit_length c - n) + | Cop (Clsr, [c; Cconst_int (n, _)], _) when is_defined_shift n -> + if n = 0 then max_signed_bit_length c else arch_bits - n + | Cop ((Cand | Cor | Cxor), [x; y], _) -> + Int.max (max_signed_bit_length x) (max_signed_bit_length y) + | _ -> arch_bits + +let ignore_low_bit_int = function + | Cop + ( Caddi, + [(Cop (Clsl, [_; Cconst_int (n, _)], _) as c); Cconst_int (1, _)], + _ ) + when n > 0 && is_defined_shift n -> + c + | Cop (Cor, [c; Cconst_int (1, _)], _) -> c + | c -> c + +let[@inline] get_const = function + | Cconst_int (i, _) -> Some (Nativeint.of_int i) + | Cconst_natint (i, _) -> Some i + | _ -> None -let lsl_const c n dbg = lsl_int c (Cconst_int (n, dbg)) dbg +let[@inline] const_exn = function + | Cconst_int (i, _) -> Nativeint.of_int i + | Cconst_natint (i, _) -> i + | _ -> Misc.fatal_error "const_exn: not a constant" + +let replace x ~with_ = + match x with + | Cconst_int _ | Cconst_natint _ | Cconst_symbol _ | Cvar _ | Ctuple [] -> + with_ + | inner -> Csequence (inner, with_) + +let rec xor_const e n dbg = + match n with + | 0n -> e + | n -> + map_tail1 e ~f:(fun e -> + match get_const e with + | Some e -> natint_const_untagged dbg (Nativeint.logxor e n) + | None -> ( + let[@local] default () = + (* prefer putting constants on the right *) + Cop (Cxor, [e; natint_const_untagged dbg n], dbg) + in + match e with + | Cop (Cxor, [x; y], _) -> ( + match get_const y with + | None -> default () + | Some y -> xor_const x (Nativeint.logxor y n) dbg) + | _ -> default ())) + +let rec or_const e n dbg = + match n with + | 0n -> e + | -1n -> replace e ~with_:(Cconst_int (-1, dbg)) + | n -> + map_tail1 e ~f:(fun e -> + let[@local] default () = + (* prefer putting constants on the right *) + Cop (Cor, [e; natint_const_untagged dbg n], dbg) + in + match get_const e with + | Some e -> natint_const_untagged dbg (Nativeint.logor e n) + | None -> ( + match e with + | Cop (Cor, [x; y], _) -> ( + match get_const y with + | None -> default () + | Some y -> or_const x (Nativeint.logor y n) dbg) + | _ -> default ())) + +let rec and_const e n dbg = + match n with + | 0n -> replace e ~with_:(Cconst_int (0, dbg)) + | -1n -> e + | n -> + map_tail1 e ~f:(fun e -> + match get_const e with + | Some e -> natint_const_untagged dbg (Nativeint.logand e n) + | None -> ( + let[@local] default () = + (* prefer putting constants on the right *) + Cop (Cand, [e; natint_const_untagged dbg n], dbg) + in + match e with + | Cop (Cand, 
[x; y], dbg) -> ( + match get_const y with + | Some y -> and_const x (Nativeint.logand y n) dbg + | None -> default ()) + | Cop (Cload { memory_chunk; mutability; is_atomic }, args, dbg) -> ( + let[@local] load memory_chunk = + Cop (Cload { memory_chunk; mutability; is_atomic }, args, dbg) + in + match memory_chunk, n with + | (Byte_signed | Byte_unsigned), 0xffn -> load Byte_unsigned + | (Sixteen_signed | Sixteen_unsigned), 0xffffn -> + load Sixteen_unsigned + | (Thirtytwo_signed | Thirtytwo_unsigned), 0xffff_ffffn -> + load Thirtytwo_unsigned + | _ -> default ()) + | _ -> default ())) + +let xor_int c1 c2 dbg = + map_tail2 c1 c2 ~f:(fun c1 c2 -> + match get_const c1, get_const c2 with + | Some c1, Some c2 -> natint_const_untagged dbg (Nativeint.logxor c1 c2) + | None, Some c2 -> xor_const c1 c2 dbg + | Some c1, None -> xor_const c2 c1 dbg + | None, None -> Cop (Cxor, [c1; c2], dbg)) + +let or_int c1 c2 dbg = + map_tail2 c1 c2 ~f:(fun c1 c2 -> + match get_const c1, get_const c2 with + | Some c1, Some c2 -> natint_const_untagged dbg (Nativeint.logor c1 c2) + | None, Some c2 -> or_const c1 c2 dbg + | Some c1, None -> or_const c2 c1 dbg + | None, None -> Cop (Cor, [c1; c2], dbg)) + +let and_int c1 c2 dbg = + map_tail2 c1 c2 ~f:(fun c1 c2 -> + match get_const c1, get_const c2 with + | Some c1, Some c2 -> natint_const_untagged dbg (Nativeint.logand c1 c2) + | None, Some c2 -> and_const c1 c2 dbg + | Some c1, None -> and_const c2 c1 dbg + | None, None -> Cop (Cand, [c1; c2], dbg)) + +let rec lsr_int c1 c2 dbg = + map_tail2 c1 c2 ~f:(fun c1 c2 -> + match c1, c2 with + | c1, Cconst_int (0, _) -> c1 + | c1, Cconst_int (n, _) when is_defined_shift n -> ( + let c1 = ignore_low_bit_int c1 in + match get_const c1 with + | Some x -> + natint_const_untagged dbg (Nativeint.shift_right_logical x n) + | None -> ( + match prefer_or c1 with + | Cop (Clsr, [inner; Cconst_int (n', _)], _) when is_defined_shift n' + -> + if is_defined_shift (n + n') + then lsr_const inner (n + n') dbg + else replace inner ~with_:(Cconst_int (0, dbg)) + | Cop ((Cor | Cxor), [x; ((Cconst_int _ | Cconst_natint _) as y)], _) + when Nativeint.shift_right_logical (const_exn y) n = 0n -> + lsr_int x c2 dbg + | Cop (Cand, [x; ((Cconst_int _ | Cconst_natint _) as y)], _) + when Nativeint.shift_right (const_exn y) n = 0n -> + replace x ~with_:(Cconst_int (0, dbg)) + | _ -> Cop (Clsr, [c1; c2], dbg))) + | Cop (Clsr, [x; (Cconst_int (n', _) as y)], dbg'), c2 + when is_defined_shift n' -> + (* prefer putting the constant shift on the outside to help enable + further peephole optimizations *) + Cop (Clsr, [Cop (Clsr, [x; c2], dbg); y], dbg') + | c1, c2 -> Cop (Clsr, [c1; c2], dbg)) + +and asr_int c1 c2 dbg = + map_tail2 c1 c2 ~f:(fun c1 c2 -> + match c1, c2 with + | c1, Cconst_int (0, _) -> c1 + | c1, Cconst_int (n, _) when is_defined_shift n -> ( + let c1 = ignore_low_bit_int c1 in + match get_const c1 with + | Some x -> natint_const_untagged dbg (Nativeint.shift_right x n) + | None -> ( + match prefer_or c1 with + | Cop (Casr, [inner; Cconst_int (n', _)], _) when is_defined_shift n' + -> + (* saturating add, since the sign bit extends to the left. 
This is + different from the logical shifts because arithmetic shifting + [arch_bits] times or more is the same as shifting [arch_bits - 1] + times *) + asr_const inner (Int.min (n + n') (arch_bits - 1)) dbg + | Cop (Clsr, [_; Cconst_int (n', _)], _) + when n' > 0 && is_defined_shift n' -> + (* If the argument is guaranteed non-negative, then we know the sign + bit is 0 and we can weaken this operation to a logical shift *) + lsr_const c1 n dbg + | Cop (Clsl, [c; Cconst_int (x, _)], _) + when is_defined_shift x && max_signed_bit_length c + x < arch_bits + -> + (* some operations always return small enough integers that it is + safe and correct to combine [asr (lsl x y) z] into [asr x (z - + y)]. *) + if x > n then lsl_const c (x - n) dbg else asr_const c (n - x) dbg + | Cop ((Cor | Cxor), [x; ((Cconst_int _ | Cconst_natint _) as y)], _) + when Nativeint.shift_right (const_exn y) n = 0n -> + asr_int x c2 dbg + | Cop (Cor, [x; ((Cconst_int _ | Cconst_natint _) as y)], _) + when Nativeint.shift_right (const_exn y) n = -1n -> + replace x ~with_:(Cconst_int (-1, dbg)) + | Cop (Cand, [x; ((Cconst_int _ | Cconst_natint _) as y)], _) + when Nativeint.shift_right (const_exn y) n = -1n -> + asr_int x c2 dbg + | Cop (Cand, [x; ((Cconst_int _ | Cconst_natint _) as y)], _) + when Nativeint.shift_right (const_exn y) n = 0n -> + replace x ~with_:(Cconst_int (0, dbg)) + | _ -> Cop (Casr, [c1; c2], dbg))) + | Cop (Casr, [x; (Cconst_int (n', _) as y)], z), c2 + when is_defined_shift n' -> + (* prefer putting the constant shift on the outside to help enable + further peephole optimizations *) + Cop (Casr, [Cop (Casr, [x; c2], dbg); y], z) + | _ -> Cop (Casr, [c1; c2], dbg)) + +and lsl_int c1 c2 dbg = + map_tail2 c1 c2 ~f:(fun c1 c2 -> + match c1, c2 with + | c1, Cconst_int (0, _) -> c1 + | c1, Cconst_int (n, _) when is_defined_shift n -> ( + match get_const c1 with + | Some c1 -> natint_const_untagged dbg (Nativeint.shift_left c1 n) + | None -> ( + match c1 with + | Cop (Clsl, [inner; Cconst_int (n', _)], dbg) + when is_defined_shift n' -> + if is_defined_shift (n + n') + then lsl_const inner (n + n') dbg + else replace inner ~with_:(Cconst_int (0, dbg)) + | Cop (Caddi, [c1; Cconst_int (offset, _)], _) + when Misc.no_overflow_lsl offset n -> + add_const (lsl_int c1 c2 dbg) (offset lsl n) dbg + | Cop ((Cor | Cxor), [x; ((Cconst_int _ | Cconst_natint _) as y)], _) + when Nativeint.shift_left (const_exn y) n = 0n -> + lsl_int x c2 dbg + | Cop (Cand, [x; ((Cconst_int _ | Cconst_natint _) as y)], _) + when Nativeint.shift_left (const_exn y) n = 0n -> + replace x ~with_:(Cconst_int (0, dbg)) + | c1 -> Cop (Clsl, [c1; c2], dbg))) + | Cop (Clsl, [x; (Cconst_int (n', _) as y)], dbg'), c2 + when is_defined_shift n' -> + (* prefer putting the constant shift on the outside to help enable + further peephole optimizations *) + Cop (Clsl, [Cop (Clsl, [x; c2], dbg); y], dbg') + | _, _ -> Cop (Clsl, [c1; c2], dbg)) + +and lsl_const c n dbg = lsl_int c (Cconst_int (n, dbg)) dbg + +and asr_const c n dbg = asr_int c (Cconst_int (n, dbg)) dbg + +and lsr_const c n dbg = lsr_int c (Cconst_int (n, dbg)) dbg let is_power2 n = n = 1 lsl Misc.log2 n @@ -392,74 +678,28 @@ let rec mul_int c1 c2 dbg = add_const (mul_int c (Cconst_int (k, dbg)) dbg) (n * k) dbg | c1, c2 -> Cop (Cmuli, [c1; c2], dbg) -(* identify cmm operations whose result is guaranteed to be small integers (e.g. 
- in the range [min_int / 4; max_int / 4]) *) -let guaranteed_to_be_small_int = function - | Cop ((Ccmpi _ | Ccmpf _), _, _) -> - (* integer/float comparisons return either [1] or [0]. *) - true - | _ -> false - -let ignore_low_bit_int = function - | Cop - ( Caddi, - [(Cop (Clsl, [_; Cconst_int (n, _)], _) as c); Cconst_int (1, _)], - _ ) - when n > 0 -> - c - | Cop (Cor, [c; Cconst_int (1, _)], _) -> c - | c -> c - -let lsr_int c1 c2 dbg = - match c1, c2 with - | c1, Cconst_int (0, _) -> c1 - | Cop (Clsr, [c; Cconst_int (n1, _)], _), Cconst_int (n2, _) - when n1 > 0 && n2 > 0 && n1 + n2 < size_int * 8 -> - Cop (Clsr, [c; Cconst_int (n1 + n2, dbg)], dbg) - | c1, Cconst_int (n, _) when n > 0 -> - Cop (Clsr, [ignore_low_bit_int c1; c2], dbg) - | _ -> Cop (Clsr, [c1; c2], dbg) - -let lsr_const c n dbg = lsr_int c (Cconst_int (n, dbg)) dbg - -let asr_int c1 c2 dbg = - match c2 with - | Cconst_int (0, _) -> c1 - | Cconst_int (n, _) when n > 0 -> ( - match ignore_low_bit_int c1 with - (* some operations always return small enough integers that it is safe and - correct to optimise [asr (lsl x 1) 1] into [x]. *) - | Cop (Clsl, [c; Cconst_int (1, _)], _) - when n = 1 && guaranteed_to_be_small_int c -> - c - | c1' -> Cop (Casr, [c1'; c2], dbg)) - | _ -> Cop (Casr, [c1; c2], dbg) - -let asr_const c n dbg = asr_int c (Cconst_int (n, dbg)) dbg - let tag_int i dbg = match i with | Cconst_int (n, _) -> int_const dbg n - | Cop (Casr, [c; Cconst_int (n, _)], _) when n > 0 -> - Cop - (Cor, [asr_int c (Cconst_int (n - 1, dbg)) dbg; Cconst_int (1, dbg)], dbg) - | c -> incr_int (lsl_int c (Cconst_int (1, dbg)) dbg) dbg + | c -> incr_int (lsl_const c 1 dbg) dbg let untag_int i dbg = match i with | Cconst_int (n, _) -> Cconst_int (n asr 1, dbg) | Cop (Cor, [Cop (Casr, [c; Cconst_int (n, _)], _); Cconst_int (1, _)], _) - when n > 0 && n < (size_int * 8) - 1 -> - Cop (Casr, [c; Cconst_int (n + 1, dbg)], dbg) + when n > 0 && is_defined_shift (n + 1) -> + asr_const c (n + 1) dbg | Cop (Cor, [Cop (Clsr, [c; Cconst_int (n, _)], _); Cconst_int (1, _)], _) - when n > 0 && n < (size_int * 8) - 1 -> - Cop (Clsr, [c; Cconst_int (n + 1, dbg)], dbg) - | c -> asr_int c (Cconst_int (1, dbg)) dbg + when n > 0 && is_defined_shift (n + 1) -> + lsr_const c (n + 1) dbg + | c -> asr_const c 1 dbg let mk_not dbg cmm = match cmm with - | Cop (Caddi, [Cop (Clsl, [c; Cconst_int (1, _)], _); Cconst_int (1, _)], dbg') - -> ( + | Cop + ( (Caddi | Cor), + [Cop (Clsl, [c; Cconst_int (1, _)], _); Cconst_int (1, _)], + dbg' ) -> ( match c with | Cop (Ccmpi cmp, [c1; c2], dbg'') -> tag_int @@ -793,8 +1033,10 @@ let mod_int ?dividend_cannot_be_min_int c1 c2 dbg = let test_bool dbg cmm = match cmm with - | Cop (Caddi, [Cop (Clsl, [c; Cconst_int (1, _)], _); Cconst_int (1, _)], _) - -> + | Cop + ( (Caddi | Cor), + [Cop (Clsl, [c; Cconst_int (1, _)], _); Cconst_int (1, _)], + _ ) -> c | Cconst_int (n, dbg) -> if n = 1 then Cconst_int (0, dbg) else Cconst_int (1, dbg) @@ -1029,8 +1271,10 @@ let array_indexing ?typ log2size ptr ofs dbg = if i = 0 then ptr else Cop (add, [ptr; Cconst_int (i lsl log2size, dbg)], dbg) - | Cop (Caddi, [Cop (Clsl, [c; Cconst_int (1, _)], _); Cconst_int (1, _)], dbg') - -> + | Cop + ( (Caddi | Cor), + [Cop (Clsl, [c; Cconst_int (1, _)], _); Cconst_int (1, _)], + dbg' ) -> Cop (add, [ptr; lsl_const c log2size dbg], dbg') | Cop (Caddi, [c; Cconst_int (n, _)], dbg') when log2size = 0 -> Cop @@ -1293,11 +1537,11 @@ let rec low_bits ~bits ~dbg x = then x else let unused_bits = arch_bits - bits in - let does_mask_keep_low_bits 
test_mask = + let does_mask_keep_low_bits mask = (* If the mask has all the low bits set, then the low bits are unchanged. This could happen from zero-extension. *) - let mask = Nativeint.pred (Nativeint.shift_left 1n bits) in - Nativeint.equal mask (Nativeint.logand test_mask mask) + let low_bits = Nativeint.pred (Nativeint.shift_left 1n bits) in + Nativeint.equal low_bits (Nativeint.logand mask low_bits) in (* Ignore sign and zero extensions which do not affect the low bits *) map_tail @@ -1344,7 +1588,7 @@ let zero_extend ~bits ~dbg e = | e -> zero_extend_via_mask e) (low_bits ~bits e ~dbg) -let sign_extend ~bits ~dbg e = +let rec sign_extend ~bits ~dbg e = assert (0 < bits && bits <= arch_bits); let unused_bits = arch_bits - bits in let sign_extend_via_shift e = @@ -1354,11 +1598,24 @@ let sign_extend ~bits ~dbg e = then e else map_tail - (function - | Cop ((Casr | Clsr), [inner; Cconst_int (n, _)], _) as e - when 0 <= n && n < arch_bits -> + (fun e -> + match prefer_or e with + | Cop (Cand, [x; y], _) when is_constant y -> + and_int (sign_extend ~bits x ~dbg) (sign_extend ~bits y ~dbg) dbg + | Cop (Cor, [x; y], _) when is_constant y -> + or_int (sign_extend ~bits x ~dbg) (sign_extend ~bits y ~dbg) dbg + | Cop (Cxor, [x; y], _) when is_constant y -> + xor_int (sign_extend ~bits x ~dbg) (sign_extend ~bits y ~dbg) dbg + | Cop (((Casr | Clsr) as op), [inner; Cconst_int (n, _)], _) as e + when is_defined_shift n -> (* see middle_end/flambda2/z3/sign_extension.py for proof *) - if n > unused_bits + if n = unused_bits + then + match op with + | Casr -> e + | Clsr -> asr_const inner unused_bits dbg + | _ -> assert false + else if n > unused_bits then (* sign-extension is a no-op since the top n bits already match *) e @@ -1961,21 +2218,6 @@ let bigarray_word_kind : Lambda.bigarray_kind -> memory_chunk = function | Pbigarray_complex32 -> Single { reg = Float64 } | Pbigarray_complex64 -> Double -let and_int e1 e2 dbg = - let is_mask32 = function - | Cconst_natint (0xFFFF_FFFFn, _) -> true - | Cconst_int (n, _) -> Nativeint.of_int n = 0xFFFF_FFFFn - | _ -> false - in - match e1, e2 with - | e, m when is_mask32 m -> zero_extend ~bits:32 e ~dbg - | m, e when is_mask32 m -> zero_extend ~bits:32 e ~dbg - | e1, e2 -> Cop (Cand, [e1; e2], dbg) - -let or_int e1 e2 dbg = Cop (Cor, [e1; e2], dbg) - -let xor_int e1 e2 dbg = Cop (Cxor, [e1; e2], dbg) - (* Boxed integers *) let operations_boxed_int (bi : Primitive.boxed_integer) = diff --git a/middle_end/flambda2/z3/sign_extension.py b/middle_end/flambda2/z3/sign_extension.py index 6613558072d..272f4a46c71 100755 --- a/middle_end/flambda2/z3/sign_extension.py +++ b/middle_end/flambda2/z3/sign_extension.py @@ -29,10 +29,15 @@ def reference_sign_extend(self, bits): def experimental_sign_extend(self, bits) -> BitVecRef: unused_bits = self.size() - bits + return If( - self.shift_right > unused_bits, - self.as_ast(), - (self.inner << (unused_bits - self.shift_right)) >> unused_bits, + self.shift_right == unused_bits, + If(self.arith, self.as_ast(), self.inner >> unused_bits), + If( + self.shift_right > unused_bits, + self.as_ast(), + (self.inner << (unused_bits - self.shift_right)) >> unused_bits, + ), )
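
Note (not part of the patch): the rewrites in this series lean on a handful of bitvector identities. Below is a quick standalone z3 sketch of the main ones, assuming the z3-solver Python package and a 64-bit word (arch_bits = 64); the concrete shift counts and the bits = 32 choice are illustrative only and are not taken from the patch itself.

    from z3 import BitVec, LShR, prove

    W = 64
    x = BitVec("x", W)

    # 1. Tagging: when the low bit is known clear (x shifted left), adding 1
    #    and or-ing 1 coincide; this is the fact behind
    #    can_interchange_add_with_or and the (Caddi | Cor) patterns.
    prove((x << 1) + 1 == ((x << 1) | 1))

    # 2. Constant logical shifts compose, as lsr_int does when folding nested
    #    Clsr nodes (3 + 5 = 8 is still a defined shift).
    prove(LShR(LShR(x, 3), 5) == LShR(x, 8))

    # 3. Constant arithmetic shifts compose with saturation at arch_bits - 1,
    #    matching the "saturating add" case in asr_int (40 + 40 saturates
    #    to 63).  Note that >> is arithmetic shift right in z3py.
    prove((x >> 40) >> 40 == x >> 63)

    # 4. The new sign_extend case: when a logical right shift count equals the
    #    number of unused bits, sign-extending the low bits equals a single
    #    arithmetic shift by that count (the shift_right == unused_bits branch
    #    added to sign_extension.py).
    BITS = 32
    UNUSED = W - BITS
    low = LShR(x, UNUSED)                   # Clsr by unused_bits
    reference = (low << UNUSED) >> UNUSED   # reference sign extension of low 32 bits
    prove(reference == x >> UNUSED)         # asr_const inner unused_bits

Each call prints "proved".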