diff --git a/backend/amd64/cfg_selection.ml b/backend/amd64/cfg_selection.ml index 776b648f406..6b3252a2cea 100644 --- a/backend/amd64/cfg_selection.ml +++ b/backend/amd64/cfg_selection.ml @@ -194,7 +194,7 @@ class selector = method! select_operation op args dbg ~label_after = match op with (* Recognize the LEA instruction *) - | Caddi | Caddv | Cadda | Csubi -> ( + | Caddi | Caddv | Cadda | Csubi | Cor -> ( match self#select_addressing Word_int (Cop (op, args, dbg)) with | Iindexed _, _ | Iindexed2 0, _ -> super#select_operation op args dbg ~label_after @@ -252,13 +252,18 @@ class selector = | Cbswap { bitwidth } -> let bitwidth = select_bitwidth bitwidth in specific (Ibswap { bitwidth }), args - (* Recognize sign extension *) | Casr -> ( + (* Recognize sign extension *) match args with | [Cop (Clsl, [k; Cconst_int (32, _)], _); Cconst_int (32, _)] -> specific Isextend32, [k] | _ -> super#select_operation op args dbg ~label_after) (* Recognize zero extension *) + | Clsr -> ( + match args with + | [Cop (Clsl, [k; Cconst_int (32, _)], _); Cconst_int (32, _)] -> + specific Izextend32, [k] + | _ -> super#select_operation op args dbg ~label_after) | Cand -> ( match args with | [arg; Cconst_int (0xffff_ffff, _)] diff --git a/backend/amd64/selection.ml b/backend/amd64/selection.ml new file mode 100644 index 00000000000..fcf75f5b527 --- /dev/null +++ b/backend/amd64/selection.ml @@ -0,0 +1,352 @@ +(**************************************************************************) +(* *) +(* OCaml *) +(* *) +(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) +(* *) +(* Copyright 2000 Institut National de Recherche en Informatique et *) +(* en Automatique. *) +(* *) +(* All rights reserved. This file is distributed under the terms of *) +(* the GNU Lesser General Public License version 2.1, with the *) +(* special exception on linking described in the file LICENSE. *) +(* *) +(**************************************************************************) + +(* Instruction selection for the AMD64 *) + +[@@@ocaml.warning "+a-4-9-40-41-42"] + +(* note: no `open! Int_replace_polymorphic_compare` as the module is about to be + deleted. *) + +open Arch +open Selection_utils + +let pseudoregs_for_operation op arg res = + match (op : Mach.operation) with + (* Two-address binary operations: arg.(0) and res.(0) must be the same *) + | Iintop (Iadd | Isub | Imul | Iand | Ior | Ixor) + | Ifloatop ((Float32 | Float64), (Iaddf | Isubf | Imulf | Idivf)) -> + [| res.(0); arg.(1) |], res + | Iintop_atomic { op = Compare_set; size = _; addr = _ } -> + (* first arg must be rax *) + let arg = Array.copy arg in + arg.(0) <- rax; + arg, res + | Iintop_atomic { op = Compare_exchange; size = _; addr = _ } -> + (* first arg must be rax, res.(0) must be rax. *) + let arg = Array.copy arg in + arg.(0) <- rax; + arg, [| rax |] + | Iintop_atomic { op = Exchange | Fetch_and_add; size = _; addr = _ } -> + (* first arg must be the same as res.(0) *) + let arg = Array.copy arg in + arg.(0) <- res.(0); + arg, res + (* One-address unary operations: arg.(0) and res.(0) must be the same *) + | Iintop_imm ((Iadd | Isub | Imul | Iand | Ior | Ixor | Ilsl | Ilsr | Iasr), _) + | Ifloatop ((Float64 | Float32), (Iabsf | Inegf)) + | Ispecific (Ibswap { bitwidth = Thirtytwo | Sixtyfour }) -> + res, res + (* For xchg, args must be a register allowing access to high 8 bit register + (rax, rbx, rcx or rdx). Keep it simple, just force the argument in rax. 
*) + | Ispecific (Ibswap { bitwidth = Sixteen }) -> [| rax |], [| rax |] + (* For imulh, first arg must be in rax, rax is clobbered, and result is in + rdx. *) + | Iintop (Imulh _) -> [| rax; arg.(1) |], [| rdx |] + | Ispecific (Ifloatarithmem (_, _, _)) -> + let arg' = Array.copy arg in + arg'.(0) <- res.(0); + arg', res + (* For shifts with variable shift count, second arg must be in rcx *) + | Iintop (Ilsl | Ilsr | Iasr) -> [| res.(0); rcx |], res + (* For div and mod, first arg must be in rax, rdx is clobbered, and result is + in rax or rdx respectively. Keep it simple, just force second argument in + rcx. *) + | Iintop Idiv -> [| rax; rcx |], [| rax |] + | Iintop Imod -> [| rax; rcx |], [| rdx |] + | Ifloatop (Float64, Icompf cond) -> + (* CR gyorsh: make this optimization as a separate PR. *) + (* We need to temporarily store the result of the comparison in a float + register, but we don't want to clobber any of the inputs if they would + still be live after this operation -- so we add a fresh register as both + an input and output. We don't use [destroyed_at_oper], because that + forces us to choose a fixed register, which makes it more likely an extra + mov would be added to transfer the argument to the fixed register. *) + let treg = Reg.create Float in + let _, is_swapped = float_cond_and_need_swap cond in + ( (if is_swapped then [| arg.(0); treg |] else [| treg; arg.(1) |]), + [| res.(0); treg |] ) + | Ifloatop (Float32, Icompf cond) -> + let treg = Reg.create Float32 in + let _, is_swapped = float_cond_and_need_swap cond in + ( (if is_swapped then [| arg.(0); treg |] else [| treg; arg.(1) |]), + [| res.(0); treg |] ) + | Ispecific Irdpmc -> + (* For rdpmc instruction, the argument must be in ecx and the result is in + edx (high) and eax (low). Make it simple and force the argument in rcx, + and rax and rdx clobbered *) + [| rcx |], res + | Ispecific (Isimd op) -> + Simd_selection.pseudoregs_for_operation + (Simd_proc.register_behavior op) + arg res + | Ispecific (Isimd_mem (op, _addr)) -> + Simd_selection.pseudoregs_for_operation + (Simd_proc.Mem.register_behavior op) + arg res + | Icsel _ -> + (* last arg must be the same as res.(0) *) + let len = Array.length arg in + let arg = Array.copy arg in + arg.(len - 1) <- res.(0); + arg, res + (* Other instructions are regular *) + | Iintop_atomic { op = Add | Sub | Land | Lor | Lxor; _ } + | Iintop (Ipopcnt | Iclz _ | Ictz _ | Icomp _) + | Iintop_imm ((Imulh _ | Idiv | Imod | Icomp _ | Ipopcnt | Iclz _ | Ictz _), _) + | Ispecific + ( Isextend32 | Izextend32 | Ilea _ + | Istore_int (_, _, _) + | Ipause | Ilfence | Isfence | Imfence + | Ioffset_loc (_, _) + | Irdtsc | Icldemote _ | Iprefetch _ ) + | Imove | Ispill | Ireload | Ireinterpret_cast _ | Istatic_cast _ + | Iconst_int _ | Iconst_float32 _ | Iconst_float _ | Iconst_vec128 _ + | Iconst_symbol _ | Icall_ind | Icall_imm _ | Itailcall_ind | Itailcall_imm _ + | Iextcall _ | Istackoffset _ | Iload _ + | Istore (_, _, _) + | Ialloc _ | Iname_for_debugger _ | Iprobe _ | Iprobe_is_enabled _ | Iopaque + | Ibeginregion | Iendregion | Ipoll _ | Idls_get -> + raise Use_default + +(* The selector class *) + +class selector = + object (self) + inherit Selectgen.selector_generic as super + + method! is_immediate op n = + match op with + | Iadd | Isub | Imul | Iand | Ior | Ixor | Icomp _ -> is_immediate n + | _ -> super#is_immediate op n + + method is_immediate_test _cmp n = is_immediate n + + method! 
is_simple_expr e = + match e with + | Cop (Cextcall { func = fn }, args, _) when List.mem fn inline_ops -> + (* inlined ops are simple if their arguments are *) + List.for_all self#is_simple_expr args + | _ -> super#is_simple_expr e + + method! effects_of e = + match e with + | Cop (Cextcall { func = fn }, args, _) when List.mem fn inline_ops -> + Select_utils.Effect_and_coeffect.join_list_map args self#effects_of + | _ -> super#effects_of e + + method select_addressing _chunk exp = + let a, d = select_addr exp in + (* PR#4625: displacement must be a signed 32-bit immediate *) + if not (is_immediate d) + then Iindexed 0, exp + else + match a with + | Asymbol s -> + let glob : Arch.sym_global = + match s.sym_global with Global -> Global | Local -> Local + in + Ibased (s.sym_name, glob, d), Ctuple [] + | Alinear e -> Iindexed d, e + | Aadd (e1, e2) -> Iindexed2 d, Ctuple [e1; e2] + | Ascale (e, scale) -> Iscaled (scale, d), e + | Ascaledadd (e1, e2, scale) -> + Iindexed2scaled (scale, d), Ctuple [e1; e2] + + method! select_store is_assign addr exp = + match exp with + | Cconst_int (n, _dbg) when is_immediate n -> + Ispecific (Istore_int (Nativeint.of_int n, addr, is_assign)), Ctuple [] + | Cconst_natint (n, _dbg) when is_immediate_natint n -> + Ispecific (Istore_int (n, addr, is_assign)), Ctuple [] + | Cconst_int _ | Cconst_vec128 _ + | Cconst_natint (_, _) + | Cconst_float32 (_, _) + | Cconst_float (_, _) + | Cconst_symbol (_, _) + | Cvar _ + | Clet (_, _, _) + | Clet_mut (_, _, _, _) + | Cphantom_let (_, _, _) + | Cassign (_, _) + | Ctuple _ + | Cop (_, _, _) + | Csequence (_, _) + | Cifthenelse (_, _, _, _, _, _, _) + | Cswitch (_, _, _, _, _) + | Ccatch (_, _, _, _) + | Cexit (_, _, _) + | Ctrywith (_, _, _, _, _, _) -> + super#select_store is_assign addr exp + + method! select_operation op args dbg = + match op with + (* Recognize the LEA instruction *) + | Caddi | Caddv | Cadda | Csubi | Cor -> ( + match self#select_addressing Word_int (Cop (op, args, dbg)) with + | Iindexed _, _ | Iindexed2 0, _ -> super#select_operation op args dbg + | ( ((Iindexed2 _ | Iscaled _ | Iindexed2scaled _ | Ibased _) as addr), + arg ) -> + Ispecific (Ilea addr), [arg]) + (* Recognize float arithmetic with memory. *) + | Caddf width -> + self#select_floatarith true width Simple_operation.Iaddf Arch.Ifloatadd + args + | Csubf width -> + self#select_floatarith false width Simple_operation.Isubf Arch.Ifloatsub + args + | Cmulf width -> + self#select_floatarith true width Simple_operation.Imulf Arch.Ifloatmul + args + | Cdivf width -> + self#select_floatarith false width Simple_operation.Idivf Arch.Ifloatdiv + args + | Cpackf32 -> + (* We must operate on registers. This is because if the second argument + was a float stack slot, the resulting UNPCKLPS instruction would + enforce the validity of loading it as a 128-bit memory location, even + though it only loads 64 bits. *) + Ispecific (Isimd (SSE Interleave_low_32_regs)), args + (* Special cases overriding C implementations (regardless of + [@@builtin]). 
*) + | Cextcall { func = "sqrt" as func; _ } + (* x86 intrinsics ([@@builtin]) *) + | Cextcall { func; builtin = true; _ } -> ( + match func with + | "caml_rdtsc_unboxed" -> Ispecific Irdtsc, args + | "caml_rdpmc_unboxed" -> Ispecific Irdpmc, args + | "caml_pause_hint" -> Ispecific Ipause, args + | "caml_load_fence" -> Ispecific Ilfence, args + | "caml_store_fence" -> Ispecific Isfence, args + | "caml_memory_fence" -> Ispecific Imfence, args + | "caml_cldemote" -> + let addr, eloc = + self#select_addressing Word_int (one_arg "cldemote" args) + in + Ispecific (Icldemote addr), [eloc] + | _ -> ( + match Simd_selection.select_operation func args with + | Some (op, args) -> op, args + | None -> super#select_operation op args dbg)) + (* Recognize store instructions *) + | Cstore (((Word_int | Word_val) as chunk), _init) -> ( + match args with + | [loc; Cop (Caddi, [Cop (Cload _, [loc'], _); Cconst_int (n, _dbg)], _)] + when loc = loc' && is_immediate n -> + let addr, arg = self#select_addressing chunk loc in + Ispecific (Ioffset_loc (n, addr)), [arg] + | _ -> super#select_operation op args dbg) + | Cbswap { bitwidth } -> + let bitwidth = select_bitwidth bitwidth in + Ispecific (Ibswap { bitwidth }), args + (* Recognize sign extension *) + | Casr -> ( + match args with + | [Cop (Clsl, [k; Cconst_int (32, _)], _); Cconst_int (32, _)] -> + Ispecific Isextend32, [k] + | _ -> super#select_operation op args dbg) + (* Recognize zero extension *) + | Clsr -> ( + match args with + | [Cop (Clsl, [k; Cconst_int (32, _)], _); Cconst_int (32, _)] -> + Ispecific Izextend32, [k] + | _ -> super#select_operation op args dbg) + (* Recognize zero extension again *) + | Cand -> ( + match args with + | [arg; Cconst_int (0xffff_ffff, _)] + | [arg; Cconst_natint (0xffff_ffffn, _)] + | [Cconst_int (0xffff_ffff, _); arg] + | [Cconst_natint (0xffff_ffffn, _); arg] -> + Ispecific Izextend32, [arg] + | _ -> super#select_operation op args dbg) + | Ccsel _ -> ( + match args with + | [cond; ifso; ifnot] -> ( + let cond, earg = self#select_condition cond in + match cond with + | Ifloattest (w, CFeq) -> + (* CFeq cannot be represented as cmov without a jump. CFneq emits + cmov for "unordered" and "not equal" cases. Use Cneq and swap the + arguments. *) + Icsel (Ifloattest (w, CFneq)), [earg; ifnot; ifso] + | _ -> Icsel cond, [earg; ifso; ifnot]) + | _ -> super#select_operation op args dbg) + | Cprefetch { is_write; locality } -> + (* Emit prefetch for read hint when prefetchw is not supported. 
Matches + the behavior of gcc's __builtin_prefetch *) + let is_write = + if is_write && not (Arch.Extension.enabled PREFETCHW) + then false + else is_write + in + let locality : Arch.prefetch_temporal_locality_hint = + match select_locality locality with + | Moderate when is_write && not (Arch.Extension.enabled PREFETCHWT1) + -> + High + | l -> l + in + let addr, eloc = + self#select_addressing Word_int (one_arg "prefetch" args) + in + Ispecific (Iprefetch { is_write; addr; locality }), [eloc] + | _ -> super#select_operation op args dbg + + (* Recognize float arithmetic with mem *) + + method select_floatarith commutative width regular_op mem_op args = + let open Cmm in + match width, args with + | ( Float64, + [arg1; Cop (Cload { memory_chunk = Double as chunk; _ }, [loc2], _)] ) + | ( Float32, + [ arg1; + Cop + ( Cload { memory_chunk = Single { reg = Float32 } as chunk; _ }, + [loc2], + _ ) ] ) -> + let addr, arg2 = self#select_addressing chunk loc2 in + Mach.Ispecific (Ifloatarithmem (width, mem_op, addr)), [arg1; arg2] + | ( Float64, + [Cop (Cload { memory_chunk = Double as chunk; _ }, [loc1], _); arg2] ) + | ( Float32, + [ Cop + ( Cload { memory_chunk = Single { reg = Float32 } as chunk; _ }, + [loc1], + _ ); + arg2 ] ) + when commutative -> + let addr, arg1 = self#select_addressing chunk loc1 in + Mach.Ispecific (Ifloatarithmem (width, mem_op, addr)), [arg2; arg1] + | _, [arg1; arg2] -> Mach.Ifloatop (width, regular_op), [arg1; arg2] + | _ -> assert false + + method! mark_c_tailcall = contains_calls := true + + (* Deal with register constraints *) + + method! insert_op_debug env op dbg rs rd = + try + let rsrc, rdst = pseudoregs_for_operation op rs rd in + self#insert_moves env rs rsrc; + self#insert_debug env (Iop op) dbg rsrc rdst; + self#insert_moves env rdst rd; + rd + with Use_default -> super#insert_op_debug env op dbg rs rd + end + +let fundecl ~future_funcnames f = + (new selector)#emit_fundecl ~future_funcnames f diff --git a/backend/amd64/selection_utils.ml b/backend/amd64/selection_utils.ml index 8162e3c89aa..2365fd487b6 100644 --- a/backend/amd64/selection_utils.ml +++ b/backend/amd64/selection_utils.ml @@ -76,6 +76,13 @@ let rec select_addr exp = | ( ((Asymbol _ | Aadd (_, _) | Ascaledadd (_, _, _)), _), ((Asymbol _ | Alinear _ | Aadd (_, _) | Ascaledadd (_, _, _)), _) ) -> Aadd (arg1, arg2), 0) + | Cmm.Cop (Cor, [arg; Cconst_int (1, _)], _) + | Cmm.Cop (Cor, [Cconst_int (1, _); arg], _) -> ( + (* optimize tagging integers *) + match select_addr arg with + | Ascale (e, scale), off when scale mod 2 = 0 -> + Ascale (e, scale), off lor 1 + | _ -> default) | _ -> default (* Special constraints on operand and result registers *) diff --git a/backend/cmm_helpers.ml b/backend/cmm_helpers.ml index 82da9ce5c00..7a8e5caafc6 100644 --- a/backend/cmm_helpers.ml +++ b/backend/cmm_helpers.ml @@ -320,58 +320,344 @@ let add_no_overflow n x c dbg = let d = n + x in if d = 0 then c else Cop (Caddi, [c; Cconst_int (d, dbg)], dbg) +let is_defined_shift n = 0 <= n && n < arch_bits + +(** returns true only if [e + n] is definitely the same as [e | n] *) +let[@inline] can_interchange_add_with_or e n = + match e with + | Cop (Clsl, [_; Cconst_int (x, _)], _) -> is_defined_shift x && n asr x = 0 + | _ -> false + +let[@inline] prefer_add = function + | Cop (Cor, [e; (Cconst_int (n, _) as n')], dbg) + when can_interchange_add_with_or e n -> + Cop (Caddi, [e; n'], dbg) + | e -> e + +let[@inline] prefer_or = function + | Cop (Caddi, [e; (Cconst_int (n, _) as n')], dbg) + when 
can_interchange_add_with_or e n -> + Cop (Cor, [e; n'], dbg) + | e -> e + +let rec map_tail1 e ~f = + match e with + | Clet (id, exp, body) -> Clet (id, exp, map_tail1 body ~f) + | Cphantom_let (id, exp, body) -> Cphantom_let (id, exp, map_tail1 body ~f) + | Csequence (e1, e2) -> Csequence (e1, map_tail1 e2 ~f) + | Cconst_int _ | Cconst_natint _ | Cconst_float32 _ | Cconst_float _ + | Cconst_vec128 _ | Cconst_symbol _ | Cvar _ | Ctuple _ | Cop _ + | Cifthenelse _ | Cexit _ | Ccatch _ | Ctrywith _ | Cswitch _ -> + f e + +let map_tail2 x y ~f = map_tail1 y ~f:(fun y -> map_tail1 x ~f:(fun x -> f x y)) + +let[@inline] is_constant = function + | Cconst_int _ | Cconst_natint _ -> true + | _ -> false + let rec add_const c n dbg = if n = 0 then c else - match c with - | Cconst_int (x, _) when Misc.no_overflow_add x n -> Cconst_int (x + n, dbg) - | Cop (Caddi, [Cconst_int (x, _); c], _) when Misc.no_overflow_add n x -> - add_no_overflow n x c dbg - | Cop (Caddi, [c; Cconst_int (x, _)], _) when Misc.no_overflow_add n x -> - add_no_overflow n x c dbg - | Cop (Csubi, [Cconst_int (x, _); c], _) when Misc.no_overflow_add n x -> - Cop (Csubi, [Cconst_int (n + x, dbg); c], dbg) - | Cop (Csubi, [c; Cconst_int (x, _)], _) when Misc.no_overflow_sub n x -> - add_const c (n - x) dbg - | c -> Cop (Caddi, [c; Cconst_int (n, dbg)], dbg) + map_tail1 c ~f:(fun c -> + match prefer_add c with + | Cconst_int (x, _) when Misc.no_overflow_add x n -> + Cconst_int (x + n, dbg) + | Cop (Caddi, [Cconst_int (x, _); c], _) when Misc.no_overflow_add n x + -> + add_no_overflow n x c dbg + | Cop (Caddi, [c; Cconst_int (x, _)], _) when Misc.no_overflow_add n x + -> + add_no_overflow n x c dbg + | Cop (Csubi, [Cconst_int (x, _); c], _) when Misc.no_overflow_add n x + -> + Cop (Csubi, [Cconst_int (n + x, dbg); c], dbg) + | Cop (Csubi, [c; Cconst_int (x, _)], _) when Misc.no_overflow_sub n x + -> + add_const c (n - x) dbg + | _ -> Cop (Caddi, [c; Cconst_int (n, dbg)], dbg)) let incr_int c dbg = add_const c 1 dbg let decr_int c dbg = add_const c (-1) dbg let rec add_int c1 c2 dbg = - match c1, c2 with - | Cconst_int (n, _), c | c, Cconst_int (n, _) -> add_const c n dbg - | Cop (Caddi, [c1; Cconst_int (n1, _)], _), c2 -> - add_const (add_int c1 c2 dbg) n1 dbg - | c1, Cop (Caddi, [c2; Cconst_int (n2, _)], _) -> - add_const (add_int c1 c2 dbg) n2 dbg - | _, _ -> Cop (Caddi, [c1; c2], dbg) + map_tail2 c1 c2 ~f:(fun c1 c2 -> + match prefer_add c1, prefer_add c2 with + | Cconst_int (n, _), c | c, Cconst_int (n, _) -> add_const c n dbg + | Cop (Caddi, [c1; Cconst_int (n1, _)], _), c2 -> + add_const (add_int c1 c2 dbg) n1 dbg + | c1, Cop (Caddi, [c2; Cconst_int (n2, _)], _) -> + add_const (add_int c1 c2 dbg) n2 dbg + | _, _ -> Cop (Caddi, [c1; c2], dbg)) let rec sub_int c1 c2 dbg = - match c1, c2 with - | c1, Cconst_int (n2, _) when n2 <> min_int -> add_const c1 (-n2) dbg - | c1, Cop (Caddi, [c2; Cconst_int (n2, _)], _) when n2 <> min_int -> - add_const (sub_int c1 c2 dbg) (-n2) dbg - | Cop (Caddi, [c1; Cconst_int (n1, _)], _), c2 -> - add_const (sub_int c1 c2 dbg) n1 dbg - | c1, c2 -> Cop (Csubi, [c1; c2], dbg) + map_tail2 c1 c2 ~f:(fun c1 c2 -> + match c1, c2 with + | c1, Cconst_int (n2, _) when n2 <> min_int -> add_const c1 (-n2) dbg + | c1, Cop (Caddi, [c2; Cconst_int (n2, _)], _) when n2 <> min_int -> + add_const (sub_int c1 c2 dbg) (-n2) dbg + | Cop (Caddi, [c1; Cconst_int (n1, _)], _), c2 -> + add_const (sub_int c1 c2 dbg) n1 dbg + | c1, c2 -> Cop (Csubi, [c1; c2], dbg)) let neg_int c dbg = sub_int (Cconst_int (0, dbg)) c dbg -let 
rec lsl_int c1 c2 dbg = - match c1, c2 with - | c1, Cconst_int (0, _) -> c1 - | Cop (Clsl, [c; Cconst_int (n1, _)], _), Cconst_int (n2, _) - when n1 > 0 && n2 > 0 && n1 + n2 < size_int * 8 -> - Cop (Clsl, [c; Cconst_int (n1 + n2, dbg)], dbg) - | Cop (Caddi, [c1; Cconst_int (n1, _)], _), Cconst_int (n2, _) - when Misc.no_overflow_lsl n1 n2 -> - add_const (lsl_int c1 c2 dbg) (n1 lsl n2) dbg - | _, _ -> Cop (Clsl, [c1; c2], dbg) +(** This function conservatively approximates the number of significant bits in its signed + argument. That is, it computes the number of bits required to represent the absolute + value of its argument. *) +let rec max_signed_bit_length e = + match prefer_or e with + | Cop ((Ccmpi _ | Ccmpf _), _, _) -> + (* integer/float comparisons return either [1] or [0]. *) + 1 + | Cop (Cand, [_; Cconst_int (n, _)], _) when n > 0 -> 1 + Misc.log2 n + | Cop (Clsl, [c; Cconst_int (n, _)], _) when is_defined_shift n -> + Int.min arch_bits (max_signed_bit_length c + n) + | Cop (Casr, [c; Cconst_int (n, _)], _) when is_defined_shift n -> + Int.max 0 (max_signed_bit_length c - n) + | Cop (Clsr, [c; Cconst_int (n, _)], _) when is_defined_shift n -> + if n = 0 then max_signed_bit_length c else arch_bits - n + | Cop ((Cand | Cor | Cxor), [x; y], _) -> + Int.max (max_signed_bit_length x) (max_signed_bit_length y) + | _ -> arch_bits + +let ignore_low_bit_int = function + | Cop + ( Caddi, + [(Cop (Clsl, [_; Cconst_int (n, _)], _) as c); Cconst_int (1, _)], + _ ) + when n > 0 && is_defined_shift n -> + c + | Cop (Cor, [c; Cconst_int (1, _)], _) -> c + | c -> c + +let[@inline] get_const = function + | Cconst_int (i, _) -> Some (Nativeint.of_int i) + | Cconst_natint (i, _) -> Some i + | _ -> None -let lsl_const c n dbg = lsl_int c (Cconst_int (n, dbg)) dbg +let[@inline] const_exn = function + | Cconst_int (i, _) -> Nativeint.of_int i + | Cconst_natint (i, _) -> i + | _ -> Misc.fatal_error "const_exn: not a constant" + +let replace x ~with_ = + match x with + | Cconst_int _ | Cconst_natint _ | Cconst_symbol _ | Cvar _ | Ctuple [] -> + with_ + | inner -> Csequence (inner, with_) + +let rec xor_const e n dbg = + match n with + | 0n -> e + | n -> + map_tail1 e ~f:(fun e -> + match get_const e with + | Some e -> natint_const_untagged dbg (Nativeint.logxor e n) + | None -> ( + let[@local] default () = + (* prefer putting constants on the right *) + Cop (Cxor, [e; natint_const_untagged dbg n], dbg) + in + match e with + | Cop (Cxor, [x; y], _) -> ( + match get_const y with + | None -> default () + | Some y -> xor_const x (Nativeint.logxor y n) dbg) + | _ -> default ())) + +let rec or_const e n dbg = + match n with + | 0n -> e + | -1n -> replace e ~with_:(Cconst_int (-1, dbg)) + | n -> + map_tail1 e ~f:(fun e -> + let[@local] default () = + (* prefer putting constants on the right *) + Cop (Cor, [e; natint_const_untagged dbg n], dbg) + in + match get_const e with + | Some e -> natint_const_untagged dbg (Nativeint.logor e n) + | None -> ( + match e with + | Cop (Cor, [x; y], _) -> ( + match get_const y with + | None -> default () + | Some y -> or_const x (Nativeint.logor y n) dbg) + | _ -> default ())) + +let rec and_const e n dbg = + match n with + | 0n -> replace e ~with_:(Cconst_int (0, dbg)) + | -1n -> e + | n -> + map_tail1 e ~f:(fun e -> + match get_const e with + | Some e -> natint_const_untagged dbg (Nativeint.logand e n) + | None -> ( + let[@local] default () = + (* prefer putting constants on the right *) + Cop (Cand, [e; natint_const_untagged dbg n], dbg) + in + match e with + | Cop (Cand, 
[x; y], dbg) -> ( + match get_const y with + | Some y -> and_const x (Nativeint.logand y n) dbg + | None -> default ()) + | Cop (Cload { memory_chunk; mutability; is_atomic }, args, dbg) -> ( + let[@local] load memory_chunk = + Cop (Cload { memory_chunk; mutability; is_atomic }, args, dbg) + in + match memory_chunk, n with + | (Byte_signed | Byte_unsigned), 0xffn -> load Byte_unsigned + | (Sixteen_signed | Sixteen_unsigned), 0xffffn -> + load Sixteen_unsigned + | (Thirtytwo_signed | Thirtytwo_unsigned), 0xffff_ffffn -> + load Thirtytwo_unsigned + | _ -> default ()) + | _ -> default ())) + +let xor_int c1 c2 dbg = + map_tail2 c1 c2 ~f:(fun c1 c2 -> + match get_const c1, get_const c2 with + | Some c1, Some c2 -> natint_const_untagged dbg (Nativeint.logxor c1 c2) + | None, Some c2 -> xor_const c1 c2 dbg + | Some c1, None -> xor_const c2 c1 dbg + | None, None -> Cop (Cxor, [c1; c2], dbg)) + +let or_int c1 c2 dbg = + map_tail2 c1 c2 ~f:(fun c1 c2 -> + match get_const c1, get_const c2 with + | Some c1, Some c2 -> natint_const_untagged dbg (Nativeint.logor c1 c2) + | None, Some c2 -> or_const c1 c2 dbg + | Some c1, None -> or_const c2 c1 dbg + | None, None -> Cop (Cor, [c1; c2], dbg)) + +let and_int c1 c2 dbg = + map_tail2 c1 c2 ~f:(fun c1 c2 -> + match get_const c1, get_const c2 with + | Some c1, Some c2 -> natint_const_untagged dbg (Nativeint.logand c1 c2) + | None, Some c2 -> and_const c1 c2 dbg + | Some c1, None -> and_const c2 c1 dbg + | None, None -> Cop (Cand, [c1; c2], dbg)) + +let rec lsr_int c1 c2 dbg = + map_tail2 c1 c2 ~f:(fun c1 c2 -> + match c1, c2 with + | c1, Cconst_int (0, _) -> c1 + | c1, Cconst_int (n, _) when is_defined_shift n -> ( + let c1 = ignore_low_bit_int c1 in + match get_const c1 with + | Some x -> + natint_const_untagged dbg (Nativeint.shift_right_logical x n) + | None -> ( + match prefer_or c1 with + | Cop (Clsr, [inner; Cconst_int (n', _)], _) when is_defined_shift n' + -> + if is_defined_shift (n + n') + then lsr_const inner (n + n') dbg + else replace inner ~with_:(Cconst_int (0, dbg)) + | Cop ((Cor | Cxor), [x; ((Cconst_int _ | Cconst_natint _) as y)], _) + when Nativeint.shift_right_logical (const_exn y) n = 0n -> + lsr_int x c2 dbg + | Cop (Cand, [x; ((Cconst_int _ | Cconst_natint _) as y)], _) + when Nativeint.shift_right (const_exn y) n = 0n -> + replace x ~with_:(Cconst_int (0, dbg)) + | _ -> Cop (Clsr, [c1; c2], dbg))) + | Cop (Clsr, [x; (Cconst_int (n', _) as y)], dbg'), c2 + when is_defined_shift n' -> + (* prefer putting the constant shift on the outside to help enable + further peephole optimizations *) + Cop (Clsr, [Cop (Clsr, [x; c2], dbg); y], dbg') + | c1, c2 -> Cop (Clsr, [c1; c2], dbg)) + +and asr_int c1 c2 dbg = + map_tail2 c1 c2 ~f:(fun c1 c2 -> + match c1, c2 with + | c1, Cconst_int (0, _) -> c1 + | c1, Cconst_int (n, _) when is_defined_shift n -> ( + let c1 = ignore_low_bit_int c1 in + match get_const c1 with + | Some x -> natint_const_untagged dbg (Nativeint.shift_right x n) + | None -> ( + match prefer_or c1 with + | Cop (Casr, [inner; Cconst_int (n', _)], _) when is_defined_shift n' + -> + (* saturating add, since the sign bit extends to the left. 
This is + different from the logical shifts because arithmetic shifting + [arch_bits] times or more is the same as shifting [arch_bits - 1] + times *) + asr_const inner (Int.min (n + n') (arch_bits - 1)) dbg + | Cop (Clsr, [_; Cconst_int (n', _)], _) + when n' > 0 && is_defined_shift n' -> + (* If the argument is guaranteed non-negative, then we know the sign + bit is 0 and we can weaken this operation to a logical shift *) + lsr_const c1 n dbg + | Cop (Clsl, [c; Cconst_int (x, _)], _) + when is_defined_shift x && max_signed_bit_length c + x < arch_bits + -> + (* some operations always return small enough integers that it is + safe and correct to combine [asr (lsl x y) z] into [asr x (z - + y)]. *) + if x > n then lsl_const c (x - n) dbg else asr_const c (n - x) dbg + | Cop ((Cor | Cxor), [x; ((Cconst_int _ | Cconst_natint _) as y)], _) + when Nativeint.shift_right (const_exn y) n = 0n -> + asr_int x c2 dbg + | Cop (Cor, [x; ((Cconst_int _ | Cconst_natint _) as y)], _) + when Nativeint.shift_right (const_exn y) n = -1n -> + replace x ~with_:(Cconst_int (-1, dbg)) + | Cop (Cand, [x; ((Cconst_int _ | Cconst_natint _) as y)], _) + when Nativeint.shift_right (const_exn y) n = -1n -> + asr_int x c2 dbg + | Cop (Cand, [x; ((Cconst_int _ | Cconst_natint _) as y)], _) + when Nativeint.shift_right (const_exn y) n = 0n -> + replace x ~with_:(Cconst_int (0, dbg)) + | _ -> Cop (Casr, [c1; c2], dbg))) + | Cop (Casr, [x; (Cconst_int (n', _) as y)], z), c2 + when is_defined_shift n' -> + (* prefer putting the constant shift on the outside to help enable + further peephole optimizations *) + Cop (Casr, [Cop (Casr, [x; c2], dbg); y], z) + | _ -> Cop (Casr, [c1; c2], dbg)) + +and lsl_int c1 c2 dbg = + map_tail2 c1 c2 ~f:(fun c1 c2 -> + match c1, c2 with + | c1, Cconst_int (0, _) -> c1 + | c1, Cconst_int (n, _) when is_defined_shift n -> ( + match get_const c1 with + | Some c1 -> natint_const_untagged dbg (Nativeint.shift_left c1 n) + | None -> ( + match c1 with + | Cop (Clsl, [inner; Cconst_int (n', _)], dbg) + when is_defined_shift n' -> + if is_defined_shift (n + n') + then lsl_const inner (n + n') dbg + else replace inner ~with_:(Cconst_int (0, dbg)) + | Cop (Caddi, [c1; Cconst_int (offset, _)], _) + when Misc.no_overflow_lsl offset n -> + add_const (lsl_int c1 c2 dbg) (offset lsl n) dbg + | Cop ((Cor | Cxor), [x; ((Cconst_int _ | Cconst_natint _) as y)], _) + when Nativeint.shift_left (const_exn y) n = 0n -> + lsl_int x c2 dbg + | Cop (Cand, [x; ((Cconst_int _ | Cconst_natint _) as y)], _) + when Nativeint.shift_left (const_exn y) n = 0n -> + replace x ~with_:(Cconst_int (0, dbg)) + | c1 -> Cop (Clsl, [c1; c2], dbg))) + | Cop (Clsl, [x; (Cconst_int (n', _) as y)], dbg'), c2 + when is_defined_shift n' -> + (* prefer putting the constant shift on the outside to help enable + further peephole optimizations *) + Cop (Clsl, [Cop (Clsl, [x; c2], dbg); y], dbg') + | _, _ -> Cop (Clsl, [c1; c2], dbg)) + +and lsl_const c n dbg = lsl_int c (Cconst_int (n, dbg)) dbg + +and asr_const c n dbg = asr_int c (Cconst_int (n, dbg)) dbg + +and lsr_const c n dbg = lsr_int c (Cconst_int (n, dbg)) dbg let is_power2 n = n = 1 lsl Misc.log2 n @@ -392,74 +678,28 @@ let rec mul_int c1 c2 dbg = add_const (mul_int c (Cconst_int (k, dbg)) dbg) (n * k) dbg | c1, c2 -> Cop (Cmuli, [c1; c2], dbg) -(* identify cmm operations whose result is guaranteed to be small integers (e.g. 
- in the range [min_int / 4; max_int / 4]) *) -let guaranteed_to_be_small_int = function - | Cop ((Ccmpi _ | Ccmpf _), _, _) -> - (* integer/float comparisons return either [1] or [0]. *) - true - | _ -> false - -let ignore_low_bit_int = function - | Cop - ( Caddi, - [(Cop (Clsl, [_; Cconst_int (n, _)], _) as c); Cconst_int (1, _)], - _ ) - when n > 0 -> - c - | Cop (Cor, [c; Cconst_int (1, _)], _) -> c - | c -> c - -let lsr_int c1 c2 dbg = - match c1, c2 with - | c1, Cconst_int (0, _) -> c1 - | Cop (Clsr, [c; Cconst_int (n1, _)], _), Cconst_int (n2, _) - when n1 > 0 && n2 > 0 && n1 + n2 < size_int * 8 -> - Cop (Clsr, [c; Cconst_int (n1 + n2, dbg)], dbg) - | c1, Cconst_int (n, _) when n > 0 -> - Cop (Clsr, [ignore_low_bit_int c1; c2], dbg) - | _ -> Cop (Clsr, [c1; c2], dbg) - -let lsr_const c n dbg = lsr_int c (Cconst_int (n, dbg)) dbg - -let asr_int c1 c2 dbg = - match c2 with - | Cconst_int (0, _) -> c1 - | Cconst_int (n, _) when n > 0 -> ( - match ignore_low_bit_int c1 with - (* some operations always return small enough integers that it is safe and - correct to optimise [asr (lsl x 1) 1] into [x]. *) - | Cop (Clsl, [c; Cconst_int (1, _)], _) - when n = 1 && guaranteed_to_be_small_int c -> - c - | c1' -> Cop (Casr, [c1'; c2], dbg)) - | _ -> Cop (Casr, [c1; c2], dbg) - -let asr_const c n dbg = asr_int c (Cconst_int (n, dbg)) dbg - let tag_int i dbg = match i with | Cconst_int (n, _) -> int_const dbg n - | Cop (Casr, [c; Cconst_int (n, _)], _) when n > 0 -> - Cop - (Cor, [asr_int c (Cconst_int (n - 1, dbg)) dbg; Cconst_int (1, dbg)], dbg) - | c -> incr_int (lsl_int c (Cconst_int (1, dbg)) dbg) dbg + | c -> incr_int (lsl_const c 1 dbg) dbg let untag_int i dbg = match i with | Cconst_int (n, _) -> Cconst_int (n asr 1, dbg) | Cop (Cor, [Cop (Casr, [c; Cconst_int (n, _)], _); Cconst_int (1, _)], _) - when n > 0 && n < (size_int * 8) - 1 -> - Cop (Casr, [c; Cconst_int (n + 1, dbg)], dbg) + when n > 0 && is_defined_shift (n + 1) -> + asr_const c (n + 1) dbg | Cop (Cor, [Cop (Clsr, [c; Cconst_int (n, _)], _); Cconst_int (1, _)], _) - when n > 0 && n < (size_int * 8) - 1 -> - Cop (Clsr, [c; Cconst_int (n + 1, dbg)], dbg) - | c -> asr_int c (Cconst_int (1, dbg)) dbg + when n > 0 && is_defined_shift (n + 1) -> + lsr_const c (n + 1) dbg + | c -> asr_const c 1 dbg let mk_not dbg cmm = match cmm with - | Cop (Caddi, [Cop (Clsl, [c; Cconst_int (1, _)], _); Cconst_int (1, _)], dbg') - -> ( + | Cop + ( (Caddi | Cor), + [Cop (Clsl, [c; Cconst_int (1, _)], _); Cconst_int (1, _)], + dbg' ) -> ( match c with | Cop (Ccmpi cmp, [c1; c2], dbg'') -> tag_int @@ -793,8 +1033,10 @@ let mod_int ?dividend_cannot_be_min_int c1 c2 dbg = let test_bool dbg cmm = match cmm with - | Cop (Caddi, [Cop (Clsl, [c; Cconst_int (1, _)], _); Cconst_int (1, _)], _) - -> + | Cop + ( (Caddi | Cor), + [Cop (Clsl, [c; Cconst_int (1, _)], _); Cconst_int (1, _)], + _ ) -> c | Cconst_int (n, dbg) -> if n = 1 then Cconst_int (0, dbg) else Cconst_int (1, dbg) @@ -1029,8 +1271,10 @@ let array_indexing ?typ log2size ptr ofs dbg = if i = 0 then ptr else Cop (add, [ptr; Cconst_int (i lsl log2size, dbg)], dbg) - | Cop (Caddi, [Cop (Clsl, [c; Cconst_int (1, _)], _); Cconst_int (1, _)], dbg') - -> + | Cop + ( (Caddi | Cor), + [Cop (Clsl, [c; Cconst_int (1, _)], _); Cconst_int (1, _)], + dbg' ) -> Cop (add, [ptr; lsl_const c log2size dbg], dbg') | Cop (Caddi, [c; Cconst_int (n, _)], dbg') when log2size = 0 -> Cop @@ -1293,11 +1537,11 @@ let rec low_bits ~bits ~dbg x = then x else let unused_bits = arch_bits - bits in - let does_mask_keep_low_bits 
test_mask = + let does_mask_keep_low_bits mask = (* If the mask has all the low bits set, then the low bits are unchanged. This could happen from zero-extension. *) - let mask = Nativeint.pred (Nativeint.shift_left 1n bits) in - Nativeint.equal mask (Nativeint.logand test_mask mask) + let low_bits = Nativeint.pred (Nativeint.shift_left 1n bits) in + Nativeint.equal low_bits (Nativeint.logand mask low_bits) in (* Ignore sign and zero extensions which do not affect the low bits *) map_tail @@ -1344,7 +1588,7 @@ let zero_extend ~bits ~dbg e = | e -> zero_extend_via_mask e) (low_bits ~bits e ~dbg) -let sign_extend ~bits ~dbg e = +let rec sign_extend ~bits ~dbg e = assert (0 < bits && bits <= arch_bits); let unused_bits = arch_bits - bits in let sign_extend_via_shift e = @@ -1354,11 +1598,24 @@ let sign_extend ~bits ~dbg e = then e else map_tail - (function - | Cop ((Casr | Clsr), [inner; Cconst_int (n, _)], _) as e - when 0 <= n && n < arch_bits -> + (fun e -> + match prefer_or e with + | Cop (Cand, [x; y], _) when is_constant y -> + and_int (sign_extend ~bits x ~dbg) (sign_extend ~bits y ~dbg) dbg + | Cop (Cor, [x; y], _) when is_constant y -> + or_int (sign_extend ~bits x ~dbg) (sign_extend ~bits y ~dbg) dbg + | Cop (Cxor, [x; y], _) when is_constant y -> + xor_int (sign_extend ~bits x ~dbg) (sign_extend ~bits y ~dbg) dbg + | Cop (((Casr | Clsr) as op), [inner; Cconst_int (n, _)], _) as e + when is_defined_shift n -> (* see middle_end/flambda2/z3/sign_extension.py for proof *) - if n > unused_bits + if n = unused_bits + then + match op with + | Casr -> e + | Clsr -> asr_const inner unused_bits dbg + | _ -> assert false + else if n > unused_bits then (* sign-extension is a no-op since the top n bits already match *) e @@ -1961,21 +2218,6 @@ let bigarray_word_kind : Lambda.bigarray_kind -> memory_chunk = function | Pbigarray_complex32 -> Single { reg = Float64 } | Pbigarray_complex64 -> Double -let and_int e1 e2 dbg = - let is_mask32 = function - | Cconst_natint (0xFFFF_FFFFn, _) -> true - | Cconst_int (n, _) -> Nativeint.of_int n = 0xFFFF_FFFFn - | _ -> false - in - match e1, e2 with - | e, m when is_mask32 m -> zero_extend ~bits:32 e ~dbg - | m, e when is_mask32 m -> zero_extend ~bits:32 e ~dbg - | e1, e2 -> Cop (Cand, [e1; e2], dbg) - -let or_int e1 e2 dbg = Cop (Cor, [e1; e2], dbg) - -let xor_int e1 e2 dbg = Cop (Cxor, [e1; e2], dbg) - (* Boxed integers *) let operations_boxed_int (bi : Primitive.boxed_integer) = diff --git a/middle_end/flambda2/z3/sign_extension.py b/middle_end/flambda2/z3/sign_extension.py index 6613558072d..272f4a46c71 100755 --- a/middle_end/flambda2/z3/sign_extension.py +++ b/middle_end/flambda2/z3/sign_extension.py @@ -29,10 +29,15 @@ def reference_sign_extend(self, bits): def experimental_sign_extend(self, bits) -> BitVecRef: unused_bits = self.size() - bits + return If( - self.shift_right > unused_bits, - self.as_ast(), - (self.inner << (unused_bits - self.shift_right)) >> unused_bits, + self.shift_right == unused_bits, + If(self.arith, self.as_ast(), self.inner >> unused_bits), + If( + self.shift_right > unused_bits, + self.as_ast(), + (self.inner << (unused_bits - self.shift_right)) >> unused_bits, + ), )
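
Note (not part of the patch): the rewrites in this series lean on a handful of bitvector identities. Below is a quick standalone z3 sketch of the main ones, assuming the z3-solver Python package and a 64-bit word (arch_bits = 64); the concrete shift counts and the bits = 32 choice are illustrative only and are not taken from the patch itself.

    from z3 import BitVec, LShR, prove

    W = 64
    x = BitVec("x", W)

    # 1. Tagging: when the low bit is known clear (x shifted left), adding 1
    #    and or-ing 1 coincide; this is the fact behind
    #    can_interchange_add_with_or and the (Caddi | Cor) patterns.
    prove((x << 1) + 1 == ((x << 1) | 1))

    # 2. Constant logical shifts compose, as lsr_int does when folding nested
    #    Clsr nodes (3 + 5 = 8 is still a defined shift).
    prove(LShR(LShR(x, 3), 5) == LShR(x, 8))

    # 3. Constant arithmetic shifts compose with saturation at arch_bits - 1,
    #    matching the "saturating add" case in asr_int (40 + 40 saturates
    #    to 63).  Note that >> is arithmetic shift right in z3py.
    prove((x >> 40) >> 40 == x >> 63)

    # 4. The new sign_extend case: when a logical right shift count equals the
    #    number of unused bits, sign-extending the low bits equals a single
    #    arithmetic shift by that count (the shift_right == unused_bits branch
    #    added to sign_extension.py).
    BITS = 32
    UNUSED = W - BITS
    low = LShR(x, UNUSED)                   # Clsr by unused_bits
    reference = (low << UNUSED) >> UNUSED   # reference sign extension of low 32 bits
    prove(reference == x >> UNUSED)         # asr_const inner unused_bits

Each call prints "proved".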