From db5e1b1890ab06e1514bfd7356ead51ad4240977 Mon Sep 17 00:00:00 2001 From: Kuai Wei Date: Mon, 30 Dec 2024 10:54:44 +0800 Subject: [PATCH] [JIT] Backport 8318446: C2: optimize stores into primitive arrays by combining values into larger store Summary: include these patches for merge stores optimization 8318446: C2: optimize stores into primitive arrays by combining values into larger store 8319690: [AArch64] C2 compilation hits offset_ok_for_immed: assert "c2 compiler bug" 8335390: C2 MergeStores: wrong result with Unsafe 8331311: C2: Big Endian Port of 8318446: optimize stores into primitive arrays by combining values into larger store 8331085: Crash in MergePrimitiveArrayStores::is_compatible_store() 8331252: C2: MergeStores: handle negative shift values 8331054: C2 MergeStores: assert failed: unexpected basic type after JDK-8318446 and JDK-8329555 8335392: C2 MergeStores: enhanced pointer parsing 8334342: Add MergeStore JMH benchmarks 8226411: C2: Avoid memory barriers around off-heap unsafe accesses Fix is_ConI() query after port 8318446 Fix for comments Testing: CI/CD Reviewers: zhuoren.wz, MaxXSoft Issue: https://github.com/dragonwell-project/dragonwell11/issues/920 Fix for comments Add missing assertion in round_down_power_of_2() --- .../share/compiler/compilerDirectives.hpp | 1 + src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp | 5 +- .../share/gc/shared/c2/barrierSetC2.cpp | 16 +- src/hotspot/share/oops/accessDecorators.hpp | 4 +- src/hotspot/share/opto/addnode.cpp | 4 +- src/hotspot/share/opto/addnode.hpp | 2 +- src/hotspot/share/opto/c2_globals.hpp | 6 + src/hotspot/share/opto/connode.hpp | 5 +- src/hotspot/share/opto/library_call.cpp | 12 +- src/hotspot/share/opto/memnode.cpp | 563 ++++++++ src/hotspot/share/opto/mempointer.cpp | 383 ++++++ src/hotspot/share/opto/mempointer.hpp | 618 +++++++++ src/hotspot/share/opto/noOverflowInt.hpp | 114 ++ src/hotspot/share/opto/node.hpp | 4 + src/hotspot/share/opto/phaseX.cpp | 10 +- .../gtest/opto/test_no_overflow_int.cpp | 175 +++ .../c2/TestMergeStoresNullAdrType.java | 56 + .../c2/TestMergeStoresUnsafeArrayPointer.java | 324 +++++ .../compiler/c2/TestUnalignedAccess.java | 172 +++ .../bench/vm/compiler/MergeStoreBench.java | 1132 +++++++++++++++++ .../bench/vm/compiler/MergeStores.java | 780 ++++++++++++ 21 files changed, 4368 insertions(+), 18 deletions(-) create mode 100644 src/hotspot/share/opto/mempointer.cpp create mode 100644 src/hotspot/share/opto/mempointer.hpp create mode 100644 src/hotspot/share/opto/noOverflowInt.hpp create mode 100644 test/hotspot/gtest/opto/test_no_overflow_int.cpp create mode 100644 test/hotspot/jtreg/compiler/c2/TestMergeStoresNullAdrType.java create mode 100644 test/hotspot/jtreg/compiler/c2/TestMergeStoresUnsafeArrayPointer.java create mode 100644 test/hotspot/jtreg/compiler/c2/TestUnalignedAccess.java create mode 100644 test/micro/org/openjdk/bench/vm/compiler/MergeStoreBench.java create mode 100644 test/micro/org/openjdk/bench/vm/compiler/MergeStores.java diff --git a/src/hotspot/share/compiler/compilerDirectives.hpp b/src/hotspot/share/compiler/compilerDirectives.hpp index e2861bb6044..7e0bc7acb01 100644 --- a/src/hotspot/share/compiler/compilerDirectives.hpp +++ b/src/hotspot/share/compiler/compilerDirectives.hpp @@ -61,6 +61,7 @@ cflags(PrintIntrinsics, bool, PrintIntrinsics, PrintIntrinsics) \ NOT_PRODUCT(cflags(TraceOptoPipelining, bool, TraceOptoPipelining, TraceOptoPipelining)) \ NOT_PRODUCT(cflags(TraceOptoOutput, bool, TraceOptoOutput, TraceOptoOutput)) \ +NOT_PRODUCT(cflags(TraceMergeStores, 
bool, TraceMergeStores, TraceMergeStores)) \ cflags(TraceSpilling, bool, TraceSpilling, TraceSpilling) \ cflags(Vectorize, bool, false, Vectorize) \ cflags(VectorizeDebug, uintx, 0, VectorizeDebug) \ diff --git a/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp b/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp index 8d97939a459..d6e4b363323 100644 --- a/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp +++ b/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp @@ -607,12 +607,15 @@ Node* G1BarrierSetC2::load_at_resolved(C2Access& access, const Type* val_type) c Node* adr = access.addr().node(); Node* obj = access.base(); + bool anonymous = (decorators & C2_UNSAFE_ACCESS) != 0; bool mismatched = (decorators & C2_MISMATCHED) != 0; bool unknown = (decorators & ON_UNKNOWN_OOP_REF) != 0; bool in_heap = (decorators & IN_HEAP) != 0; + bool in_native = (decorators & IN_NATIVE) != 0; bool on_weak = (decorators & ON_WEAK_OOP_REF) != 0; bool is_unordered = (decorators & MO_UNORDERED) != 0; - bool need_cpu_mem_bar = !is_unordered || mismatched || !in_heap; + bool is_mixed = !in_heap && !in_native; + bool need_cpu_mem_bar = !is_unordered || mismatched || is_mixed; Node* offset = adr->is_AddP() ? adr->in(AddPNode::Offset) : kit->top(); Node* load = CardTableBarrierSetC2::load_at_resolved(access, val_type); diff --git a/src/hotspot/share/gc/shared/c2/barrierSetC2.cpp b/src/hotspot/share/gc/shared/c2/barrierSetC2.cpp index 5452756444c..1a8021b4561 100644 --- a/src/hotspot/share/gc/shared/c2/barrierSetC2.cpp +++ b/src/hotspot/share/gc/shared/c2/barrierSetC2.cpp @@ -40,13 +40,15 @@ void* C2Access::barrier_set_state() const { } bool C2Access::needs_cpu_membar() const { - bool mismatched = (_decorators & C2_MISMATCHED) != 0; + bool mismatched = (_decorators & C2_MISMATCHED) != 0; bool is_unordered = (_decorators & MO_UNORDERED) != 0; bool anonymous = (_decorators & C2_UNSAFE_ACCESS) != 0; - bool in_heap = (_decorators & IN_HEAP) != 0; + bool in_heap = (_decorators & IN_HEAP) != 0; + bool in_native = (_decorators & IN_NATIVE) != 0; + bool is_mixed = !in_heap && !in_native; - bool is_write = (_decorators & C2_WRITE_ACCESS) != 0; - bool is_read = (_decorators & C2_READ_ACCESS) != 0; + bool is_write = (_decorators & C2_WRITE_ACCESS) != 0; + bool is_read = (_decorators & C2_READ_ACCESS) != 0; bool is_atomic = is_read && is_write; if (is_atomic) { @@ -60,9 +62,11 @@ bool C2Access::needs_cpu_membar() const { // the barriers get omitted and the unsafe reference begins to "pollute" // the alias analysis of the rest of the graph, either Compile::can_alias // or Compile::must_alias will throw a diagnostic assert.) 
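The key new case is the "mixed" access: the compiler can prove neither IN_HEAP nor IN_NATIVE, so the access may touch either kind of memory at runtime. A minimal sketch of the classification this patch introduces (hypothetical standalone helper; the real checks are inlined in C2Access::needs_cpu_membar below):

    // Mixed = neither IN_HEAP nor IN_NATIVE could be proven, i.e. the base may
    // be a heap oop or null (off-heap) at runtime. Only mixed unsafe accesses
    // still need the conservative CPU membar; unordered, non-mismatched
    // accesses that are provably on-heap or provably off-heap can skip it
    // (JDK-8226411).
    static bool is_mixed_access(DecoratorSet decorators) {
      const bool in_heap   = (decorators & IN_HEAP)   != 0;
      const bool in_native = (decorators & IN_NATIVE) != 0;
      return !in_heap && !in_native;
    }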
- if (!in_heap || !is_unordered || (mismatched && !_addr.type()->isa_aryptr())) { + if (is_mixed || !is_unordered || (mismatched && !_addr.type()->isa_aryptr())) { return true; } + } else { + assert(!is_mixed, "not unsafe"); } return false; @@ -78,7 +82,7 @@ Node* BarrierSetC2::store_at_resolved(C2Access& access, C2AccessValue& val) cons bool requires_atomic_access = (decorators & MO_UNORDERED) == 0; bool in_native = (decorators & IN_NATIVE) != 0; - assert(!in_native, "not supported yet"); + assert(!in_native || (unsafe && !access.is_oop()), "not supported yet"); if (access.type() == T_DOUBLE) { Node* new_val = kit->dstore_rounding(val.node()); diff --git a/src/hotspot/share/oops/accessDecorators.hpp b/src/hotspot/share/oops/accessDecorators.hpp index ab27c7e5240..c6d83c6bbdc 100644 --- a/src/hotspot/share/oops/accessDecorators.hpp +++ b/src/hotspot/share/oops/accessDecorators.hpp @@ -174,11 +174,11 @@ const DecoratorSet ON_DECORATOR_MASK = ON_STRONG_OOP_REF | ON_WEAK_OOP_REF | ON_PHANTOM_OOP_REF | ON_UNKNOWN_OOP_REF; // === Access Location === -// Accesses can take place in, e.g. the heap, old or young generation and different native roots. +// Accesses can take place in, e.g. the heap, old or young generation, different native roots, or native memory off the heap. // The location is important to the GC as it may imply different actions. The following decorators are used: // * IN_HEAP: The access is performed in the heap. Many barriers such as card marking will // be omitted if this decorator is not set. -// * IN_NATIVE: The access is performed in an off-heap data structure pointing into the Java heap. +// * IN_NATIVE: The access is performed in an off-heap data structure. const DecoratorSet IN_HEAP = UCONST64(1) << 19; const DecoratorSet IN_NATIVE = UCONST64(1) << 20; const DecoratorSet IN_DECORATOR_MASK = IN_HEAP | IN_NATIVE; diff --git a/src/hotspot/share/opto/addnode.cpp b/src/hotspot/share/opto/addnode.cpp index bdde4fe8dfe..1450b4d15ea 100644 --- a/src/hotspot/share/opto/addnode.cpp +++ b/src/hotspot/share/opto/addnode.cpp @@ -704,9 +704,9 @@ Node* AddPNode::Ideal_base_and_offset(Node* ptr, PhaseTransform* phase, //------------------------------unpack_offsets---------------------------------- // Collect the AddP offset values into the elements array, giving up // if there are more than length. -int AddPNode::unpack_offsets(Node* elements[], int length) { +int AddPNode::unpack_offsets(Node* elements[], int length) const { int count = 0; - Node* addr = this; + Node const* addr = this; Node* base = addr->in(AddPNode::Base); while (addr->is_AddP()) { if (addr->in(AddPNode::Base) != base) { diff --git a/src/hotspot/share/opto/addnode.hpp b/src/hotspot/share/opto/addnode.hpp index 30319a7150c..1897d013a7a 100644 --- a/src/hotspot/share/opto/addnode.hpp +++ b/src/hotspot/share/opto/addnode.hpp @@ -154,7 +154,7 @@ class AddPNode : public Node { // Collect the AddP offset values into the elements array, giving up // if there are more than length. 
-  int unpack_offsets(Node* elements[], int length);
+  int unpack_offsets(Node* elements[], int length) const;
 
   // Do not match base-ptr edge
   virtual uint match_edge(uint idx) const;
diff --git a/src/hotspot/share/opto/c2_globals.hpp b/src/hotspot/share/opto/c2_globals.hpp
index ea5cd8299cd..363783e72d2 100644
--- a/src/hotspot/share/opto/c2_globals.hpp
+++ b/src/hotspot/share/opto/c2_globals.hpp
@@ -354,6 +354,12 @@
   notproduct(bool, TraceNewVectors, false,                                  \
           "Trace creation of Vector nodes")                                 \
                                                                             \
+  diagnostic(bool, MergeStores, true,                                       \
+          "Optimize stores by combining values into larger store")          \
+                                                                            \
+  develop(bool, TraceMergeStores, false,                                    \
+          "Trace creation of merged stores")                                \
+                                                                            \
   product_pd(bool, OptoBundling,                                            \
           "Generate nops to fill i-cache lines")                            \
                                                                             \
diff --git a/src/hotspot/share/opto/connode.hpp b/src/hotspot/share/opto/connode.hpp
index 1d175461e30..4171df8c175 100644
--- a/src/hotspot/share/opto/connode.hpp
+++ b/src/hotspot/share/opto/connode.hpp
@@ -39,6 +39,7 @@ class ConNode : public TypeNode {
   ConNode( const Type *t ) : TypeNode(t->remove_speculative(),1) {
     init_req(0, (Node*)Compile::current()->root());
     init_flags(Flag_is_Con);
+    init_class_id(Class_Con);
   }
   virtual int Opcode() const;
   virtual uint hash() const;
@@ -53,7 +54,9 @@ class ConNode : public TypeNode {
 // Simple integer constants
 class ConINode : public ConNode {
 public:
-  ConINode( const TypeInt *t ) : ConNode(t) {}
+  ConINode( const TypeInt *t ) : ConNode(t) {
+    init_class_id(Class_ConI);
+  }
   virtual int Opcode() const;
 
   // Factory method:
diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp
index 9fdc0835f65..d7b7540c75f 100644
--- a/src/hotspot/share/opto/library_call.cpp
+++ b/src/hotspot/share/opto/library_call.cpp
@@ -2201,10 +2201,14 @@ bool LibraryCallKit::inline_unsafe_access(bool is_store, const BasicType type, c
   offset = ConvL2X(offset);
   adr = make_unsafe_address(base, offset, type, kind == Relaxed);
 
-  if (_gvn.type(base)->isa_ptr() != TypePtr::NULL_PTR) {
-    heap_base_oop = base;
-  } else if (type == T_OBJECT) {
-    return false; // off-heap oop accesses are not supported
+  if (_gvn.type(base)->isa_ptr() == TypePtr::NULL_PTR) {
+    if (type != T_OBJECT) {
+      decorators |= IN_NATIVE; // off-heap primitive access
+    } else {
+      return false; // off-heap oop accesses are not supported
+    }
+  } else {
+    heap_base_oop = base; // on-heap or mixed access
   }
 
   // Can base be NULL? Otherwise, always on-heap access.
diff --git a/src/hotspot/share/opto/memnode.cpp b/src/hotspot/share/opto/memnode.cpp
index c33c488588a..bbd82b3deac 100644
--- a/src/hotspot/share/opto/memnode.cpp
+++ b/src/hotspot/share/opto/memnode.cpp
@@ -40,6 +40,7 @@
 #include "opto/machnode.hpp"
 #include "opto/matcher.hpp"
 #include "opto/memnode.hpp"
+#include "opto/mempointer.hpp"
 #include "opto/mulnode.hpp"
 #include "opto/narrowptrnode.hpp"
 #include "opto/phaseX.hpp"
@@ -2561,6 +2562,558 @@ uint StoreNode::hash() const {
   return NO_HASH;
 }
 
+// Link together multiple stores (B/S/C/I) into a longer one.
+//
+// Example: _store = StoreB[i+3]
+//
+//   RangeCheck[i+0]           RangeCheck[i+0]
+//   StoreB[i+0]
+//   RangeCheck[i+3]           RangeCheck[i+3]
+//   StoreB[i+1]         -->   pass:             fail:
+//   StoreB[i+2]               StoreI[i+0]       StoreB[i+0]
+//   StoreB[i+3]
+//
+// The 4 StoreB are merged into a single StoreI node. We have to be careful with RangeCheck[i+3]: before
+// the optimization, if this RangeCheck[i+3] fails, then we execute only StoreB[i+0], and then trap.
After +// the optimization, the new StoreI[i+0] is on the passing path of RangeCheck[i+3], and StoreB[i+0] on the +// failing path. +// +// Note: For normal array stores, every store at first has a RangeCheck. But they can be removed with: +// - RCE (RangeCheck Elimination): the RangeChecks in the loop are hoisted out and before the loop, +// and possibly no RangeChecks remain between the stores. +// - RangeCheck smearing: the earlier RangeChecks are adjusted such that they cover later RangeChecks, +// and those later RangeChecks can be removed. Example: +// +// RangeCheck[i+0] RangeCheck[i+0] <- before first store +// StoreB[i+0] StoreB[i+0] <- first store +// RangeCheck[i+1] --> smeared --> RangeCheck[i+3] <- only RC between first and last store +// StoreB[i+1] StoreB[i+1] <- second store +// RangeCheck[i+2] --> removed +// StoreB[i+2] StoreB[i+2] +// RangeCheck[i+3] --> removed +// StoreB[i+3] StoreB[i+3] <- last store +// +// Thus, it is a common pattern that between the first and last store in a chain +// of adjacent stores there remains exactly one RangeCheck, located between the +// first and the second store (e.g. RangeCheck[i+3]). +// +class MergePrimitiveStores : public StackObj { +private: + PhaseGVN* const _phase; + StoreNode* const _store; + + NOT_PRODUCT( const bool _trace;) + +public: + MergePrimitiveStores(PhaseGVN* phase, StoreNode* store) : + _phase(phase), _store(store) + NOT_PRODUCT( COMMA _trace(Compile::current()->directive()->TraceMergeStoresOption) ) + {} + + StoreNode* run(); + +private: + bool is_compatible_store(const StoreNode* other_store) const; + bool is_adjacent_pair(const StoreNode* use_store, const StoreNode* def_store) const; + bool is_adjacent_input_pair(const Node* n1, const Node* n2, const int memory_size) const; + static bool is_con_RShift(const Node* n, Node const*& base_out, jint& shift_out); + enum CFGStatus { CFG_SuccessNoRangeCheck, CFG_SuccessWithRangeCheck, CFG_Failure }; + static CFGStatus cfg_status_for_pair(const StoreNode* use_store, const StoreNode* def_store); + + class Status { + private: + StoreNode* _found_store; + bool _found_range_check; + + Status(StoreNode* found_store, bool found_range_check) + : _found_store(found_store), _found_range_check(found_range_check) {} + + public: + StoreNode* found_store() const { return _found_store; } + bool found_range_check() const { return _found_range_check; } + static Status make_failure() { return Status(NULL, false); } + + static Status make(StoreNode* found_store, const CFGStatus cfg_status) { + if (cfg_status == CFG_Failure) { + return Status::make_failure(); + } + return Status(found_store, cfg_status == CFG_SuccessWithRangeCheck); + } + +#ifndef PRODUCT + void print_on(outputStream* st) const { + if (_found_store == NULL) { + st->print_cr("None"); + } else { + st->print_cr("Found[%d %s, %s]", _found_store->_idx, _found_store->Name(), + _found_range_check ? 
"RC" : "no-RC"); + } + } +#endif + }; + + Status find_adjacent_use_store(const StoreNode* def_store) const; + Status find_adjacent_def_store(const StoreNode* use_store) const; + Status find_use_store(const StoreNode* def_store) const; + Status find_def_store(const StoreNode* use_store) const; + Status find_use_store_unidirectional(const StoreNode* def_store) const; + Status find_def_store_unidirectional(const StoreNode* use_store) const; + + void collect_merge_list(Node_List& merge_list) const; + Node* make_merged_input_value(const Node_List& merge_list); + StoreNode* make_merged_store(const Node_List& merge_list, Node* merged_input_value); + +#ifndef PRODUCT + bool is_trace_basic() const { + return _trace; + } + + bool is_trace_pointer() const { + return _trace; + } + + bool is_trace_aliasing() const { + return _trace; + } + + bool is_trace_adjacency() const { + return _trace; + } + + bool is_trace_success() const { + return _trace; + } + +#endif + + NOT_PRODUCT( void trace(const Node_List& merge_list, const Node* merged_input_value, const StoreNode* merged_store) const; ) +}; + +StoreNode* MergePrimitiveStores::run() { + // Check for B/S/C/I + int opc = _store->Opcode(); + if (opc != Op_StoreB && opc != Op_StoreC && opc != Op_StoreI) { + return NULL; + } + + NOT_PRODUCT( if (is_trace_basic()) { tty->print("[TraceMergeStores] MergePrimitiveStores::run: "); _store->dump(); }) + + // The _store must be the "last" store in a chain. If we find a use we could merge with + // then that use or a store further down is the "last" store. + Status status_use = find_adjacent_use_store(_store); + NOT_PRODUCT( if (is_trace_basic()) { tty->print("[TraceMergeStores] expect no use: "); status_use.print_on(tty); }) + if (status_use.found_store() != NULL) { + return NULL; + } + + // Check if we can merge with at least one def, so that we have at least 2 stores to merge. + Status status_def = find_adjacent_def_store(_store); + NOT_PRODUCT( if (is_trace_basic()) { tty->print("[TraceMergeStores] expect def: "); status_def.print_on(tty); }) + if (status_def.found_store() == NULL) { + return NULL; + } + + ResourceMark rm; + Node_List merge_list; + collect_merge_list(merge_list); + + Node* merged_input_value = make_merged_input_value(merge_list); + if (merged_input_value == NULL) { return NULL; } + + StoreNode* merged_store = make_merged_store(merge_list, merged_input_value); + + NOT_PRODUCT( if (is_trace_success()) { trace(merge_list, merged_input_value, merged_store); } ) + + return merged_store; +} + +// Check compatibility between _store and other_store. 
+bool MergePrimitiveStores::is_compatible_store(const StoreNode* other_store) const { + int opc = _store->Opcode(); + assert(opc == Op_StoreB || opc == Op_StoreC || opc == Op_StoreI, "precondition"); + + if (other_store == NULL || + _store->Opcode() != other_store->Opcode()) { + return false; + } + + return true; +} + +bool MergePrimitiveStores::is_adjacent_pair(const StoreNode* use_store, const StoreNode* def_store) const { + if (!is_adjacent_input_pair(def_store->in(MemNode::ValueIn), + use_store->in(MemNode::ValueIn), + def_store->memory_size())) { + return false; + } + + ResourceMark rm; +#ifndef PRODUCT + const TraceMemPointer trace(is_trace_pointer(), + is_trace_aliasing(), + is_trace_adjacency()); +#endif + const MemPointer pointer_use(use_store NOT_PRODUCT( COMMA trace )); + const MemPointer pointer_def(def_store NOT_PRODUCT( COMMA trace )); + return pointer_def.is_adjacent_to_and_before(pointer_use); +} + +bool MergePrimitiveStores::is_adjacent_input_pair(const Node* n1, const Node* n2, const int memory_size) const { + // Pattern: [n1 = ConI, n2 = ConI] + if (n1->Opcode() == Op_ConI) { + return n2->Opcode() == Op_ConI; + } + + // Pattern: [n1 = base >> shift, n2 = base >> (shift + memory_size)] +#ifndef VM_LITTLE_ENDIAN + // Pattern: [n1 = base >> (shift + memory_size), n2 = base >> shift] + // Swapping n1 with n2 gives same pattern as on little endian platforms. + swap(n1, n2); +#endif // !VM_LITTLE_ENDIAN + Node const* base_n2; + jint shift_n2; + if (!is_con_RShift(n2, base_n2, shift_n2)) { + return false; + } + if (n1->Opcode() == Op_ConvL2I) { + // look through + n1 = n1->in(1); + } + Node const* base_n1; + jint shift_n1; + if (n1 == base_n2) { + // n1 = base = base >> 0 + base_n1 = n1; + shift_n1 = 0; + } else if (!is_con_RShift(n1, base_n1, shift_n1)) { + return false; + } + int bits_per_store = memory_size * 8; + if (base_n1 != base_n2 || + shift_n1 + bits_per_store != shift_n2 || + shift_n1 % bits_per_store != 0) { + return false; + } + + // both load from same value with correct shift + return true; +} + +// Detect pattern: n = base_out >> shift_out +bool MergePrimitiveStores::is_con_RShift(const Node* n, Node const*& base_out, jint& shift_out) { + assert(n != NULL, "precondition"); + + int opc = n->Opcode(); + if (opc == Op_ConvL2I) { + n = n->in(1); + opc = n->Opcode(); + } + + if ((opc == Op_RShiftI || + opc == Op_RShiftL || + opc == Op_URShiftI || + opc == Op_URShiftL) && + n->in(2)->is_ConI()) { + base_out = n->in(1); + shift_out = n->in(2)->get_int(); + // The shift must be positive: + return shift_out >= 0; + } + return false; +} + +// Check if there is nothing between the two stores, except optionally a RangeCheck leading to an uncommon trap. +MergePrimitiveStores::CFGStatus MergePrimitiveStores::cfg_status_for_pair(const StoreNode* use_store, const StoreNode* def_store) { + assert(use_store->in(MemNode::Memory) == def_store, "use-def relationship"); + + Node* ctrl_use = use_store->in(MemNode::Control); + Node* ctrl_def = def_store->in(MemNode::Control); + if (ctrl_use == NULL || ctrl_def == NULL) { + return CFG_Failure; + } + + if (ctrl_use == ctrl_def) { + // Same ctrl -> no RangeCheck in between. + // Check: use_store must be the only use of def_store. + if (def_store->outcnt() > 1) { + return CFG_Failure; + } + return CFG_SuccessNoRangeCheck; + } + + // Different ctrl -> could have RangeCheck in between. + // Check: 1. def_store only has these uses: use_store and MergeMem for uncommon trap, and + // 2. ctrl separated by RangeCheck. 
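For illustration, the only accepted shape for the RangeCheck case looks roughly like this (control edges on the left, the second memory use of def_store on the right):

    //   ctrl_def                          def_store
    //      |                               |      |
    //   RangeCheck                    use_store  MergeMem
    //    |        \                               |
    //  ctrl_use    other_proj -------------> uncommon trap
    //  (IfProj)
    //      |
    //   use_store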
+ if (def_store->outcnt() != 2) { + return CFG_Failure; // Cannot have exactly these uses: use_store and MergeMem for uncommon trap. + } + int use_store_out_idx = def_store->raw_out(0) == use_store ? 0 : 1; + Node* merge_mem = def_store->raw_out(1 - use_store_out_idx)->isa_MergeMem(); + if (merge_mem == NULL || + merge_mem->outcnt() != 1) { + return CFG_Failure; // Does not have MergeMem for uncommon trap. + } + if (!ctrl_use->is_IfProj() || + !ctrl_use->in(0)->is_RangeCheck() || + ctrl_use->in(0)->outcnt() != 2) { + return CFG_Failure; // Not RangeCheck. + } + ProjNode* other_proj = ctrl_use->as_IfProj()->other_if_proj(); + Node* trap = other_proj->is_uncommon_trap_proj(Deoptimization::Reason_range_check); + if (trap != merge_mem->unique_out() || + ctrl_use->in(0)->in(0) != ctrl_def) { + return CFG_Failure; // Not RangeCheck with merge_mem leading to uncommon trap. + } + + return CFG_SuccessWithRangeCheck; +} + +MergePrimitiveStores::Status MergePrimitiveStores::find_adjacent_use_store(const StoreNode* def_store) const { + Status status_use = find_use_store(def_store); + StoreNode* use_store = status_use.found_store(); + if (use_store != NULL && !is_adjacent_pair(use_store, def_store)) { + return Status::make_failure(); + } + return status_use; +} + +MergePrimitiveStores::Status MergePrimitiveStores::find_adjacent_def_store(const StoreNode* use_store) const { + Status status_def = find_def_store(use_store); + StoreNode* def_store = status_def.found_store(); + if (def_store != NULL && !is_adjacent_pair(use_store, def_store)) { + return Status::make_failure(); + } + return status_def; +} + +MergePrimitiveStores::Status MergePrimitiveStores::find_use_store(const StoreNode* def_store) const { + Status status_use = find_use_store_unidirectional(def_store); + +#ifdef ASSERT + StoreNode* use_store = status_use.found_store(); + if (use_store != NULL) { + Status status_def = find_def_store_unidirectional(use_store); + assert(status_def.found_store() == def_store && + status_def.found_range_check() == status_use.found_range_check(), + "find_use_store and find_def_store must be symmetric"); + } +#endif + + return status_use; +} + +MergePrimitiveStores::Status MergePrimitiveStores::find_def_store(const StoreNode* use_store) const { + Status status_def = find_def_store_unidirectional(use_store); + +#ifdef ASSERT + StoreNode* def_store = status_def.found_store(); + if (def_store != NULL) { + Status status_use = find_use_store_unidirectional(def_store); + assert(status_use.found_store() == use_store && + status_use.found_range_check() == status_def.found_range_check(), + "find_use_store and find_def_store must be symmetric"); + } +#endif + + return status_def; +} + +MergePrimitiveStores::Status MergePrimitiveStores::find_use_store_unidirectional(const StoreNode* def_store) const { + assert(is_compatible_store(def_store), "precondition: must be compatible with _store"); + + for (DUIterator_Fast imax, i = def_store->fast_outs(imax); i < imax; i++) { + StoreNode* use_store = def_store->fast_out(i)->isa_Store(); + if (is_compatible_store(use_store)) { + return Status::make(use_store, cfg_status_for_pair(use_store, def_store)); + } + } + + return Status::make_failure(); +} + +MergePrimitiveStores::Status MergePrimitiveStores::find_def_store_unidirectional(const StoreNode* use_store) const { + assert(is_compatible_store(use_store), "precondition: must be compatible with _store"); + + StoreNode* def_store = use_store->in(MemNode::Memory)->isa_Store(); + if (!is_compatible_store(def_store)) { + return 
Status::make_failure(); + } + + return Status::make(def_store, cfg_status_for_pair(use_store, def_store)); +} + +static int round_down_power_of_2(uint value) { + assert(value > 0, "Invalid value"); + return 1 << log2_uint(value); +} + +void MergePrimitiveStores::collect_merge_list(Node_List& merge_list) const { + // The merged store can be at most 8 bytes. + const uint merge_list_max_size = 8 / _store->memory_size(); + assert(merge_list_max_size >= 2 && + merge_list_max_size <= 8 && + is_power_of_2(merge_list_max_size), + "must be 2, 4 or 8"); + + // Traverse up the chain of adjacent def stores. + StoreNode* current = _store; + merge_list.push(current); + while (current != NULL && merge_list.size() < merge_list_max_size) { + Status status = find_adjacent_def_store(current); + NOT_PRODUCT( if (is_trace_basic()) { tty->print("[TraceMergeStores] find def: "); status.print_on(tty); }) + + current = status.found_store(); + if (current != NULL) { + merge_list.push(current); + + // We can have at most one RangeCheck. + if (status.found_range_check()) { + NOT_PRODUCT( if (is_trace_basic()) { tty->print_cr("[TraceMergeStores] found RangeCheck, stop traversal."); }) + break; + } + } + } + + NOT_PRODUCT( if (is_trace_basic()) { tty->print_cr("[TraceMergeStores] found:"); merge_list.dump(); }) + + // Truncate the merge_list to a power of 2. + const uint pow2size = round_down_power_of_2(merge_list.size()); + assert(pow2size >= 2, "must be merging at least 2 stores"); + while (merge_list.size() > pow2size) { merge_list.pop(); } + + NOT_PRODUCT( if (is_trace_basic()) { tty->print_cr("[TraceMergeStores] truncated:"); merge_list.dump(); }) +} + +// Merge the input values of the smaller stores to a single larger input value. +Node* MergePrimitiveStores::make_merged_input_value(const Node_List& merge_list) { + int new_memory_size = _store->memory_size() * merge_list.size(); + Node* first = merge_list.at(merge_list.size()-1); + Node* merged_input_value = NULL; + if (_store->in(MemNode::ValueIn)->Opcode() == Op_ConI) { + // Pattern: [ConI, ConI, ...] 
-> new constant + jlong con = 0; + jlong bits_per_store = _store->memory_size() * 8; + jlong mask = (((jlong)1) << bits_per_store) - 1; + for (uint i = 0; i < merge_list.size(); i++) { + jlong con_i = merge_list.at(i)->in(MemNode::ValueIn)->get_int(); +#ifdef VM_LITTLE_ENDIAN + con = con << bits_per_store; + con = con | (mask & con_i); +#else // VM_LITTLE_ENDIAN + con_i = (mask & con_i) << (i * bits_per_store); + con = con | con_i; +#endif // VM_LITTLE_ENDIAN + } + merged_input_value = _phase->longcon(con); + } else { + // Pattern: [base >> 24, base >> 16, base >> 8, base] -> base + // | | + // _store first + // + Node* hi = _store->in(MemNode::ValueIn); + Node* lo = first->in(MemNode::ValueIn); +#ifndef VM_LITTLE_ENDIAN + // `_store` and `first` are swapped in the diagram above + swap(hi, lo); +#endif // !VM_LITTLE_ENDIAN + Node const* hi_base; + jint hi_shift; + merged_input_value = lo; + bool is_true = is_con_RShift(hi, hi_base, hi_shift); + assert(is_true, "must detect con RShift"); + if (merged_input_value != hi_base && merged_input_value->Opcode() == Op_ConvL2I) { + // look through + merged_input_value = merged_input_value->in(1); + } + if (merged_input_value != hi_base) { + // merged_input_value is not the base + return NULL; + } + } + + if (_phase->type(merged_input_value)->isa_long() != NULL && new_memory_size <= 4) { + // Example: + // + // long base = ...; + // a[0] = (byte)(base >> 0); + // a[1] = (byte)(base >> 8); + // + merged_input_value = _phase->transform(new ConvL2INode(merged_input_value)); + } + + assert((_phase->type(merged_input_value)->isa_int() != NULL && new_memory_size <= 4) || + (_phase->type(merged_input_value)->isa_long() != NULL && new_memory_size == 8), + "merged_input_value is either int or long, and new_memory_size is small enough"); + + return merged_input_value; +} + +// // +// first_ctrl first_mem first_adr first_ctrl first_mem first_adr // +// | | | | | | // +// | | | | +---------------+ | // +// | | | | | | | // +// | | +---------+ | | +---------------+ // +// | | | | | | | | // +// +--------------+ | | v1 +------------------------------+ | | v1 // +// | | | | | | | | | | | | // +// RangeCheck first_store RangeCheck | | first_store // +// | | | | | | | // +// last_ctrl | +----> unc_trap last_ctrl | | +----> unc_trap // +// | | ===> | | | // +// +--------------+ | a2 v2 | | | // +// | | | | | | | | // +// | second_store | | | // +// | | | | | [v1 v2 ... vn] // +// ... ... | | | | // +// | | | | | v // +// +--------------+ | an vn +--------------+ | | merged_input_value // +// | | | | | | | | // +// last_store (= _store) merged_store // +// // +StoreNode* MergePrimitiveStores::make_merged_store(const Node_List& merge_list, Node* merged_input_value) { + Node* first_store = merge_list.at(merge_list.size()-1); + Node* last_ctrl = _store->in(MemNode::Control); // after (optional) RangeCheck + Node* first_mem = first_store->in(MemNode::Memory); + Node* first_adr = first_store->in(MemNode::Address); + + const TypePtr* new_adr_type = _store->adr_type(); + + int new_memory_size = _store->memory_size() * merge_list.size(); + BasicType bt = T_ILLEGAL; + switch (new_memory_size) { + case 2: bt = T_SHORT; break; + case 4: bt = T_INT; break; + case 8: bt = T_LONG; break; + } + + StoreNode* merged_store = StoreNode::make(*_phase, last_ctrl, first_mem, first_adr, + new_adr_type, merged_input_value, bt, MemNode::unordered); + + // Marking the store mismatched is sufficient to prevent reordering, since array stores + // are all on the same slice. 
Hence, we need no barriers.
+  merged_store->set_mismatched_access();
+
+  // Constants above may now also be packed -> put candidate on worklist
+  _phase->is_IterGVN()->_worklist.push(first_mem);
+
+  return merged_store;
+}
+
+#ifndef PRODUCT
+void MergePrimitiveStores::trace(const Node_List& merge_list, const Node* merged_input_value, const StoreNode* merged_store) const {
+  stringStream ss;
+  ss.print_cr("[TraceMergeStores]: Replace");
+  for (int i = (int)merge_list.size() - 1; i >= 0; i--) {
+    merge_list.at(i)->dump("\n", false, &ss);
+  }
+  ss.print_cr("[TraceMergeStores]: with");
+  merged_input_value->dump("\n", false, &ss);
+  merged_store->dump("\n", false, &ss);
+  tty->print("%s", ss.as_string());
+}
+#endif
+
 //------------------------------Ideal------------------------------------------
 // Change back-to-back Store(, p, x) -> Store(m, p, y) to Store(m, p, x).
 // When a store immediately follows a relevant allocation/initialization,
@@ -2634,6 +3187,16 @@ Node *StoreNode::Ideal(PhaseGVN *phase, bool can_reshape) {
     }
   }
 
+  if (MergeStores && UseUnalignedAccesses) {
+    if (phase->C->post_loop_opts_phase()) {
+      MergePrimitiveStores merge(phase, this);
+      Node* progress = merge.run();
+      if (progress != NULL) { return progress; }
+    } else {
+      phase->C->record_for_post_loop_opts_igvn(this);
+    }
+  }
+
   return NULL; // No further progress
 }
 
diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp
new file mode 100644
index 00000000000..2a1921663d7
--- /dev/null
+++ b/src/hotspot/share/opto/mempointer.cpp
@@ -0,0 +1,383 @@
+/*
+ * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "opto/mempointer.hpp"
+#include "utilities/resourceHash.hpp"
+
+// Recursively parse the pointer expression with a DFS all-path traversal
+// (i.e. with node repetitions), starting at the pointer.
+MemPointerDecomposedForm MemPointerDecomposedFormParser::parse_decomposed_form() {
+  assert(_worklist.is_empty(), "no prior parsing");
+  assert(_summands.is_empty(), "no prior parsing");
+
+  Node* pointer = _mem->in(MemNode::Address);
+
+  // Start with the trivial summand.
+  _worklist.push(MemPointerSummand(pointer, NoOverflowInt(1)));
+
+  // Decompose the summands until only terminal summands remain. This effectively
+  // parses the pointer expression recursively.
+  int traversal_count = 0;
+  while (_worklist.is_nonempty()) {
+    // Bail out if the graph is too complex.
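The budget below is generous: a typical array address is fully decomposed within a handful of iterations. For example (a sketch with simplified node spellings; AddP base edges omitted):

    // adr = AddP(base, AddL(#16, LShiftL(ConvI2L(i), #2)))
    //
    // worklist: [1 * adr]
    //        -> [1 * base, 1 * AddL(#16, LShiftL(ConvI2L(i), #2))]  (AddP decomposed)
    //        -> [1 * base, 1 * #16, 1 * LShiftL(ConvI2L(i), #2)]    (AddL decomposed)
    //        -> [1 * base, 4 * ConvI2L(i)], con = 16                (shift folded into the scale)
    //        -> summands {1 * base, 4 * i}, con = 16                (ConvI2L looked through)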
+ if (traversal_count++ > 1000) { return MemPointerDecomposedForm::make_trivial(pointer); } + parse_sub_expression(_worklist.pop()); + } + + // Bail out if there is a constant overflow. + if (_con.is_NaN()) { return MemPointerDecomposedForm::make_trivial(pointer); } + + // Sorting by variable idx means that all summands with the same variable are consecutive. + // This simplifies the combining of summands with the same variable below. + _summands.sort(MemPointerSummand::cmp_by_variable_idx); + + // Combine summands for the same variable, adding up the scales. + int pos_put = 0; + int pos_get = 0; + while (pos_get < _summands.length()) { + const MemPointerSummand& summand = _summands.at(pos_get++); + Node* variable = summand.variable(); + NoOverflowInt scale = summand.scale(); + // Add up scale of all summands with the same variable. + while (pos_get < _summands.length() && _summands.at(pos_get).variable() == variable) { + MemPointerSummand s = _summands.at(pos_get++); + scale = scale + s.scale(); + } + // Bail out if scale is NaN. + if (scale.is_NaN()) { + return MemPointerDecomposedForm::make_trivial(pointer); + } + // Keep summands with non-zero scale. + if (!scale.is_zero()) { + _summands.at_put(pos_put++, MemPointerSummand(variable, scale)); + } + } + _summands.trunc_to(pos_put); + + return MemPointerDecomposedForm::make(pointer, _summands, _con); +} + +// Parse a sub-expression of the pointer, starting at the current summand. We parse the +// current node, and see if it can be decomposed into further summands, or if the current +// summand is terminal. +void MemPointerDecomposedFormParser::parse_sub_expression(const MemPointerSummand& summand) { + Node* n = summand.variable(); + const NoOverflowInt scale = summand.scale(); + const NoOverflowInt one(1); + + int opc = n->Opcode(); + if (is_safe_to_decompose_op(opc, scale)) { + switch (opc) { + case Op_ConI: + case Op_ConL: + { + // Terminal: add to constant. + NoOverflowInt con = (opc == Op_ConI) ? NoOverflowInt(n->get_int()) + : NoOverflowInt(n->get_long()); + _con = _con + scale * con; + return; + } + case Op_AddP: + case Op_AddL: + case Op_AddI: + { + // Decompose addition. + Node* a = n->in((opc == Op_AddP) ? 2 : 1); + Node* b = n->in((opc == Op_AddP) ? 3 : 2); + _worklist.push(MemPointerSummand(a, scale)); + _worklist.push(MemPointerSummand(b, scale)); + return; + } + case Op_SubL: + case Op_SubI: + { + // Decompose subtraction. + Node* a = n->in(1); + Node* b = n->in(2); + + NoOverflowInt sub_scale = NoOverflowInt(-1) * scale; + + _worklist.push(MemPointerSummand(a, scale)); + _worklist.push(MemPointerSummand(b, sub_scale)); + return; + } + case Op_MulL: + case Op_MulI: + case Op_LShiftL: + case Op_LShiftI: + { + // Only multiplication with constants is allowed: factor * variable + // IGVN already folds constants to in(2). If we find a variable there + // instead, we cannot further decompose this summand, and have to add + // it to the terminal summands. + Node* variable = n->in(1); + Node* con = n->in(2); + if (!con->is_Con()) { break; } + NoOverflowInt factor; + switch (opc) { + case Op_MulL: // variable * con + factor = NoOverflowInt(con->get_long()); + break; + case Op_MulI: // variable * con + factor = NoOverflowInt(con->get_int()); + break; + case Op_LShiftL: // variable << con = variable * (1 << con) + factor = one << NoOverflowInt(con->get_int()); + break; + case Op_LShiftI: // variable << con = variable * (1 << con) + factor = one << NoOverflowInt(con->get_int()); + break; + } + + // Accumulate scale. 
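A small worked instance of this step (values chosen for illustration only):

    // summand = 8 * (i << 1): the LShiftI contributes factor = (1 << 1) = 2,
    // so new_scale = 8 * 2 = 16 and the summand 16 * i is pushed.
    // If the multiplication overflows 32 bits, NoOverflowInt yields NaN, and
    // parse_decomposed_form later falls back to the trivial form.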
+ NoOverflowInt new_scale = scale * factor; + + _worklist.push(MemPointerSummand(variable, new_scale)); + return; + } + case Op_CastII: + // case Op_CastLL: + case Op_CastX2P: + case Op_ConvI2L: + // On 32bit systems we can also look through ConvL2I, since the final result will always + // be truncated back with ConvL2I. On 64bit systems we cannot decompose ConvL2I because + // such int values will eventually be expanded to long with a ConvI2L: + // + // valL = max_jint + 1 + // ConvI2L(ConvL2I(valL)) = ConvI2L(min_jint) = min_jint != max_jint + 1 = valL + // + NOT_LP64( case Op_ConvL2I: ) + { + // Decompose: look through. + Node* a = n->in(1); + _worklist.push(MemPointerSummand(a, scale)); + return; + } + default: + // All other operations cannot be further decomposed. We just add them to the + // terminal summands below. + break; + } + } + + // Default: we could not parse the "summand" further, i.e. it is terminal. + _summands.push(summand); +} + +// Check if the decomposition of operation opc is guaranteed to be safe. +// Please refer to the definition of "safe decomposition" in mempointer.hpp +bool MemPointerDecomposedFormParser::is_safe_to_decompose_op(const int opc, const NoOverflowInt& scale) const { +#ifndef _LP64 + // On 32-bit platforms, the pointer has 32bits, and thus any higher bits will always + // be truncated. Thus, it does not matter if we have int or long overflows. + // Simply put: all decompositions are (SAFE1). + return true; +#else + + switch (opc) { + // These operations are always safe to decompose, i.e. (SAFE1): + case Op_ConI: + case Op_ConL: + case Op_AddP: + case Op_AddL: + case Op_SubL: + case Op_MulL: + case Op_LShiftL: + case Op_CastII: + // case Op_CastLL: + case Op_CastX2P: + case Op_CastPP: + case Op_ConvI2L: + return true; + + // But on 64-bit platforms, these operations are not trivially safe to decompose: + case Op_AddI: // ConvI2L(a + b) != ConvI2L(a) + ConvI2L(b) + case Op_SubI: // ConvI2L(a - b) != ConvI2L(a) - ConvI2L(b) + case Op_MulI: // ConvI2L(a * conI) != ConvI2L(a) * ConvI2L(conI) + case Op_LShiftI: // ConvI2L(a << conI) != ConvI2L(a) << ConvI2L(conI) + break; // Analysis below. + + // All other operations are assumed not safe to decompose, or simply cannot be decomposed + default: + return false; + } + + const TypeAryPtr* ary_ptr_t = _mem->adr_type()->isa_aryptr(); + if (ary_ptr_t != NULL) { + // Array accesses that are not Unsafe always have a RangeCheck which ensures + // that there is no int overflow. And without overflows, all decompositions + // are (SAFE1). + if (!_mem->is_unsafe_access()) { + return true; + } + + // Intuition: In general, the decomposition of AddI, SubI, MulI or LShiftI is not safe, + // because of overflows. But under some conditions, we can prove that such a + // decomposition is (SAFE2). Intuitively, we want to prove that an overflow + // would mean that the pointers have such a large distance, that at least one + // must lie out of bounds. In the proof of the "MemPointer Lemma", we thus + // get a contradiction with the condition that both pointers are in bounds. 
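A concrete instance of the overflow these conditions must rule out (values chosen for illustration):

    // a = max_jint, b = 1, scale = 4:
    //   scale * ConvI2L(a + b)                  = 4 * (jlong)min_jint = -2^33
    //   scale * ConvI2L(a) + scale * ConvI2L(b) = 4 * max_jint + 4    =  2^33
    // The naive decomposition is off by exactly 4 * 2^32, i.e. the
    // "scale * y * 2^32" overflow correction below with y = -1.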
+ // + // We prove that the decomposition of AddI, SubI, MulI (with constant) and ShiftI (with + // constant) is (SAFE2), under the condition: + // + // abs(scale) % array_element_size_in_bytes = 0 + // + // First, we describe how the decomposition works: + // + // mp_i = con + sum(other_summands) + summand + // ------------------------- ------- + // rest scale * ConvI2L(op) + // + // We decompose the summand depending on the op, where we know that there is some + // integer y, such that: + // + // scale * ConvI2L(a + b) = scale * ConvI2L(a) + scale * ConvI2L(b) + scale * y * 2^32 + // scale * ConvI2L(a - b) = scale * ConvI2L(a) - scale * ConvI2L(b) + scale * y * 2^32 + // scale * ConvI2L(a * con) = scale * con * ConvI2L(a) + scale * y * 2^32 + // scale * ConvI2L(a << con) = scale * (1 << con) * ConvI2L(a) + scale * y * 2^32 + // \_______________________/ \_____________________________________/ \______________/ + // before decomposition after decomposition ("new_summands") overflow correction + // + // Thus, for AddI and SubI, we get: + // summand = new_summand1 + new_summand2 + scale * y * 2^32 + // + // mp_{i+1} = con + sum(other_summands) + new_summand1 + new_summand2 + // = con + sum(other_summands) + summand - scale * y * 2^32 + // = mp_i - scale * y * 2^32 + // + // And for MulI and ShiftI we get: + // summand = new_summand + scale * y * 2^32 + // + // mp_{i+1} = con + sum(other_summands) + new_summand + // = con + sum(other_summands) + summand - scale * y * 2^32 + // = mp_i - scale * y * 2^32 + // + // Further: + // abs(scale) % array_element_size_in_bytes = 0 + // implies that there is some integer z, such that: + // z * array_element_size_in_bytes = scale + // + // And hence, with "x = y * z", the decomposition is (SAFE2) under the assumed condition: + // mp_i = mp_{i+1} + scale * y * 2^32 + // = mp_{i+1} + z * array_element_size_in_bytes * y * 2^32 + // = mp_{i+1} + x * array_element_size_in_bytes * 2^32 + // + BasicType array_element_bt = ary_ptr_t->elem()->array_element_basic_type(); + if (is_java_primitive(array_element_bt)) { + NoOverflowInt array_element_size_in_bytes = NoOverflowInt(type2aelembytes(array_element_bt)); + if (scale.is_multiple_of(array_element_size_in_bytes)) { + return true; + } + } + } + + return false; +#endif +} + +// Compute the aliasing between two MemPointerDecomposedForm. We use the "MemPointer Lemma" to +// prove that the computed aliasing also applies for the underlying pointers. Note that the +// condition (S0) is already given, because the MemPointerDecomposedForm is always constructed +// using only safe decompositions. +// +// Pre-Condition: +// We assume that both pointers are in-bounds of their respective memory object. If this does +// not hold, for example, with the use of Unsafe, then we would already have undefined behavior, +// and we are allowed to do anything. 
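As a worked example of the computation that follows (a sketch; 16 is the assumed int-array base offset, as in the examples in mempointer.hpp):

    // p1 = a[i]   parses to  mp1 = 1 * a_base + 4 * i + 16
    // p2 = a[i+1] parses to  mp2 = 1 * a_base + 4 * i + 20
    //
    // All summands match, the distance 20 - 16 = 4 is a valid jint (not NaN,
    // not min_jint), so the result is Always(4): p2 always lies exactly
    // 4 bytes after p1.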
+MemPointerAliasing MemPointerDecomposedForm::get_aliasing_with(const MemPointerDecomposedForm& other
+                                                               NOT_PRODUCT( COMMA const TraceMemPointer& trace) ) const {
+#ifndef PRODUCT
+  if (trace.is_trace_aliasing()) {
+    tty->print_cr("MemPointerDecomposedForm::get_aliasing_with:");
+    print_on(tty);
+    other.print_on(tty);
+  }
+#endif
+
+  // "MemPointer Lemma" condition (S3): check if all summands are the same:
+  for (uint i = 0; i < SUMMANDS_SIZE; i++) {
+    const MemPointerSummand s1 = summands_at(i);
+    const MemPointerSummand s2 = other.summands_at(i);
+    if (s1 != s2) {
+#ifndef PRODUCT
+      if (trace.is_trace_aliasing()) {
+        tty->print_cr("  -> Aliasing unknown, differ on summand %d.", i);
+      }
+#endif
+      return MemPointerAliasing::make_unknown();
+    }
+  }
+
+  // "MemPointer Lemma" condition (S2): check that the constants do not differ too much:
+  const NoOverflowInt distance = other.con() - con();
+  // We must check that: abs(distance) < 2^31
+  // However, this is only false if: distance = min_jint
+  if (distance.is_NaN() || distance.value() == min_jint) {
+#ifndef PRODUCT
+    if (trace.is_trace_aliasing()) {
+      tty->print("  -> Aliasing unknown, bad distance: ");
+      distance.print_on(tty);
+      tty->cr();
+    }
+#endif
+    return MemPointerAliasing::make_unknown();
+  }
+
+  // "MemPointer Lemma" condition (S1):
+  // Given that all summands are the same, we know that both pointers point into the
+  // same memory object. With the Pre-Condition, we know that both pointers are in
+  // bounds of that same memory object.
+
+  // Hence, all 4 conditions of the "MemPointer Lemma" are established, and hence
+  // we know that the distance between the underlying pointers is equal to the distance
+  // we computed for the MemPointers:
+  //   p_other - p_this = distance = other.con - this.con
+#ifndef PRODUCT
+  if (trace.is_trace_aliasing()) {
+    tty->print_cr("  -> Aliasing always, distance = %d.", distance.value());
+  }
+#endif
+  return MemPointerAliasing::make_always(distance.value());
+}
+
+bool MemPointer::is_adjacent_to_and_before(const MemPointer& other) const {
+  const MemPointerDecomposedForm& s1 = decomposed_form();
+  const MemPointerDecomposedForm& s2 = other.decomposed_form();
+  const MemPointerAliasing aliasing = s1.get_aliasing_with(s2 NOT_PRODUCT( COMMA _trace ));
+  const jint size = mem()->memory_size();
+  const bool is_adjacent = aliasing.is_always_at_distance(size);
+
+#ifndef PRODUCT
+  if (_trace.is_trace_adjacency()) {
+    tty->print("Adjacent: %s, because size = %d and aliasing = ",
+               is_adjacent ? "true" : "false", size);
+    aliasing.print_on(tty);
+    tty->cr();
+  }
+#endif
+
+  return is_adjacent;
+}
diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp
new file mode 100644
index 00000000000..6da90eb1a09
--- /dev/null
+++ b/src/hotspot/share/opto/mempointer.hpp
@@ -0,0 +1,618 @@
+/*
+ * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef SHARE_OPTO_MEMPOINTER_HPP +#define SHARE_OPTO_MEMPOINTER_HPP + +#include "opto/memnode.hpp" +#include "opto/noOverflowInt.hpp" + +// The MemPointer is a shared facility to parse pointers and check the aliasing of pointers, +// e.g. checking if two stores are adjacent. +// +// ----------------------------------------------------------------------------------------- +// +// Intuition and Examples: +// We parse / decompose pointers into a linear form: +// +// pointer = SUM(scale_i * variable_i) + con +// +// where SUM() adds all "scale_i * variable_i" for each i together. +// +// The con and scale_i are compile-time constants (NoOverflowInt), and the variable_i are +// compile-time variables (C2 nodes). +// +// For the MemPointer, we do not explicitly track the base address. For Java heap pointers, the +// base address is just a variable in a summand with scale == 1. For native memory (C heap) +// pointers, the base address is null, and is hence implicitly a zero constant. +// +// +// Example 1: byte array access: +// +// array[i] +// +// pointer = array_base + ARRAY_BYTE_BASE_OFFSET + 1 * i +// = 1 * array_base + ARRAY_BYTE_BASE_OFFSET + 1 * i +// -------------------- ---------------------- -------------------- +// = scale_0 * variable_0 + con + scale_1 * variable_1 +// +// +// Example 2: int array access +// +// array[5 + i + 3 * j] +// +// pointer = array_base + ARRAY_INT_BASE_OFFSET + 4 * 5 + 4 * i + 4 * 3 * j +// = 1 * array_base + ARRAY_INT_BASE_OFFSET + 20 + 4 * i + 12 * j +// -------------------- ----------------------------- -------------------- -------------------- +// = scale_0 * variable_0 + con + scale_1 * variable_1 + scale_2 * variable_2 +// +// +// Example 3: Unsafe with int array +// +// UNSAFE.getInt(array, ARRAY_INT_BASE_OFFSET + 4 * i); +// +// pointer = array_base + ARRAY_INT_BASE_OFFSET + 4 * i +// = 1 * array_base + ARRAY_INT_BASE_OFFSET + 4 * i +// -------------------- --------------------- -------------------- +// = scale_0 * variable_0 + con + scale_1 * variable_1 +// +// +// Example 4: Unsafe with native memory address +// +// long address; +// UNSAFE.getInt(null, address + 4 * i); +// +// pointer = address + 4 * i +// = 1 * address + 0 + 4 * i +// -------------------- --- -------------------- +// = scale_0 * variable_0 + con + scale_1 * variable_1 +// +// +// Example 5: MemorySegment with byte array as backing type +// +// byte[] array = new byte[1000]; +// MemorySegment ms = MemorySegment.ofArray(array); +// assert ms.heapBase().get() == array: "array is base"; +// assert ms.address() == 0: "zero offset from base"; +// byte val = ms.get(ValueLayout.JAVA_BYTE, i); +// +// pointer = ms.heapBase() + ARRAY_BYTE_BASE_OFFSET + ms.address() + i +// = 1 * array_base + ARRAY_BYTE_BASE_OFFSET + 0 + 1 * i +// ----------------------- ------------------------------------- -------------------- +// = scale_0 * variable_0 + con + scale_1 * variable_1 +// +// +// Example 6: MemorySegment with native memory +// +// MemorySegment ms = Arena.ofAuto().allocate(1000, 1); +// assert ms.heapBase().isEmpty(): "null base"; +// assert ms.address() != 0: "non-zero native memory 
address"; +// short val = ms.get(ValueLayout.JAVA_SHORT, 2L * i); +// +// pointer = ms.heapBase() + ms.address() + 2 i +// = 0 + 1 * ms.address() + 2 * i +// ------------ ---------------------- -------------------- +// = con scale_0 * variable_0 + scale_1 * variable_1 +// +// +// Example 7: Non-linear access to int array +// +// array[5 + i + j * k] +// +// pointer = array_base + ARRAY_INT_BASE_OFFSET + 4 * 5 + 4 * i + 4 * j * k +// = 1 * array_base + ARRAY_INT_BASE_OFFSET + 20 + 4 * i + 4 * j * k +// -------------------- ----------------------------- -------------------- -------------------- +// = scale_0 * variable_0 + con + scale_1 * variable_1 + scale_2 * variable_2 +// +// Note: we simply stop parsing once a term is not linear. We keep "j * k" as its own variable. +// +// +// Example 8: Unsafe with native memory address, non-linear access +// +// UNSAFE.getInt(null, i * j); +// +// pointer = i * j +// = 0 + 1 * i * j +// --- -------------------- +// = con + scale_0 * variable_0 +// +// Note: we can always parse a pointer into its trivial linear form: +// +// pointer = 0 + 1 * pointer. +// +// ----------------------------------------------------------------------------------------- +// +// MemPointerDecomposedForm: +// When the pointer is parsed, it is decomposed into a SUM of summands plus a constant: +// +// pointer = SUM(summands) + con +// +// Where each summand_i in summands has the form: +// +// summand_i = scale_i * variable_i +// +// Hence, the full decomposed form is: +// +// pointer = SUM(scale_i * variable_i) + con +// +// Note: the scale_i are compile-time constants (NoOverflowInt), and the variable_i are +// compile-time variables (C2 nodes). +// On 64-bit systems, this decomposed form is computed with long-add/mul, on 32-bit systems +// it is computed with int-add/mul. +// +// MemPointerAliasing: +// The decomposed form allows us to determine the aliasing between two pointers easily. For +// example, if two pointers are identical, except for their constant: +// +// pointer1 = SUM(summands) + con1 +// pointer2 = SUM(summands) + con2 +// +// then we can easily compute the distance between the pointers (distance = con2 - con1), +// and determine if they are adjacent. +// +// MemPointerDecomposedFormParser: +// Any pointer can be parsed into this (default / trivial) decomposed form: +// +// pointer = 1 * pointer + 0 +// scale_0 * variable_0 + con +// +// However, this is not particularly useful to compute aliasing. We would like to decompose +// the pointer as far as possible, i.e. extract as many summands and add up the constants to +// a single constant. +// +// Example (normal int-array access): +// pointer1 = array[i + 0] = array_base + array_int_base_offset + 4L * ConvI2L(i + 0) +// pointer2 = array[i + 1] = array_base + array_int_base_offset + 4L * ConvI2L(i + 1) +// +// At first, computing the aliasing is not immediately straight-forward in the general case because +// the distance is hidden inside the ConvI2L. We can convert this (with array_int_base_offset = 16) +// into these decomposed forms: +// +// pointer1 = 1L * array_base + 4L * i + 16L +// pointer2 = 1L * array_base + 4L * i + 20L +// +// This allows us to easily see that these two pointers are adjacent (distance = 4). +// +// Hence, in MemPointerDecomposedFormParser::parse_decomposed_form, we start with the pointer as +// a trivial summand. A summand can either be decomposed further or it is terminal (cannot +// be decomposed further). 
We decompose the summands recursively until all remaining summands
+//   are terminal, see MemPointerDecomposedFormParser::parse_sub_expression. This effectively parses
+//   the pointer expression recursively.
+//
+// -----------------------------------------------------------------------------------------
+//
+//   We have to be careful on 64-bit systems with ConvI2L: decomposing its input is not
+//   correct in general, overflows may not be preserved in the decomposed form:
+//
+//     AddI:    ConvI2L(a +  b)    != ConvI2L(a) +  ConvI2L(b)
+//     SubI:    ConvI2L(a -  b)    != ConvI2L(a) -  ConvI2L(b)
+//     MulI:    ConvI2L(a *  conI) != ConvI2L(a) *  ConvI2L(conI)
+//     LShiftI: ConvI2L(a << conI) != ConvI2L(a) << ConvI2L(conI)
+//
+//   If we want to prove the correctness of MemPointerAliasing, we need some guarantees
+//   that the MemPointers adequately represent the underlying pointers, such that we can
+//   compute the aliasing based on the summands and constants.
+//
+// -----------------------------------------------------------------------------------------
+//
+//   Below, we will formulate a "MemPointer Lemma" that helps us to prove the correctness of
+//   the MemPointerAliasing computations. To prove the "MemPointer Lemma", we need to define
+//   the idea of a "safe decomposition", and then prove that all the decompositions we apply
+//   are such "safe decompositions".
+//
+//
+//   Definition: Safe decomposition
+//     Trivial decomposition:
+//       (SAFE0) The trivial decomposition from p to mp_0 = 0 + 1 * p is always safe.
+//
+//     Non-trivial decomposition:
+//       We decompose summand in:
+//         mp_i     = con + summand + SUM(other_summands)
+//       resulting in:      +-------------------------+
+//         mp_{i+1} = con + dec_con + SUM(dec_summands) + SUM(other_summands)
+//                  = new_con + SUM(new_summands)
+//       where mp_i means that the original pointer p was decomposed i times.
+//
+//       We call a non-trivial decomposition safe if either:
+//       (SAFE1) No matter the values of the summand variables:
+//                 mp_i = mp_{i+1}
+//
+//       (SAFE2) The pointer is on an array with a known array_element_size_in_bytes,
+//               and there is an integer x, such that:
+//                 mp_i = mp_{i+1} + x * array_element_size_in_bytes * 2^32
+//
+//               Note: if "x = 0", we have "mp_i = mp_{i+1}", and if "x != 0", then mp_i and mp_{i+1}
+//                     have a distance at least twice as large as the array size, and so
+//                     at least one of mp_i or mp_{i+1} must be out of bounds of the array.
+//
+//   MemPointer Lemma:
+//     Given two pointers p1 and p2, and their respective MemPointers mp1 and mp2.
+//     If these conditions hold:
+//       (S0) mp1 and mp2 are constructed only with safe decompositions (SAFE0, SAFE1, SAFE2)
+//            from p1 and p2, respectively.
+//       (S1) Both p1 and p2 are within the bounds of the same memory object.
+//       (S2) The constants do not differ too much: abs(mp1.con - mp2.con) < 2^31.
+//       (S3) All summands of mp1 and mp2 are identical (i.e. only the constants are possibly different).
+//
+//     then the pointer difference between p1 and p2 is identical to the difference between
+//     mp1 and mp2:
+//       p1 - p2 = mp1 - mp2
+//
+//     Note: MemPointerDecomposedForm::get_aliasing_with relies on this MemPointer Lemma to
+//           prove the correctness of its aliasing computation between two MemPointers.
+//
+//
+//     Note: MemPointerDecomposedFormParser::is_safe_to_decompose_op checks that all
+//           decompositions we apply are safe.
+//
+//
+//   Proof of the "MemPointer Lemma":
+//     Assume (S0-S3) and show that
+//       p1 - p2 = mp1 - mp2
+//
+//     We make a case distinction over the types of decompositions used in the construction of mp1 and mp2.
+// +// Trivial Case: Only trivial (SAFE0) decompositions were used: +// mp1 = 0 + 1 * p1 = p1 +// mp2 = 0 + 1 * p2 = p2 +// => +// p1 - p2 = mp1 - mp2 +// +// Unsafe Case: We apply at least one unsafe decomposition: +// This is a contradiction to (S0) and we are done. +// +// Case 1: Only decomposition of type (SAFE0) and (SAFE1) are used: +// We make an induction proof over the decompositions from p1 to mp1, starting with +// the trivial decomposition (SAFE0): +// mp1_0 = 0 + 1 * p1 = p1 +// Then for the i-th non-trivial decomposition (SAFE1) we know that +// mp1_i = mp1_{i+1} +// and hence, after the n-th non-trivial decomposition from p1: +// p1 = mp1_0 = mp1_i = mp1_n = mp1 +// Analogously, we can prove: +// p2 = mp2 +// +// p1 = mp1 +// p2 = mp2 +// => +// p1 - p2 = mp1 - mp2 +// +// Case 2: At least one decomposition of type (SAFE2) and no unsafe decomposition is used. +// Given we have (SAFE2) decompositions, we know that we are operating on an array of +// known array_element_size_in_bytes. We can weaken the guarantees from (SAFE1) +// decompositions to the same guarantee as (SAFE2) decompositions. Hence all applied +// non-trivial decompositions satisfy: +// mp1_i = mp1_{i+1} + x1_i * array_element_size_in_bytes * 2^32 +// where x1_i = 0 for (SAFE1) decompositions. +// +// We make an induction proof over the decompositions from p1 to mp1, starting with +// the trivial decomposition (SAFE0): +// mp1_0 = 0 + 1 * p1 = p1 +// Then for the i-th non-trivial decomposition (SAFE1) or (SAFE2), we know that +// mp1_i = mp1_{i+1} + x1_i * array_element_size_in_bytes * 2^32 +// and hence, if mp1 was decomposed with n non-trivial decompositions (SAFE1) or (SAFE2) from p1: +// p1 = mp1 + x1 * array_element_size_in_bytes * 2^32 +// where +// x1 = SUM(x1_i) +// Analogously, we can prove: +// p2 = mp2 + x2 * array_element_size_in_bytes * 2^32 +// +// And hence, with x = x1 - x2 we have: +// p1 - p2 = mp1 - mp2 + x * array_element_size_in_bytes * 2^32 +// +// If "x = 0", then it follows: +// p1 - p2 = mp1 - mp2 +// +// If "x != 0", then: +// abs(p1 - p2) = abs(mp1 - mp2 + x * array_element_size_in_bytes * 2^32) +// >= abs(x * array_element_size_in_bytes * 2^32) - abs(mp1 - mp2) +// -- apply x != 0 -- +// >= array_element_size_in_bytes * 2^32 - abs(mp1 - mp2) +// -- apply (S3) -- +// = array_element_size_in_bytes * 2^32 - abs(mp1.con - mp2.con) +// -- apply (S2) -- +// > array_element_size_in_bytes * 2^32 - 2^31 +// -- apply array_element_size_in_bytes > 0 -- +// >= array_element_size_in_bytes * 2^31 +// >= max_possible_array_size_in_bytes +// >= array_size_in_bytes +// +// This shows that p1 and p2 have a distance greater than the array size, and hence at least one of the two +// pointers must be out of bounds. This contradicts our assumption (S1) and we are done. + + +#ifndef PRODUCT +class TraceMemPointer : public StackObj { +private: + const bool _is_trace_pointer; + const bool _is_trace_aliasing; + const bool _is_trace_adjacency; + +public: + TraceMemPointer(const bool is_trace_pointer, + const bool is_trace_aliasing, + const bool is_trace_adjacency) : + _is_trace_pointer( is_trace_pointer), + _is_trace_aliasing( is_trace_aliasing), + _is_trace_adjacency(is_trace_adjacency) + {} + + bool is_trace_pointer() const { return _is_trace_pointer; } + bool is_trace_aliasing() const { return _is_trace_aliasing; } + bool is_trace_adjacency() const { return _is_trace_adjacency; } +}; +#endif + +// Class to represent aliasing between two MemPointer. 
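+// For example (illustrative): for two stores to the same int array, "array[i]" and
+// "array[i+1]" alias at a constant distance of 4 bytes (Always(4)), while "array[i]"
+// and "array[j]" have Unknown aliasing.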
+class MemPointerAliasing {
+public:
+  enum Aliasing {
+    Unknown, // Distance unknown.
+             //   Example: two "int[]" with different variable index offsets.
+             //            e.g. "array[i]  vs  array[j]".
+             //            e.g. "array1[i] vs array2[j]".
+    Always}; // Constant distance = p1 - p2.
+             //   Example: The same address expression, except for a constant offset
+             //            e.g. "array[i] vs array[i+1]".
+private:
+  const Aliasing _aliasing;
+  const jint _distance;
+
+  MemPointerAliasing(const Aliasing aliasing, const jint distance) :
+    _aliasing(aliasing),
+    _distance(distance)
+  {
+    assert(_distance != min_jint, "given by condition (S2) of MemPointer Lemma");
+  }
+
+public:
+  static MemPointerAliasing make_unknown() {
+    return MemPointerAliasing(Unknown, 0);
+  }
+
+  static MemPointerAliasing make_always(const jint distance) {
+    return MemPointerAliasing(Always, distance);
+  }
+
+  // Use case: exact aliasing and adjacency.
+  bool is_always_at_distance(const jint distance) const {
+    return _aliasing == Always && _distance == distance;
+  }
+
+#ifndef PRODUCT
+  void print_on(outputStream* st) const {
+    switch(_aliasing) {
+      case Unknown: st->print("Unknown");               break;
+      case Always:  st->print("Always(%d)", _distance); break;
+      default: ShouldNotReachHere();
+    }
+  }
+#endif
+};
+
+// Summand of a MemPointerDecomposedForm:
+//
+//   summand = scale * variable
+//
+// where variable is a C2 node.
+class MemPointerSummand : public StackObj {
+private:
+  Node* _variable;
+  NoOverflowInt _scale;
+
+public:
+  MemPointerSummand() :
+    _variable(NULL),
+    _scale(NoOverflowInt::make_NaN()) {}
+  MemPointerSummand(Node* variable, const NoOverflowInt& scale) :
+    _variable(variable),
+    _scale(scale)
+  {
+    assert(_variable != NULL, "must have variable");
+    assert(!_scale.is_zero(), "non-zero scale");
+  }
+
+  Node* variable() const { return _variable; }
+  NoOverflowInt scale() const { return _scale; }
+
+  static int cmp_by_variable_idx(MemPointerSummand* p1, MemPointerSummand* p2) {
+    if (p1->variable() == NULL) {
+      return (p2->variable() == NULL) ? 0 : 1;
+    } else if (p2->variable() == NULL) {
+      return -1;
+    }
+
+    return p1->variable()->_idx - p2->variable()->_idx;
+  }
+
+  friend bool operator==(const MemPointerSummand a, const MemPointerSummand b) {
+    // Both "null" -> equal.
+    if (a.variable() == NULL && b.variable() == NULL) { return true; }
+
+    // Same variable and scale?
+    if (a.variable() != b.variable()) { return false; }
+    return a.scale() == b.scale();
+  }
+
+  friend bool operator!=(const MemPointerSummand a, const MemPointerSummand b) {
+    return !(a == b);
+  }
+
+#ifndef PRODUCT
+  void print_on(outputStream* st) const {
+    st->print("Summand[");
+    _scale.print_on(st);
+    st->print(" * [%d %s]]", _variable->_idx, _variable->Name());
+  }
+#endif
+};
+
+// Decomposed form of the pointer sub-expression of "pointer".
+//
+//   pointer = SUM(summands) + con
+//
+class MemPointerDecomposedForm : public StackObj {
+private:
+  // We limit the number of summands to 10. This is just a best guess, and not at this
+  // point supported by evidence. But I think it is reasonable: usually, a pointer
+  // contains a base pointer (e.g. array pointer or null for native memory) and a few
+  // variables. It should be rare that we have more than 9 variables.
+  static const int SUMMANDS_SIZE = 10;
+
+  Node* _pointer; // pointer node associated with this (sub)pointer
+
+  MemPointerSummand _summands[SUMMANDS_SIZE];
+  NoOverflowInt _con;
+
+public:
+  // Empty
+  MemPointerDecomposedForm() : _pointer(NULL), _con(NoOverflowInt::make_NaN()) {}
+
+private:
+  // Default / trivial: pointer = 0 + 1 * pointer
+  MemPointerDecomposedForm(Node* pointer) : _pointer(pointer), _con(NoOverflowInt(0)) {
+    assert(pointer != NULL, "pointer must be non-null");
+    _summands[0] = MemPointerSummand(pointer, NoOverflowInt(1));
+  }
+
+  MemPointerDecomposedForm(Node* pointer, const GrowableArray<MemPointerSummand>& summands, const NoOverflowInt& con)
+    : _pointer(pointer), _con(con) {
+    assert(!_con.is_NaN(), "non-NaN constant");
+    assert(summands.length() <= SUMMANDS_SIZE, "summands must fit");
+    for (int i = 0; i < summands.length(); i++) {
+      MemPointerSummand s = summands.at(i);
+      assert(s.variable() != NULL, "variable cannot be null");
+      assert(!s.scale().is_NaN(), "non-NaN scale");
+      _summands[i] = s;
+    }
+  }
+
+public:
+  static MemPointerDecomposedForm make_trivial(Node* pointer) {
+    return MemPointerDecomposedForm(pointer);
+  }
+
+  static MemPointerDecomposedForm make(Node* pointer, const GrowableArray<MemPointerSummand>& summands, const NoOverflowInt& con) {
+    if (summands.length() <= SUMMANDS_SIZE) {
+      return MemPointerDecomposedForm(pointer, summands, con);
+    } else {
+      return MemPointerDecomposedForm::make_trivial(pointer);
+    }
+  }
+
+  MemPointerAliasing get_aliasing_with(const MemPointerDecomposedForm& other
+                                       NOT_PRODUCT( COMMA const TraceMemPointer& trace) ) const;
+
+  const MemPointerSummand summands_at(const uint i) const {
+    assert(i < SUMMANDS_SIZE, "in bounds");
+    return _summands[i];
+  }
+
+  const NoOverflowInt con() const { return _con; }
+
+#ifndef PRODUCT
+  void print_on(outputStream* st) const {
+    if (_pointer == NULL) {
+      st->print_cr("MemPointerDecomposedForm empty.");
+      return;
+    }
+    st->print("MemPointerDecomposedForm[%d %s: con = ", _pointer->_idx, _pointer->Name());
+    _con.print_on(st);
+    for (int i = 0; i < SUMMANDS_SIZE; i++) {
+      const MemPointerSummand& summand = _summands[i];
+      if (summand.variable() != NULL) {
+        st->print(", ");
+        summand.print_on(st);
+      }
+    }
+    st->print_cr("]");
+  }
+#endif
+};
+
+class MemPointerDecomposedFormParser : public StackObj {
+private:
+  const MemNode* _mem;
+
+  // Internal data-structures for parsing.
+  NoOverflowInt _con;
+  GrowableArray<MemPointerSummand> _worklist;
+  GrowableArray<MemPointerSummand> _summands;
+
+  // Resulting decomposed-form.
+  MemPointerDecomposedForm _decomposed_form;
+
+public:
+  MemPointerDecomposedFormParser(const MemNode* mem) : _mem(mem), _con(NoOverflowInt(0)) {
+    _decomposed_form = parse_decomposed_form();
+  }
+
+  const MemPointerDecomposedForm decomposed_form() const { return _decomposed_form; }
+
+private:
+  MemPointerDecomposedForm parse_decomposed_form();
+  void parse_sub_expression(const MemPointerSummand& summand);
+
+  bool is_safe_to_decompose_op(const int opc, const NoOverflowInt& scale) const;
+};
+
+// Facility to parse the pointer of a Load or Store, so that aliasing between two such
+// memory operations can be determined (e.g. adjacency).
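+//
+// Usage sketch (illustrative; in debug builds a TraceMemPointer must additionally be
+// passed to the constructor):
+//
+//   MemPointer p1(store1);
+//   MemPointer p2(store2);
+//   if (p1.is_adjacent_to_and_before(p2)) { /* store2 writes directly after store1 */ }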
+class MemPointer : public StackObj { +private: + const MemNode* _mem; + const MemPointerDecomposedForm _decomposed_form; + + NOT_PRODUCT( const TraceMemPointer& _trace; ) + +public: + MemPointer(const MemNode* mem NOT_PRODUCT( COMMA const TraceMemPointer& trace)) : + _mem(mem), + _decomposed_form(init_decomposed_form(_mem)) + NOT_PRODUCT( COMMA _trace(trace) ) + { +#ifndef PRODUCT + if (_trace.is_trace_pointer()) { + tty->print_cr("MemPointer::MemPointer:"); + tty->print("mem: "); mem->dump(); + _mem->in(MemNode::Address)->dump(); + _decomposed_form.print_on(tty); + } +#endif + } + + const MemNode* mem() const { return _mem; } + const MemPointerDecomposedForm decomposed_form() const { return _decomposed_form; } + bool is_adjacent_to_and_before(const MemPointer& other) const; + +private: + static const MemPointerDecomposedForm init_decomposed_form(const MemNode* mem) { + assert(mem->is_Store(), "only stores are supported"); + ResourceMark rm; + MemPointerDecomposedFormParser parser(mem); + return parser.decomposed_form(); + } +}; + +#endif // SHARE_OPTO_MEMPOINTER_HPP diff --git a/src/hotspot/share/opto/noOverflowInt.hpp b/src/hotspot/share/opto/noOverflowInt.hpp new file mode 100644 index 00000000000..227f815deb9 --- /dev/null +++ b/src/hotspot/share/opto/noOverflowInt.hpp @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef SHARE_OPTO_NOOVERFLOWINT_HPP +#define SHARE_OPTO_NOOVERFLOWINT_HPP + +#include "utilities/ostream.hpp" + +// Wrapper around jint, which detects overflow. +// If any operation overflows, then it returns a NaN. +class NoOverflowInt { +private: + bool _is_NaN; // overflow, uninitialized, etc. + jint _value; + +public: + // Default: NaN. + NoOverflowInt() : _is_NaN(true), _value(0) {} + + // Create from jlong (or jint) -> NaN if overflows jint. 
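+  // e.g. NoOverflowInt((jlong)max_jint + 1) is NaN, while NoOverflowInt(42) has value 42.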
+ NoOverflowInt(jlong value) : _is_NaN(true), _value(0) { + jint trunc = (jint)value; + if ((jlong)trunc == value) { + _is_NaN = false; + _value = trunc; + } + } + + static NoOverflowInt make_NaN() { return NoOverflowInt(); } + + bool is_NaN() const { return _is_NaN; } + jint value() const { assert(!is_NaN(), "NaN not allowed"); return _value; } + bool is_zero() const { return !is_NaN() && value() == 0; } + + friend NoOverflowInt operator+(const NoOverflowInt& a, const NoOverflowInt& b) { + if (a.is_NaN()) { return a; } + if (b.is_NaN()) { return b; } + return NoOverflowInt((jlong)a.value() + (jlong)b.value()); + } + + friend NoOverflowInt operator-(const NoOverflowInt& a, const NoOverflowInt& b) { + if (a.is_NaN()) { return a; } + if (b.is_NaN()) { return b; } + return NoOverflowInt((jlong)a.value() - (jlong)b.value()); + } + + friend NoOverflowInt operator*(const NoOverflowInt& a, const NoOverflowInt& b) { + if (a.is_NaN()) { return a; } + if (b.is_NaN()) { return b; } + return NoOverflowInt((jlong)a.value() * (jlong)b.value()); + } + + friend NoOverflowInt operator<<(const NoOverflowInt& a, const NoOverflowInt& b) { + if (a.is_NaN()) { return a; } + if (b.is_NaN()) { return b; } + jint shift = b.value(); + if (shift < 0 || shift > 31) { return make_NaN(); } + return NoOverflowInt((jlong)a.value() << shift); + } + + friend bool operator==(const NoOverflowInt& a, const NoOverflowInt& b) { + if (a.is_NaN()) { return false; } + if (b.is_NaN()) { return false; } + return a.value() == b.value(); + } + + NoOverflowInt abs() const { + if (is_NaN()) { return *this; } + if (value() >= 0) { return *this; } + return NoOverflowInt(0) - *this; + } + + bool is_multiple_of(const NoOverflowInt& other) const { + NoOverflowInt a = this->abs(); + NoOverflowInt b = other.abs(); + if (a.is_NaN()) { return false; } + if (b.is_NaN()) { return false; } + if (b.is_zero()) { return false; } + return a.value() % b.value() == 0; + } + +#ifndef PRODUCT + void print_on(outputStream* st) const { + if (is_NaN()) { + st->print("NaN"); + } else { + st->print("%d", value()); + } + } +#endif +}; + +#endif // SHARE_OPTO_NOOVERFLOWINT_HPP diff --git a/src/hotspot/share/opto/node.hpp b/src/hotspot/share/opto/node.hpp index 1a8b0d0296f..2ddd824798c 100644 --- a/src/hotspot/share/opto/node.hpp +++ b/src/hotspot/share/opto/node.hpp @@ -61,6 +61,7 @@ class CmpNode; class CodeBuffer; class ConstraintCastNode; class ConNode; +class ConINode; class CompareAndSwapNode; class CompareAndExchangeNode; class CountedLoopNode; @@ -689,6 +690,8 @@ class Node { #if INCLUDE_SHENANDOAHGC DEFINE_CLASS_ID(ShenandoahBarrier, Type, 7) #endif + DEFINE_CLASS_ID(Con, Type, 8) + DEFINE_CLASS_ID(ConI, Con, 0) DEFINE_CLASS_ID(Proj, Node, 3) DEFINE_CLASS_ID(CatchProj, Proj, 0) @@ -825,6 +828,7 @@ class Node { DEFINE_CLASS_QUERY(CatchProj) DEFINE_CLASS_QUERY(CheckCastPP) DEFINE_CLASS_QUERY(CastII) + DEFINE_CLASS_QUERY(ConI) DEFINE_CLASS_QUERY(ConstraintCast) DEFINE_CLASS_QUERY(ClearArray) DEFINE_CLASS_QUERY(CMove) diff --git a/src/hotspot/share/opto/phaseX.cpp b/src/hotspot/share/opto/phaseX.cpp index 9cf53dc10e0..bcce06fb259 100644 --- a/src/hotspot/share/opto/phaseX.cpp +++ b/src/hotspot/share/opto/phaseX.cpp @@ -2230,7 +2230,15 @@ void PhasePeephole::print_statistics() { //------------------------------set_req_X-------------------------------------- void Node::set_req_X( uint i, Node *n, PhaseIterGVN *igvn ) { assert( is_not_dead(n), "can not use dead node"); - assert( igvn->hash_find(this) != this, "Need to remove from hash before changing edges" ); 
+#ifdef ASSERT + if (igvn->hash_find(this) == this) { + tty->print_cr("Need to remove from hash before changing edges"); + this->dump(1); + tty->print_cr("Set at i = %d", i); + n->dump(); + assert(false, "Need to remove from hash before changing edges"); + } +#endif Node *old = in(i); set_req(i, n); diff --git a/test/hotspot/gtest/opto/test_no_overflow_int.cpp b/test/hotspot/gtest/opto/test_no_overflow_int.cpp new file mode 100644 index 00000000000..7b4b4259bb8 --- /dev/null +++ b/test/hotspot/gtest/opto/test_no_overflow_int.cpp @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "opto/noOverflowInt.hpp" +#include "unittest.hpp" + +static void check_jlong(const jlong val) { + const NoOverflowInt x(val); + + if (val > max_jint || min_jint > val) { + ASSERT_TRUE(x.is_NaN()); + } else { + ASSERT_FALSE(x.is_NaN()); + ASSERT_EQ(x.value(), val); + } +} + +TEST_VM(opto, NoOverflowInt_check_jlong) { + jlong start = (jlong)min_jint - 10000LL; + jlong end = (jlong)max_jint + 10000LL; + for (jlong i = start; i < end; i+= 1000LL) { + check_jlong(i); + } + + check_jlong((jlong)min_jint - 1LL); + check_jlong((jlong)min_jint); + check_jlong((jlong)min_jint + 1LL); + check_jlong((jlong)max_jint - 1LL); + check_jlong((jlong)max_jint); + check_jlong((jlong)max_jint + 1LL); + + const NoOverflowInt nan; + ASSERT_TRUE(nan.is_NaN()); +} + +TEST_VM(opto, NoOverflowInt_add_sub) { + const NoOverflowInt nan; + const NoOverflowInt zero(0); + const NoOverflowInt one(1); + const NoOverflowInt two(2); + const NoOverflowInt big(1 << 30); + + ASSERT_EQ((one + two).value(), 3); + ASSERT_EQ((one - two).value(), -1); + ASSERT_TRUE((nan + one).is_NaN()); + ASSERT_TRUE((one + nan).is_NaN()); + ASSERT_TRUE((nan + nan).is_NaN()); + ASSERT_TRUE((nan - one).is_NaN()); + ASSERT_TRUE((one - nan).is_NaN()); + ASSERT_TRUE((nan - nan).is_NaN()); + + ASSERT_EQ((big + one).value(), (1 << 30) + 1); + ASSERT_TRUE((big + big).is_NaN()); + ASSERT_EQ((big - one).value(), (1 << 30) - 1); + ASSERT_EQ((big - big).value(), 0); + + ASSERT_EQ((big - one + big).value(), max_jint); + ASSERT_EQ((zero - big - big).value(), min_jint); + ASSERT_TRUE((zero - big - big - one).is_NaN()); +} + +TEST_VM(opto, NoOverflowInt_mul) { + const NoOverflowInt nan; + const NoOverflowInt zero(0); + const NoOverflowInt one(1); + const NoOverflowInt two(2); + const NoOverflowInt big(1 << 30); + + ASSERT_EQ((one * two).value(), 2); + ASSERT_TRUE((nan * one).is_NaN()); + ASSERT_TRUE((one * nan).is_NaN()); + 
ASSERT_TRUE((nan * nan).is_NaN());
+
+  ASSERT_EQ((big * one).value(), (1 << 30));
+  ASSERT_EQ((one * big).value(), (1 << 30));
+  ASSERT_EQ((big * zero).value(), 0);
+  ASSERT_EQ((zero * big).value(), 0);
+  ASSERT_TRUE((big * big).is_NaN());
+  ASSERT_TRUE((big * two).is_NaN());
+
+  ASSERT_EQ(((big - one) * two).value(), max_jint - 1);
+  ASSERT_EQ(((one - big) * two).value(), min_jint + 2);
+  ASSERT_EQ(((zero - big) * two).value(), min_jint);
+  ASSERT_TRUE(((big + one) * two).is_NaN());
+  ASSERT_TRUE(((zero - big - one) * two).is_NaN());
+}
+
+TEST_VM(opto, NoOverflowInt_lshift) {
+  const NoOverflowInt nan;
+  const NoOverflowInt zero(0);
+  const NoOverflowInt one(1);
+  const NoOverflowInt two(2);
+  const NoOverflowInt big(1 << 30);
+
+  for (int i = 0; i < 31; i++) {
+    ASSERT_EQ((one << NoOverflowInt(i)).value(), 1LL << i);
+  }
+  for (int i = 31; i < 1000; i++) {
+    ASSERT_TRUE((one << NoOverflowInt(i)).is_NaN());
+  }
+  for (int i = -1000; i < 0; i++) {
+    ASSERT_TRUE((one << NoOverflowInt(i)).is_NaN());
+  }
+
+  ASSERT_EQ((NoOverflowInt(3) << NoOverflowInt(2)).value(), 3 * 4);
+  ASSERT_EQ((NoOverflowInt(11) << NoOverflowInt(5)).value(), 11 * 32);
+  ASSERT_EQ((NoOverflowInt(-13) << NoOverflowInt(4)).value(), -13 * 16);
+}
+
+TEST_VM(opto, NoOverflowInt_misc) {
+  const NoOverflowInt nan;
+  const NoOverflowInt zero(0);
+  const NoOverflowInt one(1);
+  const NoOverflowInt two(2);
+  const NoOverflowInt big(1 << 30);
+
+  // operator==
+  ASSERT_FALSE(nan == nan);
+  ASSERT_FALSE(nan == zero);
+  ASSERT_FALSE(zero == nan);
+  ASSERT_TRUE(zero == zero);
+  ASSERT_TRUE(one == one);
+  ASSERT_TRUE((one + two) == (two + one));
+  ASSERT_TRUE((big + two) == (two + big));
+  ASSERT_FALSE((big + big) == (big + big));
+  ASSERT_TRUE((big - one + big) == (big - one + big));
+
+  // abs
+  // Use a jlong loop variable: the bound 2^31 does not fit in an int.
+  for (jlong i = 0; i < ((jlong)1 << 31); i += 1024) {
+    ASSERT_EQ(NoOverflowInt(i).abs().value(), i);
+    ASSERT_EQ(NoOverflowInt(-i).abs().value(), i);
+  }
+  ASSERT_EQ(NoOverflowInt(max_jint).abs().value(), max_jint);
+  ASSERT_EQ(NoOverflowInt(min_jint + 1).abs().value(), max_jint);
+  ASSERT_TRUE(NoOverflowInt(min_jint).abs().is_NaN());
+  ASSERT_TRUE(NoOverflowInt(nan).abs().is_NaN());
+
+  // is_multiple_of
+  ASSERT_TRUE(one.is_multiple_of(one));
+  ASSERT_FALSE(one.is_multiple_of(nan));
+  ASSERT_FALSE(nan.is_multiple_of(one));
+  ASSERT_FALSE(nan.is_multiple_of(nan));
+  for (jlong i = 0; i < ((jlong)1 << 31); i += 1023) {
+    ASSERT_TRUE(NoOverflowInt(i).is_multiple_of(one));
+    ASSERT_TRUE(NoOverflowInt(-i).is_multiple_of(one));
+    ASSERT_FALSE(NoOverflowInt(i).is_multiple_of(zero));
+    ASSERT_FALSE(NoOverflowInt(-i).is_multiple_of(zero));
+  }
+  ASSERT_TRUE(NoOverflowInt(33 * 7).is_multiple_of(NoOverflowInt(33)));
+  ASSERT_TRUE(NoOverflowInt(13 * 5).is_multiple_of(NoOverflowInt(5)));
+  ASSERT_FALSE(NoOverflowInt(7).is_multiple_of(NoOverflowInt(5)));
+}
+
diff --git a/test/hotspot/jtreg/compiler/c2/TestMergeStoresNullAdrType.java b/test/hotspot/jtreg/compiler/c2/TestMergeStoresNullAdrType.java
new file mode 100644
index 00000000000..f267c14a733
--- /dev/null
+++ b/test/hotspot/jtreg/compiler/c2/TestMergeStoresNullAdrType.java
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package compiler.c2; + +/* + * @test + * @bug 8318446 8331085 + * @summary Test merge stores, when "adr_type() == nullptr" because of TOP somewhere in the address. + * @run main/othervm -XX:CompileCommand=compileonly,compiler.c2.TestMergeStoresNullAdrType::test + * -XX:-TieredCompilation -Xcomp + * compiler.c2.TestMergeStoresNullAdrType + * @run main compiler.c2.TestMergeStoresNullAdrType + */ + +public class TestMergeStoresNullAdrType { + static int arr[] = new int[100]; + + static void test() { + boolean b = false; + for (int k = 269; k > 10; --k) { + b = b; + int j = 6; + while ((j -= 3) > 0) { + if (b) { + } else { + arr[j] >>= 2; + } + } + } + } + + public static void main(String[] args) { + test(); + } +} diff --git a/test/hotspot/jtreg/compiler/c2/TestMergeStoresUnsafeArrayPointer.java b/test/hotspot/jtreg/compiler/c2/TestMergeStoresUnsafeArrayPointer.java new file mode 100644 index 00000000000..3b65272c3c7 --- /dev/null +++ b/test/hotspot/jtreg/compiler/c2/TestMergeStoresUnsafeArrayPointer.java @@ -0,0 +1,324 @@ +/* + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + * @test + * @bug 8335390 + * @summary Test merge stores for some Unsafe store address patterns. 
+ * @modules java.base/jdk.internal.misc + * @requires vm.bits == 64 + * @requires os.maxMemory > 8G + * @run main/othervm -XX:CompileCommand=compileonly,compiler.c2.TestMergeStoresUnsafeArrayPointer::test* + * -Xbatch + * -Xmx8g + * compiler.c2.TestMergeStoresUnsafeArrayPointer + * @run main/othervm -Xmx8g + * compiler.c2.TestMergeStoresUnsafeArrayPointer + */ + +package compiler.c2; +import jdk.internal.misc.Unsafe; + +public class TestMergeStoresUnsafeArrayPointer { + static final Unsafe UNSAFE = Unsafe.getUnsafe(); + + // We allocate a big int array of length: + static final int SIZE = (1 << 30) + 100; + + // This gives us a memory region of 4x as many bytes: + static final long BYTE_SIZE = 4L * SIZE; // = 1L << 32 + 400L + + // We set an "anchor" in the middle of this memory region, in bytes: + static final long ANCHOR = BYTE_SIZE / 2; + + static int four = 4; + static int max_int = Integer.MAX_VALUE; + static int min_int = Integer.MIN_VALUE; + static int val_2_to_30 = (1 << 30); + static int large_by_53 = (int)((1L << 31) / 53L + 1L); + + public static void main(String[] args) { + System.out.println("Allocate big array of SIZE = " + SIZE); + int[] big = new int[SIZE]; + + // Each test is executed a few times, so that we can see the difference between + // interpreter and compiler. + int errors = 0; + + long val = 0; + System.out.println("test1"); + for (int i = 0; i < 100_000; i++) { + testClear(big); + test1(big, ANCHOR); + long sum = testSum(big); + if (i == 0) { + val = sum; + } else { + if (sum != val) { + System.out.println("ERROR: test1 had wrong value: " + val + " != " + sum); + errors++; + break; + } + } + } + + val = 0; + System.out.println("test2"); + for (int i = 0; i < 100_000; i++) { + testClear(big); + test2(big, ANCHOR); + long sum = testSum(big); + if (i == 0) { + val = sum; + } else { + if (sum != val) { + System.out.println("ERROR: test2 had wrong value: " + val + " != " + sum); + errors++; + break; + } + } + } + + val = 0; + System.out.println("test3"); + for (int i = 0; i < 100_000; i++) { + testClear(big); + test3(big, ANCHOR); + long sum = testSum(big); + if (i == 0) { + val = sum; + } else { + if (sum != val) { + System.out.println("ERROR: test3 had wrong value: " + val + " != " + sum); + errors++; + break; + } + } + } + + val = 0; + System.out.println("test4"); + for (int i = 0; i < 100_000; i++) { + testClear(big); + test4(big, ANCHOR); + long sum = testSum(big); + if (i == 0) { + val = sum; + } else { + if (sum != val) { + System.out.println("ERROR: test4 had wrong value: " + val + " != " + sum); + errors++; + break; + } + } + } + + val = 0; + System.out.println("test5"); + for (int i = 0; i < 100_000; i++) { + testClear(big); + test5(big, ANCHOR); + long sum = testSum(big); + if (i == 0) { + val = sum; + } else { + if (sum != val) { + System.out.println("ERROR: test5 had wrong value: " + val + " != " + sum); + errors++; + break; + } + } + } + + val = 0; + System.out.println("test6"); + for (int i = 0; i < 100_000; i++) { + testClear(big); + test6(big, ANCHOR); + long sum = testSum(big); + if (i == 0) { + val = sum; + } else { + if (sum != val) { + System.out.println("ERROR: test6 had wrong value: " + val + " != " + sum); + errors++; + break; + } + } + } + + val = 0; + System.out.println("test7"); + for (int i = 0; i < 100_000; i++) { + testClear(big); + test7(big, ANCHOR); + long sum = testSum(big); + if (i == 0) { + val = sum; + } else { + if (sum != val) { + System.out.println("ERROR: test7 had wrong value: " + val + " != " + sum); + errors++; + break; + 
                }
+            }
+        }
+
+        // No result verification here. We only want to make sure we do not hit asserts.
+        System.out.println("test8 and test9");
+        for (int i = 0; i < 100_000; i++) {
+            test8a(big, ANCHOR);
+            test8b(big, ANCHOR);
+            test8c(big, ANCHOR);
+            test8d(big, ANCHOR);
+            test9a(big, ANCHOR);
+            test9b(big, ANCHOR);
+            test9c(big, ANCHOR);
+        }
+
+        if (errors > 0) {
+            throw new RuntimeException("ERRORS: " + errors);
+        }
+        System.out.println("PASSED");
+    }
+
+    // Only clear and sum over relevant parts of array to make the test fast.
+    static void testClear(int[] a) {
+        for (int j = 0               ; j < 100;              j++) { a[j] = j; }
+        for (int j = a.length/2 - 100; j < a.length/2 + 100; j++) { a[j] = j; }
+        for (int j = a.length - 100  ; j < a.length + 0;     j++) { a[j] = j; }
+    }
+
+    static long testSum(int[] a) {
+        long sum = 0;
+        for (int j = 0               ; j < 100;              j++) { sum += a[j]; }
+        for (int j = a.length/2 - 100; j < a.length/2 + 100; j++) { sum += a[j]; }
+        for (int j = a.length - 100  ; j < a.length + 0;     j++) { sum += a[j]; }
+        return sum;
+    }
+
+    // Reference: expected to merge.
+    static void test1(int[] a, long anchor) {
+        long base = UNSAFE.ARRAY_INT_BASE_OFFSET + anchor;
+        UNSAFE.putInt(a, base + 0, 0x42424242);
+        UNSAFE.putInt(a, base + 4, 0x66666666);
+    }
+
+    // Test: if MergeStores is applied this can lead to wrong results
+    static void test2(int[] a, long anchor) {
+        long base = UNSAFE.ARRAY_INT_BASE_OFFSET + anchor;
+        UNSAFE.putInt(a, base + 0 + (long)(four + Integer.MAX_VALUE), 0x42424242);
+        UNSAFE.putInt(a, base + Integer.MAX_VALUE + (long)(four + 4), 0x66666666);
+    }
+
+    // Test: if MergeStores is applied this can lead to wrong results
+    //       -> AddI needs overflow check.
+    static void test3(int[] a, long anchor) {
+        long base = UNSAFE.ARRAY_INT_BASE_OFFSET + anchor;
+        UNSAFE.putInt(a, base + (long)(max_int + 0), 0x42424242);
+        UNSAFE.putInt(a, base + (long)(max_int + 4), 0x66666666);
+    }
+
+    // Test: "min_int - four" cannot be parsed further, but would not make a difference here.
+    static void test4(int[] a, long anchor) {
+        long base = UNSAFE.ARRAY_INT_BASE_OFFSET + anchor;
+        UNSAFE.putInt(a, base + (long)(min_int - four) + 0, 0x42424242);
+        UNSAFE.putInt(a, base + (long)(min_int - four) + 4, 0x66666666);
+    }
+
+    // Test: if MergeStores is applied this can lead to wrong results
+    //       -> SubI needs overflow check.
+    static void test5(int[] a, long anchor) {
+        long base = UNSAFE.ARRAY_INT_BASE_OFFSET + anchor;
+        UNSAFE.putInt(a, base + (long)(min_int) - (long)(four) + 0, 0x42424242); // no overflow
+        UNSAFE.putInt(a, base + (long)(min_int - four) + 4, 0x66666666); // overflow
+    }
+
+    // Test: if MergeStores is applied this can lead to wrong results
+    //       -> LShiftI needs overflow check.
+    static void test6(int[] a, long anchor) {
+        long base = UNSAFE.ARRAY_INT_BASE_OFFSET + anchor;
+        UNSAFE.putInt(a, base + (long)(2 * val_2_to_30) + 0, 0x42424242); // overflow
+        UNSAFE.putInt(a, base + 2L * (long)(val_2_to_30) + 4, 0x66666666); // no overflow
+    }
+
+    // Test: if MergeStores is applied this can lead to wrong results
+    //       -> MulI needs overflow check.
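+    //       ("53 * large_by_53" overflows the int range, while "53L * (long)large_by_53"
+    //       does not, so the two stores in test7 are in fact not adjacent.)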
+ static void test7(int[] a, long anchor) { + long base = UNSAFE.ARRAY_INT_BASE_OFFSET + anchor; + UNSAFE.putInt(a, base + (long)(53 * large_by_53) + 0, 0x42424242); // overflow + UNSAFE.putInt(a, base + 53L * (long)(large_by_53) + 4, 0x66666666); // no overflow + } + + // Test: check if large distance leads to assert + static void test8a(int[] a, long anchor) { + long base = UNSAFE.ARRAY_INT_BASE_OFFSET + anchor; + UNSAFE.putByte(a, base + (1L << 11) + 0, (byte)42); + UNSAFE.putByte(a, base + (1L << 11) + (1L << 30), (byte)11); + } + + // Test: check if large distance leads to assert + static void test8b(int[] a, long anchor) { + long base = UNSAFE.ARRAY_INT_BASE_OFFSET + anchor; + UNSAFE.putByte(a, base + (1L << 11) + (1L << 30), (byte)11); + UNSAFE.putByte(a, base + (1L << 11) + 0, (byte)42); + } + + // Test: check if large distance leads to assert + static void test8c(int[] a, long anchor) { + long base = UNSAFE.ARRAY_INT_BASE_OFFSET + anchor; + UNSAFE.putByte(a, base - (1L << 11) - 0, (byte)42); + UNSAFE.putByte(a, base - (1L << 11) - (1L << 30), (byte)11); + } + + // Test: check if large distance leads to assert + static void test8d(int[] a, long anchor) { + long base = UNSAFE.ARRAY_INT_BASE_OFFSET + anchor; + UNSAFE.putByte(a, base - (1L << 11) - (1L << 30), (byte)11); + UNSAFE.putByte(a, base - (1L << 11) - 0, (byte)42); + } + + // Test: check if large distance leads to assert + // case: bad distance: NaN + static void test9a(int[] a, long anchor) { + long base = UNSAFE.ARRAY_INT_BASE_OFFSET + anchor; + UNSAFE.putByte(a, base - 100, (byte)42); + UNSAFE.putByte(a, base - 100 + (1L << 31), (byte)11); + } + + // Test: check if large distance leads to assert + // case: just before NaN, it is still a valid distance for MemPointer aliasing. + static void test9b(int[] a, long anchor) { + long base = UNSAFE.ARRAY_INT_BASE_OFFSET + anchor; + UNSAFE.putByte(a, base - 100, (byte)42); + UNSAFE.putByte(a, base - 100 + (1L << 31) - 1, (byte)11); + } + + // Test: check if large distance leads to assert + // case: constant too large + static void test9c(int[] a, long anchor) { + long base = UNSAFE.ARRAY_INT_BASE_OFFSET + anchor; + UNSAFE.putByte(a, base, (byte)42); + UNSAFE.putByte(a, base + (1L << 31), (byte)11); + } +} diff --git a/test/hotspot/jtreg/compiler/c2/TestUnalignedAccess.java b/test/hotspot/jtreg/compiler/c2/TestUnalignedAccess.java new file mode 100644 index 00000000000..d05dbad4a73 --- /dev/null +++ b/test/hotspot/jtreg/compiler/c2/TestUnalignedAccess.java @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2024, Arm Limited. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package compiler.c2; + +import jdk.internal.misc.Unsafe; +import jdk.test.lib.Asserts; + +/** + * @test TestUnalignedAccess + * @summary AArch64: C2 compilation hits offset_ok_for_immed: assert "c2 compiler bug". + * @bug 8319690 + * @library /test/lib + * @modules java.base/jdk.internal.misc + * @run main/othervm compiler.c2.TestUnalignedAccess + * @run main/othervm -Xcomp -XX:-TieredCompilation -Xmx1g + * -XX:CompileCommand=compileonly,compiler.c2.TestUnalignedAccess*:: + * compiler.c2.TestUnalignedAccess + */ + +public class TestUnalignedAccess { + + public static final int LEN = 2040; + + static final Unsafe UNSAFE = Unsafe.getUnsafe(); + static void sink(int x) {} + + public static long lseed = 1; + public static int iseed = 2; + public static short sseed = 3; + public static byte bseed = 4; + public static long lres = lseed; + public static int ires = iseed; + public static short sres = sseed; + public static byte bres = bseed; + + public static class TestLong { + + private static final byte[] BYTES = new byte[LEN]; + private static final long rawdata = 0xbeef; + private static final long data; + + static { + sink(2); + // Signed immediate byte offset: range -256 to 255 + // Positive immediate byte offset: a multiple of 8 in the range 0 to 32760 + // Other immediate byte offsets can't be encoded in the instruction field. + + // 1030 can't be encoded as "base + offset" mode into the instruction field. + UNSAFE.putLongUnaligned(BYTES, 1030, rawdata); + lres += UNSAFE.getLongUnaligned(BYTES, 1030); + // 127 can be encoded into simm9 field. + UNSAFE.putLongUnaligned(BYTES, 127, lres); + lres += UNSAFE.getLongUnaligned(BYTES, 127); + // 1096 can be encoded into uimm12 field. + UNSAFE.putLongUnaligned(BYTES, 1096, lres); + data = UNSAFE.getLongUnaligned(BYTES, 1096); + } + + } + + public static class TestInt { + + private static final byte[] BYTES = new byte[LEN]; + private static final int rawdata = 0xbeef; + private static final int data; + static { + sink(2); + // Signed immediate byte offset: range -256 to 255 + // Positive immediate byte offset, a multiple of 4 in the range 0 to 16380 + // Other immediate byte offsets can't be encoded in the instruction field. + + // 274 can't be encoded as "base + offset" mode into the instruction field. + UNSAFE.putIntUnaligned(BYTES, 274, rawdata); + ires += UNSAFE.getIntUnaligned(BYTES, 274); + // 255 can be encoded into simm9 field. + UNSAFE.putIntUnaligned(BYTES, 255, ires); + ires += UNSAFE.getIntUnaligned(BYTES, 255); + // 528 can be encoded into uimm12 field. + UNSAFE.putIntUnaligned(BYTES, 528, ires); + data = UNSAFE.getIntUnaligned(BYTES, 528); + } + + } + + public static class TestShort { + + private static final byte[] BYTES = new byte[LEN]; + private static final short rawdata = (short)0xbeef; + private static final short data; + static { + sink(2); + // Signed immediate byte offset: range -256 to 255 + // Positive immediate byte offset: a multiple of 2 in the range 0 to 8190 + // Other immediate byte offsets can't be encoded in the instruction field. + + // 257 can't be encoded as "base + offset" mode into the instruction field. + UNSAFE.putShortUnaligned(BYTES, 257, rawdata); + sres = (short) (sres + UNSAFE.getShortUnaligned(BYTES, 257)); + // 253 can be encoded into simm9 field. 
+            UNSAFE.putShortUnaligned(BYTES, 253, sres);
+            sres = (short) (sres + UNSAFE.getShortUnaligned(BYTES, 253));
+            // 272 can be encoded into uimm12 field.
+            UNSAFE.putShortUnaligned(BYTES, 272, sres);
+            data = UNSAFE.getShortUnaligned(BYTES, 272);
+        }
+
+    }
+
+    public static class TestByte {
+
+        private static final byte[] BYTES = new byte[LEN];
+        private static final byte rawdata = (byte)0x3f;
+        private static final byte data;
+        static {
+            sink(2);
+            // Signed immediate byte offset: range -256 to 255
+            // Positive immediate byte offset: range 0 to 4095
+            // Other immediate byte offsets can't be encoded in the instruction field.
+
+            // 272 can be encoded into uimm12 field (it is out of the simm9 range).
+            UNSAFE.putByte(BYTES, 272, rawdata);
+            bres = (byte) (bres + UNSAFE.getByte(BYTES, 272));
+            // 53 can be encoded into simm9 field.
+            UNSAFE.putByte(BYTES, 53, bres);
+            bres = (byte) (bres + UNSAFE.getByte(BYTES, 53));
+            // 1027 can be encoded into uimm12 field.
+            UNSAFE.putByte(BYTES, 1027, bres);
+            data = UNSAFE.getByte(BYTES, 1027);
+        }
+
+    }
+
+    static void test() {
+        TestLong ta = new TestLong();
+        Asserts.assertEquals(ta.data, (ta.rawdata + lseed) * 2, "putUnaligned long failed!");
+
+        TestInt tb = new TestInt();
+        Asserts.assertEquals(tb.data, (tb.rawdata + iseed) * 2, "putUnaligned int failed!");
+
+        TestShort tc = new TestShort();
+        Asserts.assertEquals(tc.data, (short) (((short) (tc.rawdata + sseed)) * 2), "putUnaligned short failed!");
+
+        TestByte td = new TestByte();
+        Asserts.assertEquals(td.data, (byte) (((byte) (td.rawdata + bseed)) * 2), "put byte failed!");
+    }
+
+    public static void main(String[] strArr) {
+        test();
+    }
+}
diff --git a/test/micro/org/openjdk/bench/vm/compiler/MergeStoreBench.java b/test/micro/org/openjdk/bench/vm/compiler/MergeStoreBench.java
new file mode 100644
index 00000000000..26c8287c4de
--- /dev/null
+++ b/test/micro/org/openjdk/bench/vm/compiler/MergeStoreBench.java
@@ -0,0 +1,1132 @@
+/*
+ * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2024, Alibaba Group Holding Limited. All Rights Reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */ +package org.openjdk.bench.vm.compiler; + +import org.openjdk.jmh.annotations.*; +import org.openjdk.jmh.infra.Blackhole; +import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.options.Options; +import org.openjdk.jmh.runner.options.OptionsBuilder; + +import java.lang.invoke.MethodHandles; +import java.lang.invoke.VarHandle; +import java.lang.reflect.Field; +import java.nio.ByteOrder; +import java.util.Random; +import java.util.concurrent.TimeUnit; + +import jdk.internal.misc.Unsafe; + +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@State(Scope.Thread) +@Warmup(iterations = 10, time = 500, timeUnit = TimeUnit.MILLISECONDS) +@Measurement(iterations = 5, time = 1000, timeUnit = TimeUnit.MILLISECONDS) +@Fork(value = 3, jvmArgsAppend = {"--add-exports", "java.base/jdk.internal.misc=ALL-UNNAMED"}) +public class MergeStoreBench { + private static final Unsafe UNSAFE = Unsafe.getUnsafe(); + + final static VarHandle INT_L = MethodHandles.byteArrayViewVarHandle(int[].class , ByteOrder.LITTLE_ENDIAN); + final static VarHandle INT_B = MethodHandles.byteArrayViewVarHandle(int[].class , ByteOrder.BIG_ENDIAN); + final static VarHandle LONG_L = MethodHandles.byteArrayViewVarHandle(long[].class, ByteOrder.LITTLE_ENDIAN); + final static VarHandle LONG_B = MethodHandles.byteArrayViewVarHandle(long[].class, ByteOrder.BIG_ENDIAN); + final static VarHandle CHAR_L = MethodHandles.byteArrayViewVarHandle(char[].class, ByteOrder.LITTLE_ENDIAN); + final static VarHandle CHAR_B = MethodHandles.byteArrayViewVarHandle(char[].class, ByteOrder.BIG_ENDIAN); + + final static int NUMBERS = 8192; + + final byte[] bytes4 = new byte[NUMBERS * 4]; + final byte[] bytes8 = new byte[NUMBERS * 8]; + final int [] ints = new int [NUMBERS ]; + final long[] longs = new long[NUMBERS ]; + final char[] chars = new char[NUMBERS ]; + + @Setup + public void setup() { + Random r = new Random(); + for (int i = 0; i < ints.length; i++) { + ints[i] = r.nextInt(); + INT_L.set(bytes4, i * 4, i); + } + + for (int i = 0; i < longs.length; i++) { + longs[i] = r.nextLong(); + LONG_L.set(bytes8, i * 8, i); + } + } + + /* + * The names of these cases have the following `B/L/V/U` suffixes, which are: + * ``` + * B BigEndian + * L LittleEndian + * V VarHandle + * U Unsafe + * R ReverseBytes + * C Unsafe.getChar & putChar + * S Unsafe.getShort & putShort + * ``` + */ + + @Benchmark + public void getIntB(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + sum += getIntB(bytes4, i * 4); + } + BH.consume(sum); + } + + @Benchmark + public void getIntBU(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + sum += getIntBU(bytes4, i * 4); + } + BH.consume(sum); + } + + @Benchmark + public void getIntBV(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + sum += (int) INT_B.get(bytes4, i * 4); + } + BH.consume(sum); + } + + @Benchmark + public void getIntL(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + sum += getIntL(bytes4, i * 4); + } + BH.consume(sum); + } + + @Benchmark + public void getIntLU(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + sum += getIntLU(bytes4, i * 4); + } + BH.consume(sum); + } + + @Benchmark + public void getIntLV(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + sum += (int) INT_L.get(bytes4, i * 4); + } + BH.consume(sum); + } + + @Benchmark + public void getIntRB(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + sum += 
getIntRB(bytes4, i * 4); + } + BH.consume(sum); + } + + @Benchmark + public void getIntRBU(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + sum += getIntRBU(bytes4, i * 4); + } + BH.consume(sum); + } + + @Benchmark + public void getIntRL(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + sum += getIntRL(bytes4, i * 4); + } + BH.consume(sum); + } + + @Benchmark + public void getIntRLU(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + sum += getIntRLU(bytes4, i * 4); + } + BH.consume(sum); + } + + @Benchmark + public void getIntRU(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + sum += Integer.reverseBytes( + UNSAFE.getInt(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET + i * 4)); + } + BH.consume(sum); + } + + @Benchmark + public void getIntU(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + sum += UNSAFE.getInt(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET + i * 4); + } + BH.consume(sum); + } + + @Benchmark + public void setIntB(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + int v = ints[i]; + setIntB(bytes4, i * 4, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setIntBU(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + int v = ints[i]; + setIntBU(bytes4, i * 4, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setIntBV(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + int v = ints[i]; + INT_B.set(bytes4, i * 4, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setIntL(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + int v = ints[i]; + setIntL(bytes4, i * 4, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setIntLU(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + int v = ints[i]; + setIntLU(bytes4, i * 4, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setIntLV(Blackhole BH) { + long sum = 0; + for (int i = 0; i < ints.length; i++) { + int v = ints[i]; + INT_L.set(bytes4, i * 4, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setIntRB(Blackhole BH) { + long sum = 0; + for (int i = 0; i < ints.length; i++) { + int v = ints[i]; + setIntRB(bytes4, i * 4, ints[i]); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setIntRBU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < ints.length; i++) { + int v = ints[i]; + setIntRBU(bytes4, i * 4, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setIntRL(Blackhole BH) { + long sum = 0; + for (int i = 0; i < ints.length; i++) { + int v = ints[i]; + setIntRL(bytes4, i * 4, ints[i]); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setIntRLU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < ints.length; i++) { + int v = ints[i]; + setIntRLU(bytes4, i * 4, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setIntRU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < ints.length; i++) { + int v = ints[i]; + v = Integer.reverseBytes(v); + UNSAFE.putInt(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET + i * 4, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setIntU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < ints.length; i++) { + int v = ints[i]; + UNSAFE.putInt(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET + i * 4, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public 
void getLongB(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + sum += getLongB(bytes8, i * 8); + } + BH.consume(sum); + } + + @Benchmark + public void getLongBU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + sum += getLongBU(bytes8, i * 8); + } + BH.consume(sum); + } + + @Benchmark + public void getLongBV(Blackhole BH) { + long sum = 0; + for (int i = 0; i < ints.length; i++) { + sum += (long) LONG_B.get(bytes8, i * 8); + } + BH.consume(sum); + } + + @Benchmark + public void getLongL(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + sum += getLongL(bytes8, i * 8); + } + BH.consume(sum); + } + + @Benchmark + public void getLongLU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + sum += getLongLU(bytes8, i * 8); + } + BH.consume(sum); + } + + @Benchmark + public void getLongLV(Blackhole BH) { + long sum = 0; + for (int i = 0; i < ints.length; i++) { + sum += (long) LONG_L.get(bytes8, i * 8); + } + BH.consume(sum); + } + + @Benchmark + public void getLongRB(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + sum += getLongRB(bytes8, i * 8); + } + BH.consume(sum); + } + + @Benchmark + public void getLongRBU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + sum += getLongRBU(bytes8, i * 8); + } + BH.consume(sum); + } + + @Benchmark + public void getLongRL(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + sum += getLongRL(bytes8, i * 8); + } + BH.consume(sum); + } + + @Benchmark + public void getLongRLU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + sum += getLongRLU(bytes8, i * 8); + } + BH.consume(sum); + } + + @Benchmark + public void getLongRU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + sum += Long.reverseBytes( + UNSAFE.getLong(bytes8, Unsafe.ARRAY_BYTE_BASE_OFFSET + i * 8)); + } + BH.consume(sum); + } + + @Benchmark + public void getLongU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + sum += UNSAFE.getLong(bytes8, Unsafe.ARRAY_BYTE_BASE_OFFSET + i * 8); + } + BH.consume(sum); + } + + @Benchmark + public void setLongB(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + long v = longs[i]; + setLongB(bytes8, i * 8, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setLongBU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + long v = longs[i]; + setLongBU(bytes8, i * 8, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setLongBV(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + long v = longs[i]; + LONG_B.set(bytes8, i * 8, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setLongL(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + long v = longs[i]; + setLongL(bytes8, i * 8, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setLongLU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + long v = longs[i]; + setLongLU(bytes8, i * 8, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setLongLV(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + long v = longs[i]; + LONG_L.set(bytes8, i * 8, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setLongRB(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + long v = 
longs[i]; + setLongRB(bytes8, i * 8, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setLongRBU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + long v = longs[i]; + setLongRBU(bytes8, i * 8, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setLongRL(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + long v = longs[i]; + setLongRL(bytes8, i * 8, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setLongRLU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + long v = longs[i]; + setLongRLU(bytes8, i * 8, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setLongRU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + long v = longs[i]; + v = Long.reverseBytes(v); + UNSAFE.putLong(bytes8, Unsafe.ARRAY_BYTE_BASE_OFFSET + i * 8, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setLongU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + long v = longs[i]; + UNSAFE.putLong(bytes8, Unsafe.ARRAY_BYTE_BASE_OFFSET + i * 8, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void getCharB(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + char c = getCharB(bytes4, i); + sum += c; + } + BH.consume(sum); + } + + @Benchmark + public void getCharBV(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + char c = (char) CHAR_B.get(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET + i * 2); + sum += c; + } + BH.consume(sum); + } + + @Benchmark + public void getCharBU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + char c = getCharBU(bytes4, i); + sum += c; + } + BH.consume(sum); + } + + @Benchmark + public void getCharL(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + char c = getCharL(bytes4, i); + sum += c; + } + BH.consume(sum); + } + @Benchmark + public void getCharLU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + char c = getCharLU(bytes4, i); + sum += c; + } + BH.consume(sum); + } + + + @Benchmark + public void getCharLV(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + char c = (char) CHAR_L.get(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET + i * 2); + sum += c; + } + BH.consume(sum); + } + + @Benchmark + public void getCharC(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + char c = UNSAFE.getChar(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET + i * 2); + sum += c; + } + BH.consume(sum); + } + + @Benchmark + public void setCharBS(Blackhole BH) { + long sum = 0; + for (int i = 0; i < chars.length; i++) { + char c = chars[i]; + putShortB(bytes4, i * 2, c); + sum += c; + } + BH.consume(sum); + } + + @Benchmark + public void setCharBV(Blackhole BH) { + long sum = 0; + for (int i = 0; i < chars.length; i++) { + char c = chars[i]; + CHAR_B.set(bytes4, i * 2, c); + sum += c; + } + BH.consume(sum); + } + + @Benchmark + public void setCharLS(Blackhole BH) { + long sum = 0; + for (int i = 0; i < chars.length; i++) { + char c = chars[i]; + putShortL(bytes4, i * 2, c); + sum += c; + } + BH.consume(sum); + } + + @Benchmark + public void setCharLV(Blackhole BH) { + long sum = 0; + for (int i = 0; i < chars.length; i++) { + char c = chars[i]; + CHAR_L.set(bytes4, i * 2, c); + sum += c; + } + BH.consume(sum); + } + + @Benchmark + public void setCharC(Blackhole BH) { + long sum = 0; + for (int i = 0; i < 
chars.length; i++) { + char c = chars[i]; + UNSAFE.putChar(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET + i * 2, c); + sum += c; + } + BH.consume(sum); + } + + /* + * putChars4*: tests whether four adjacent constant char stores ('n', 'u', 'l', 'l') + * can be merged into a single larger store + */ + @Benchmark + public void putChars4B(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + putChars4B(bytes8, i * 4); + sum += longs[i]; + } + BH.consume(sum); + } + + @Benchmark + public void putChars4BU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + putChars4BU(bytes8, i * 4); + sum += longs[i]; + } + BH.consume(sum); + } + + @Benchmark + public void putChars4BV(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + putChars4BV(bytes8, i * 4); + sum += longs[i]; + } + BH.consume(sum); + } + + @Benchmark + public void putChars4L(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + putChars4L(bytes8, i * 4); + sum += longs[i]; + } + BH.consume(sum); + } + + @Benchmark + public void putChars4LU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + putChars4LU(bytes8, i * 4); + sum += longs[i]; + } + BH.consume(sum); + } + + @Benchmark + public void putChars4LV(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + putChars4LV(bytes8, i * 4); + sum += longs[i]; + } + BH.consume(sum); + } + + @Benchmark + public void putChars4C(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + putChars4C(bytes8, i * 4); + sum += longs[i]; + } + BH.consume(sum); + } + + @Benchmark + public void putChars4S(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + putChars4S(bytes8, i * 4); + sum += longs[i]; + } + BH.consume(sum); + } + + static int getIntB(byte[] array, int offset) { + return ((array[offset ] & 0xff) << 24) + | ((array[offset + 1] & 0xff) << 16) + | ((array[offset + 2] & 0xff) << 8) + | ((array[offset + 3] & 0xff) ); + } + + static int getIntBU(byte[] array, int offset) { + final long address = Unsafe.ARRAY_BYTE_BASE_OFFSET + offset; + return ((UNSAFE.getByte(array, address ) & 0xff) << 24) + | ((UNSAFE.getByte(array, address + 1) & 0xff) << 16) + | ((UNSAFE.getByte(array, address + 2) & 0xff) << 8) + | ((UNSAFE.getByte(array, address + 3) & 0xff) ); + } + + static int getIntL(byte[] array, int offset) { + return ((array[offset ] & 0xff) ) + | ((array[offset + 1] & 0xff) << 8) + | ((array[offset + 2] & 0xff) << 16) + | ((array[offset + 3] & 0xff) << 24); + } + + static int getIntRB(byte[] array, int offset) { + return Integer.reverseBytes(getIntB(array, offset)); + } + + static int getIntRBU(byte[] array, int offset) { + return Integer.reverseBytes(getIntBU(array, offset)); + } + + static int getIntRL(byte[] array, int offset) { + return Integer.reverseBytes(getIntL(array, offset)); + } + + static int getIntRLU(byte[] array, int offset) { + return Integer.reverseBytes(getIntLU(array, offset)); + } + + static void setIntB(byte[] array, int offset, int value) { + array[offset ] = (byte) (value >> 24); + array[offset + 1] = (byte) (value >> 16); + array[offset + 2] = (byte) (value >> 8); + array[offset + 3] = (byte) (value ); + } + + static void setIntBU(byte[] array, int offset, int value) { + final long address = Unsafe.ARRAY_BYTE_BASE_OFFSET + offset; + UNSAFE.putByte(array, address , (byte) (value >> 24)); + UNSAFE.putByte(array, address + 1, (byte) (value >> 16)); + UNSAFE.putByte(array, address + 2, (byte) (value >> 8)); + UNSAFE.putByte(array, address + 3, (byte) (value )); +
} + + public static void setIntL(byte[] array, int offset, int value) { + array[offset ] = (byte) value; + array[offset + 1] = (byte) (value >> 8); + array[offset + 2] = (byte) (value >> 16); + array[offset + 3] = (byte) (value >> 24); + } + + public static void setIntLU(byte[] array, int offset, int value) { + final long address = Unsafe.ARRAY_BYTE_BASE_OFFSET + offset; + UNSAFE.putByte(array, address , (byte) value ); + UNSAFE.putByte(array, address + 1, (byte) (value >> 8)); + UNSAFE.putByte(array, address + 2, (byte) (value >> 16)); + UNSAFE.putByte(array, address + 3, (byte) (value >> 24)); + } + + public static void setIntRL(byte[] array, int offset, int value) { + value = Integer.reverseBytes(value); + setIntL(array, offset, value); + } + + public static void setIntRLU(byte[] array, int offset, int value) { + value = Integer.reverseBytes(value); + setIntLU(array, offset, value); + } + + public static void setIntRB(byte[] array, int offset, int value) { + value = Integer.reverseBytes(value); + setIntB(array, offset, value); + } + + public static void setIntRBU(byte[] array, int offset, int value) { + value = Integer.reverseBytes(value); + setIntBU(array, offset, value); + } + + static long getLongB(byte[] array, int offset) { + return (((long) array[offset ] & 0xff) << 56) + | (((long) array[offset + 1] & 0xff) << 48) + | (((long) array[offset + 2] & 0xff) << 40) + | (((long) array[offset + 3] & 0xff) << 32) + | (((long) array[offset + 4] & 0xff) << 24) + | (((long) array[offset + 5] & 0xff) << 16) + | (((long) array[offset + 6] & 0xff) << 8) + | (((long) array[offset + 7] & 0xff) ); + } + + static long getLongBU(byte[] array, int offset) { + final long address = Unsafe.ARRAY_BYTE_BASE_OFFSET + offset; + return (((long)(UNSAFE.getByte(array, address) & 0xff)) << 56) + | (((long)(UNSAFE.getByte(array, address + 1) & 0xff)) << 48) + | (((long)(UNSAFE.getByte(array, address + 2) & 0xff)) << 40) + | (((long)(UNSAFE.getByte(array, address + 3) & 0xff)) << 32) + | (((long)(UNSAFE.getByte(array, address + 4) & 0xff)) << 24) + | (((long)(UNSAFE.getByte(array, address + 5) & 0xff)) << 16) + | (((long)(UNSAFE.getByte(array, address + 6) & 0xff)) << 8) + | (((long)(UNSAFE.getByte(array, address + 7) & 0xff)) ); + } + + public static long getLongL(byte[] array, int offset) { + return (((long) array[offset ] & 0xff) ) + | (((long) array[offset + 1] & 0xff) << 8) + | (((long) array[offset + 2] & 0xff) << 16) + | (((long) array[offset + 3] & 0xff) << 24) + | (((long) array[offset + 4] & 0xff) << 32) + | (((long) array[offset + 5] & 0xff) << 40) + | (((long) array[offset + 6] & 0xff) << 48) + | (((long) array[offset + 7] & 0xff) << 56); + } + + static long getLongLU(byte[] array, int offset) { + final long address = Unsafe.ARRAY_BYTE_BASE_OFFSET + offset; + return (((long)(UNSAFE.getByte(array, address ) & 0xff)) ) + | (((long)(UNSAFE.getByte(array, address + 1) & 0xff)) << 8) + | (((long)(UNSAFE.getByte(array, address + 2) & 0xff)) << 16) + | (((long)(UNSAFE.getByte(array, address + 3) & 0xff)) << 24) + | (((long)(UNSAFE.getByte(array, address + 4) & 0xff)) << 32) + | (((long)(UNSAFE.getByte(array, address + 5) & 0xff)) << 40) + | (((long)(UNSAFE.getByte(array, address + 6) & 0xff)) << 48) + | (((long)(UNSAFE.getByte(array, address + 7) & 0xff)) << 56); + } + + // The "R" variants load the value and then reverse its byte order, like the getIntR* helpers above. + static long getLongRB(byte[] array, int offset) { + return Long.reverseBytes(getLongB(array, offset)); + } + + static long getLongRBU(byte[] array, int offset) { + return Long.reverseBytes(getLongBU(array, offset)); + } + + static long getLongRL(byte[] array, int offset) { + return Long.reverseBytes(getLongL(array, offset)); + } + + static long getLongRLU(byte[] array, int offset) { + return Long.reverseBytes(getLongLU(array, offset)); + } + + static void setLongB(byte[] array, int offset, long value) { + array[offset] = (byte) (value >> 56); + array[offset + 1] = (byte) (value >> 48); + array[offset + 2] = (byte) (value >> 40); + array[offset + 3] = (byte) (value >> 32); + array[offset + 4] = (byte) (value >> 24); + array[offset + 5] = (byte) (value >> 16); + array[offset + 6] = (byte) (value >> 8); + array[offset + 7] = (byte) (value ); + } + + public static void setLongL(byte[] array, int offset, long value) { + array[offset] = (byte) value ; + array[offset + 1] = (byte) (value >> 8 ); + array[offset + 2] = (byte) (value >> 16); + array[offset + 3] = (byte) (value >> 24); + array[offset + 4] = (byte) (value >> 32); + array[offset + 5] = (byte) (value >> 40); + array[offset + 6] = (byte) (value >> 48); + array[offset + 7] = (byte) (value >> 56); + } + + public static void setLongRL(byte[] array, int offset, long value) { + value = Long.reverseBytes(value); + setLongL(array, offset, value); + } + + public static void setLongRLU(byte[] array, int offset, long value) { + value = Long.reverseBytes(value); + setLongLU(array, offset, value); + } + + public static void setLongRB(byte[] array, int offset, long value) { + value = Long.reverseBytes(value); + setLongB(array, offset, value); + } + + public static void setLongRBU(byte[] array, int offset, long value) { + value = Long.reverseBytes(value); + setLongBU(array, offset, value); + } + + public static void setLongBU(byte[] array, int offset, long value) { + final long address = Unsafe.ARRAY_BYTE_BASE_OFFSET + offset; + UNSAFE.putByte(array, address , (byte) (value >> 56)); + UNSAFE.putByte(array, address + 1, (byte) (value >> 48)); + UNSAFE.putByte(array, address + 2, (byte) (value >> 40)); + UNSAFE.putByte(array, address + 3, (byte) (value >> 32)); + UNSAFE.putByte(array, address + 4, (byte) (value >> 24)); + UNSAFE.putByte(array, address + 5, (byte) (value >> 16)); + UNSAFE.putByte(array, address + 6, (byte) (value >> 8)); + UNSAFE.putByte(array, address + 7, (byte) value ); + } + + public static void setLongLU(byte[] array, int offset, long value) { + final long address = Unsafe.ARRAY_BYTE_BASE_OFFSET + offset; + UNSAFE.putByte(array, address , (byte) value ); + UNSAFE.putByte(array, address + 1, (byte) (value >> 8)); + UNSAFE.putByte(array, address + 2, (byte) (value >> 16)); + UNSAFE.putByte(array, address + 3, (byte) (value >> 24)); + UNSAFE.putByte(array, address + 4, (byte) (value >> 32)); + UNSAFE.putByte(array, address + 5, (byte) (value >> 40)); + UNSAFE.putByte(array, address + 6, (byte) (value >> 48)); + UNSAFE.putByte(array, address + 7, (byte) (value >> 56)); + } + + public static int getIntLU(byte[] array, int offset) { + final long address = Unsafe.ARRAY_BYTE_BASE_OFFSET + offset; + return ((UNSAFE.getByte(array, address ) & 0xff) ) + | ((UNSAFE.getByte(array, address + 1) & 0xff) << 8) + | ((UNSAFE.getByte(array, address + 2) & 0xff) << 16) + | ((UNSAFE.getByte(array, address + 3) & 0xff) << 24); + } + + public static char getCharB(byte[] val, int index) { + index <<= 1; + return (char)(((val[index ] & 0xff) << 8) + | ((val[index + 1] & 0xff))); + } + + public static char getCharBR(byte[] val, int index) { + return Character.reverseBytes(getCharB(val, index)); + } + + public static char getCharL(byte[] val, int index) { + index <<= 1; + return (char)(((val[index ] & 0xff)) + | ((val[index + 1] & 0xff) << 8)); + } + + public static
char getCharLR(byte[] val, int index) { + return Character.reverseBytes(getCharL(val, index)); + } + + public static char getCharBU(byte[] array, int offset) { + final long address = Unsafe.ARRAY_BYTE_BASE_OFFSET + (offset << 1); + return (char) (((UNSAFE.getByte(array, address ) & 0xff) << 8) + | ((UNSAFE.getByte(array, address + 1) & 0xff) )); + } + + public static char getCharLU(byte[] array, int offset) { + final long address = Unsafe.ARRAY_BYTE_BASE_OFFSET + (offset << 1); + return (char) (((UNSAFE.getByte(array, address ) & 0xff) ) + | ((UNSAFE.getByte(array, address + 1) & 0xff) << 8)); + } + + public void putChars4B(byte[] bytes, int offset) { + char c0 = 'n', c1 = 'u', c2 = 'l', c3 = 'l'; + putShortB(bytes, offset , c0); + putShortB(bytes, offset + 1, c1); + putShortB(bytes, offset + 2, c2); + putShortB(bytes, offset + 3, c3); + } + + public void putChars4BU(byte[] bytes, int offset) { + char c0 = 'n', c1 = 'u', c2 = 'l', c3 = 'l'; + putShortBU(bytes, offset , c0); + putShortBU(bytes, offset + 1, c1); + putShortBU(bytes, offset + 2, c2); + putShortBU(bytes, offset + 3, c3); + } + + public void putChars4BV(byte[] bytes, int offset) { + char c0 = 'n', c1 = 'u', c2 = 'l', c3 = 'l'; + offset <<= 1; + CHAR_B.set(bytes, offset , c0); + CHAR_B.set(bytes, offset + 2, c1); + CHAR_B.set(bytes, offset + 4, c2); + CHAR_B.set(bytes, offset + 6, c3); + } + + public void putChars4L(byte[] bytes, int offset) { + char c0 = 'n', c1 = 'u', c2 = 'l', c3 = 'l'; + putShortL(bytes, offset , c0); + putShortL(bytes, offset + 1, c1); + putShortL(bytes, offset + 2, c2); + putShortL(bytes, offset + 3, c3); + } + + public void putChars4LV(byte[] bytes, int offset) { + char c0 = 'n', c1 = 'u', c2 = 'l', c3 = 'l'; + offset <<= 1; + CHAR_L.set(bytes, offset , c0); + CHAR_L.set(bytes, offset + 2, c1); + CHAR_L.set(bytes, offset + 4, c2); + CHAR_L.set(bytes, offset + 6, c3); + } + + public void putChars4LU(byte[] bytes, int offset) { + char c0 = 'n', c1 = 'u', c2 = 'l', c3 = 'l'; + putShortLU(bytes, offset , c0); + putShortLU(bytes, offset + 1, c1); + putShortLU(bytes, offset + 2, c2); + putShortLU(bytes, offset + 3, c3); + } + + public void putChars4C(byte[] bytes, int offset) { + char c0 = 'n', c1 = 'u', c2 = 'l', c3 = 'l'; + final long address = Unsafe.ARRAY_BYTE_BASE_OFFSET + (offset << 1); + UNSAFE.putChar(bytes, address , c0); + UNSAFE.putChar(bytes, address + 2, c1); + UNSAFE.putChar(bytes, address + 4, c2); + UNSAFE.putChar(bytes, address + 6, c3); + } + + public void putChars4S(byte[] bytes, int offset) { + char c0 = 'n', c1 = 'u', c2 = 'l', c3 = 'l'; + final long address = Unsafe.ARRAY_BYTE_BASE_OFFSET + (offset << 1); + UNSAFE.putShort(bytes, address , (short) c0); + UNSAFE.putShort(bytes, address + 2, (short) c1); + UNSAFE.putShort(bytes, address + 4, (short) c2); + UNSAFE.putShort(bytes, address + 6, (short) c3); + } + + private static void putShortB(byte[] val, int index, int c) { + index <<= 1; + val[index ] = (byte)(c >> 8); + val[index + 1] = (byte)(c ); + } + + public static void putShortBU(byte[] array, int offset, int c) { + final long address = Unsafe.ARRAY_BYTE_BASE_OFFSET + (offset << 1); + UNSAFE.putByte(array, address , (byte) (c >> 8)); + UNSAFE.putByte(array, address + 1, (byte) (c )); + } + + private static void putShortL(byte[] val, int index, int c) { + index <<= 1; + val[index ] = (byte)(c ); + val[index + 1] = (byte)(c >> 8); + } + + public static void putShortLU(byte[] array, int offset, int c) { + final long address = Unsafe.ARRAY_BYTE_BASE_OFFSET + (offset << 1); + 
UNSAFE.putByte(array, address , (byte) (c )); + UNSAFE.putByte(array, address + 1, (byte) (c >> 8)); + } +} diff --git a/test/micro/org/openjdk/bench/vm/compiler/MergeStores.java b/test/micro/org/openjdk/bench/vm/compiler/MergeStores.java new file mode 100644 index 00000000000..4db148b454c --- /dev/null +++ b/test/micro/org/openjdk/bench/vm/compiler/MergeStores.java @@ -0,0 +1,780 @@ +/* + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package org.openjdk.bench.vm.compiler; + +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; + + +import jdk.internal.misc.Unsafe; +// import jdk.internal.util.ByteArrayLittleEndian; +import java.util.concurrent.TimeUnit; + +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Warmup(iterations = 2, time = 1) +@Measurement(iterations = 3, time = 1) +@Fork(value = 1, jvmArgs = { + "--add-exports", "java.base/jdk.internal.misc=ALL-UNNAMED", + "--add-exports", "java.base/jdk.internal.util=ALL-UNNAMED"}) +@State(Scope.Thread) +public class MergeStores { + + public static final int RANGE = 100; + + static Unsafe UNSAFE = Unsafe.getUnsafe(); + + @Param("1") + public static short vS; + + @Param("1") + public static int vI; + + @Param("1") + public static long vL; + + public static int offset = 5; + public static byte[] aB = new byte[RANGE]; + public static short[] aS = new short[RANGE]; + public static int[] aI = new int[RANGE]; + public static long native_adr = UNSAFE.allocateMemory(RANGE * 8); + + // ------------------------------------------- + // ------- Little-Endian API ---------- + // ------------------------------------------- + + // Store a short LE into an array using store bytes in an array + static void storeShortLE(byte[] bytes, int offset, short value) { + storeBytes(bytes, offset, (byte)(value >> 0), + (byte)(value >> 8)); + } + + // Store an int LE into an array using store bytes in an array + static void storeIntLE(byte[] bytes, int offset, int value) { + storeBytes(bytes,
offset, (byte)(value >> 0 ), + (byte)(value >> 8 ), + (byte)(value >> 16), + (byte)(value >> 24)); + } + + // Store a long LE into an array using store bytes in an array + static void storeLongLE(byte[] bytes, int offset, long value) { + storeBytes(bytes, offset, (byte)(value >> 0 ), + (byte)(value >> 8 ), + (byte)(value >> 16), + (byte)(value >> 24), + (byte)(value >> 32), + (byte)(value >> 40), + (byte)(value >> 48), + (byte)(value >> 56)); + } + + // Store 2 bytes into an array + static void storeBytes(byte[] bytes, int offset, byte b0, byte b1) { + bytes[offset + 0] = b0; + bytes[offset + 1] = b1; + } + + // Store 4 bytes into an array + static void storeBytes(byte[] bytes, int offset, byte b0, byte b1, byte b2, byte b3) { + bytes[offset + 0] = b0; + bytes[offset + 1] = b1; + bytes[offset + 2] = b2; + bytes[offset + 3] = b3; + } + + // Store 8 bytes into an array + static void storeBytes(byte[] bytes, int offset, byte b0, byte b1, byte b2, byte b3, + byte b4, byte b5, byte b6, byte b7) { + bytes[offset + 0] = b0; + bytes[offset + 1] = b1; + bytes[offset + 2] = b2; + bytes[offset + 3] = b3; + bytes[offset + 4] = b4; + bytes[offset + 5] = b5; + bytes[offset + 6] = b6; + bytes[offset + 7] = b7; + } + + // -------------------------------- BENCHMARKS -------------------------------- + + @Benchmark + public void baseline() { + } + + @Benchmark + public byte[] baseline_allocate() { + byte[] aB = new byte[RANGE]; + return aB; + } + + @Benchmark + public byte[] store_B2_con_adr0_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[0] = (byte)0x01; + aB[1] = (byte)0x02; + return aB; + } + + @Benchmark + public byte[] store_B2_con_adr1_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[1] = (byte)0x01; + aB[2] = (byte)0x02; + return aB; + } + + @Benchmark + public byte[] store_B2_con_offs_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[offset + 0] = (byte)0x01; + aB[offset + 1] = (byte)0x02; + return aB; + } + + @Benchmark + public byte[] store_B2_con_offs_allocate_unsafe() { + byte[] aB = new byte[RANGE]; + UNSAFE.putShortUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, (short)0x0201); + return aB; + } + + /* + @Benchmark + public byte[] store_B2_con_offs_allocate_bale() { + byte[] aB = new byte[RANGE]; + ByteArrayLittleEndian.setShort(aB, offset, (short)0x0201); + return aB; + } + */ + + @Benchmark + public byte[] store_B2_con_offs_allocate_leapi() { + byte[] aB = new byte[RANGE]; + storeShortLE(aB, offset, (short)0x0201); + return aB; + } + + @Benchmark + public byte[] store_B2_con_offs_nonalloc_direct() { + aB[offset + 0] = (byte)0x01; + aB[offset + 1] = (byte)0x02; + return aB; + } + + @Benchmark + public byte[] store_B2_con_offs_nonalloc_unsafe() { + UNSAFE.putShortUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, (short)0x0201); + return aB; + } + + /* + @Benchmark + public byte[] store_B2_con_offs_nonalloc_bale() { + ByteArrayLittleEndian.setShort(aB, offset, (short)0x0201); + return aB; + } + */ + + @Benchmark + public byte[] store_B2_con_offs_nonalloc_leapi() { + storeShortLE(aB, offset, (short)0x0201); + return aB; + } + + @Benchmark + public byte[] store_B2_S_offs_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[offset + 0] = (byte)(vS >> 0 ); + aB[offset + 1] = (byte)(vS >> 8 ); + return aB; + } + + @Benchmark + public byte[] store_B2_S_offs_allocate_unsafe() { + byte[] aB = new byte[RANGE]; + UNSAFE.putShortUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vS); + return aB; + } + + /* + @Benchmark + public byte[] store_B2_S_offs_allocate_bale() {
+ byte[] aB = new byte[RANGE]; + ByteArrayLittleEndian.setShort(aB, offset, vS); + return aB; + } + */ + + @Benchmark + public byte[] store_B2_S_offs_allocate_leapi() { + byte[] aB = new byte[RANGE]; + storeShortLE(aB, offset, vS); + return aB; + } + + @Benchmark + public byte[] store_B2_S_offs_nonalloc_direct() { + aB[offset + 0] = (byte)(vS >> 0 ); + aB[offset + 1] = (byte)(vS >> 8 ); + return aB; + } + + @Benchmark + public byte[] store_B2_S_offs_nonalloc_unsafe() { + UNSAFE.putShortUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vS); + return aB; + } + + /* + @Benchmark + public byte[] store_B2_S_offs_nonalloc_bale() { + ByteArrayLittleEndian.setShort(aB, offset, vS); + return aB; + } + */ + + @Benchmark + public byte[] store_B2_S_offs_nonalloc_leapi() { + storeShortLE(aB, offset, vS); + return aB; + } + + @Benchmark + public byte[] store_B4_con_adr0_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[0] = (byte)0x01; + aB[1] = (byte)0x02; + aB[2] = (byte)0x03; + aB[3] = (byte)0x04; + return aB; + } + + @Benchmark + public byte[] store_B4_con_adr1_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[1] = (byte)0x01; + aB[2] = (byte)0x02; + aB[3] = (byte)0x03; + aB[4] = (byte)0x04; + return aB; + } + + @Benchmark + public byte[] store_B4_con_offs_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[offset + 0] = (byte)0x01; + aB[offset + 1] = (byte)0x02; + aB[offset + 2] = (byte)0x03; + aB[offset + 3] = (byte)0x04; + return aB; + } + + @Benchmark + public byte[] store_B4_con_offs_allocate_unsafe() { + byte[] aB = new byte[RANGE]; + UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, 0x04030201); + return aB; + } + + /* + @Benchmark + public byte[] store_B4_con_offs_allocate_bale() { + byte[] aB = new byte[RANGE]; + ByteArrayLittleEndian.setInt(aB, offset, 0x04030201); + return aB; + } + */ + + @Benchmark + public byte[] store_B4_con_offs_allocate_leapi() { + byte[] aB = new byte[RANGE]; + storeIntLE(aB, offset, 0x04030201); + return aB; + } + + @Benchmark + public byte[] store_B4_con_offs_nonalloc_direct() { + aB[offset + 0] = (byte)0x01; + aB[offset + 1] = (byte)0x02; + aB[offset + 2] = (byte)0x03; + aB[offset + 3] = (byte)0x04; + return aB; + } + + @Benchmark + public byte[] store_B4_con_offs_nonalloc_unsafe() { + UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, 0x04030201); + return aB; + } + + /* + @Benchmark + public byte[] store_B4_con_offs_nonalloc_bale() { + ByteArrayLittleEndian.setInt(aB, offset, 0x04030201); + return aB; + } + */ + + @Benchmark + public byte[] store_B4_con_offs_nonalloc_leapi() { + storeIntLE(aB, offset, 0x04030201); + return aB; + } + + @Benchmark + public byte[] store_B4_I_offs_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[offset + 0] = (byte)(vI >> 0 ); + aB[offset + 1] = (byte)(vI >> 8 ); + aB[offset + 2] = (byte)(vI >> 16); + aB[offset + 3] = (byte)(vI >> 24); + return aB; + } + + @Benchmark + public byte[] store_B4_I_offs_allocate_unsafe() { + byte[] aB = new byte[RANGE]; + UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vI); + return aB; + } + + /* + @Benchmark + public byte[] store_B4_I_offs_allocate_bale() { + byte[] aB = new byte[RANGE]; + ByteArrayLittleEndian.setInt(aB, offset, vI); + return aB; + } + */ + + @Benchmark + public byte[] store_B4_I_offs_allocate_leapi() { + byte[] aB = new byte[RANGE]; + storeIntLE(aB, offset, vI); + return aB; + } + + @Benchmark + public byte[] store_B4_I_offs_nonalloc_direct() { + aB[offset + 0] = (byte)(vI >> 0 ); + aB[offset + 1] = 
(byte)(vI >> 8 ); + aB[offset + 2] = (byte)(vI >> 16); + aB[offset + 3] = (byte)(vI >> 24); + return aB; + } + + @Benchmark + public byte[] store_B4_I_offs_nonalloc_unsafe() { + UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vI); + return aB; + } + + /* + @Benchmark + public byte[] store_B4_I_offs_nonalloc_bale() { + ByteArrayLittleEndian.setInt(aB, offset, vI); + return aB; + } + */ + + @Benchmark + public byte[] store_B4_I_offs_nonalloc_leapi() { + storeIntLE(aB, offset, vI); + return aB; + } + + @Benchmark + public byte[] store_B8_con_adr0_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[0] = (byte)0x01; + aB[1] = (byte)0x02; + aB[2] = (byte)0x03; + aB[3] = (byte)0x04; + aB[4] = (byte)0x05; + aB[5] = (byte)0x06; + aB[6] = (byte)0x07; + aB[7] = (byte)0x08; + return aB; + } + + @Benchmark + public byte[] store_B8_con_adr1_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[1] = (byte)0x01; + aB[2] = (byte)0x02; + aB[3] = (byte)0x03; + aB[4] = (byte)0x04; + aB[5] = (byte)0x05; + aB[6] = (byte)0x06; + aB[7] = (byte)0x07; + aB[8] = (byte)0x08; + return aB; + } + + @Benchmark + public byte[] store_B8_con_offs_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[offset + 0] = (byte)0x01; + aB[offset + 1] = (byte)0x02; + aB[offset + 2] = (byte)0x03; + aB[offset + 3] = (byte)0x04; + aB[offset + 4] = (byte)0x05; + aB[offset + 5] = (byte)0x06; + aB[offset + 6] = (byte)0x07; + aB[offset + 7] = (byte)0x08; + return aB; + } + + @Benchmark + public byte[] store_B8_con_offs_allocate_unsafe() { + byte[] aB = new byte[RANGE]; + UNSAFE.putLongUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, 0x0807060504030201L); + return aB; + } + + /* + @Benchmark + public byte[] store_B8_con_offs_allocate_bale() { + byte[] aB = new byte[RANGE]; + ByteArrayLittleEndian.setLong(aB, offset, 0x0807060504030201L); + return aB; + } + */ + + @Benchmark + public byte[] store_B8_con_offs_allocate_leapi() { + byte[] aB = new byte[RANGE]; + storeLongLE(aB, offset, 0x0807060504030201L); + return aB; + } + + @Benchmark + public byte[] store_B8_con_offs_nonalloc_direct() { + aB[offset + 0] = (byte)0x01; + aB[offset + 1] = (byte)0x02; + aB[offset + 2] = (byte)0x03; + aB[offset + 3] = (byte)0x04; + aB[offset + 4] = (byte)0x05; + aB[offset + 5] = (byte)0x06; + aB[offset + 6] = (byte)0x07; + aB[offset + 7] = (byte)0x08; + return aB; + } + + @Benchmark + public byte[] store_B8_con_offs_nonalloc_unsafe() { + UNSAFE.putLongUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, 0x0807060504030201L); + return aB; + } + + /* + @Benchmark + public byte[] store_B8_con_offs_nonalloc_bale() { + ByteArrayLittleEndian.setLong(aB, offset, 0x0807060504030201L); + return aB; + } + */ + + @Benchmark + public byte[] store_B8_con_offs_nonalloc_leapi() { + storeLongLE(aB, offset, 0x0807060504030201L); + return aB; + } + + @Benchmark + public byte[] store_B8_L_offs_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[offset + 0] = (byte)(vL >> 0 ); + aB[offset + 1] = (byte)(vL >> 8 ); + aB[offset + 2] = (byte)(vL >> 16); + aB[offset + 3] = (byte)(vL >> 24); + aB[offset + 4] = (byte)(vL >> 32); + aB[offset + 5] = (byte)(vL >> 40); + aB[offset + 6] = (byte)(vL >> 48); + aB[offset + 7] = (byte)(vL >> 56); + return aB; + } + + @Benchmark + public byte[] store_B8_L_offs_allocate_unsafe() { + byte[] aB = new byte[RANGE]; + UNSAFE.putLongUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vL); + return aB; + } + + /* + @Benchmark + public byte[] store_B8_L_offs_allocate_bale() { + byte[] aB = new byte[RANGE]; + 
ByteArrayLittleEndian.setLong(aB, offset, vL); + return aB; + } + */ + + @Benchmark + public byte[] store_B8_L_offs_allocate_leapi() { + byte[] aB = new byte[RANGE]; + storeLongLE(aB, offset, vL); + return aB; + } + + @Benchmark + public byte[] store_B8_L_offs_nonalloc_direct() { + aB[offset + 0] = (byte)(vL >> 0 ); + aB[offset + 1] = (byte)(vL >> 8 ); + aB[offset + 2] = (byte)(vL >> 16); + aB[offset + 3] = (byte)(vL >> 24); + aB[offset + 4] = (byte)(vL >> 32); + aB[offset + 5] = (byte)(vL >> 40); + aB[offset + 6] = (byte)(vL >> 48); + aB[offset + 7] = (byte)(vL >> 56); + return aB; + } + + @Benchmark + public byte[] store_B8_L_offs_nonalloc_unsafe() { + UNSAFE.putLongUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vL); + return aB; + } + + /* + @Benchmark + public byte[] store_B8_L_offs_nonalloc_bale() { + ByteArrayLittleEndian.setLong(aB, offset, vL); + return aB; + } + */ + + @Benchmark + public byte[] store_B8_L_offs_nonalloc_leapi() { + storeLongLE(aB, offset, vL); + return aB; + } + + @Benchmark + public byte[] store_B8_I2_offs_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[offset + 0] = (byte)(vI >> 0 ); + aB[offset + 1] = (byte)(vI >> 8 ); + aB[offset + 2] = (byte)(vI >> 16); + aB[offset + 3] = (byte)(vI >> 24); + aB[offset + 4] = (byte)(vI >> 0 ); + aB[offset + 5] = (byte)(vI >> 8 ); + aB[offset + 6] = (byte)(vI >> 16); + aB[offset + 7] = (byte)(vI >> 24); + return aB; + } + + @Benchmark + public byte[] store_B8_I2_offs_allocate_unsafe() { + byte[] aB = new byte[RANGE]; + UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 0, vI); + UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 4, vI); + return aB; + } + + /* + @Benchmark + public byte[] store_B8_I2_offs_allocate_bale() { + byte[] aB = new byte[RANGE]; + ByteArrayLittleEndian.setInt(aB, offset + 0, vI); + ByteArrayLittleEndian.setInt(aB, offset + 4, vI); + return aB; + } + */ + + @Benchmark + public byte[] store_B8_I2_offs_allocate_leapi() { + byte[] aB = new byte[RANGE]; + storeIntLE(aB, offset + 0, vI); + storeIntLE(aB, offset + 4, vI); + return aB; + } + + @Benchmark + public byte[] store_B8_I2_offs_nonalloc_direct() { + aB[offset + 0] = (byte)(vI >> 0 ); + aB[offset + 1] = (byte)(vI >> 8 ); + aB[offset + 2] = (byte)(vI >> 16); + aB[offset + 3] = (byte)(vI >> 24); + aB[offset + 4] = (byte)(vI >> 0 ); + aB[offset + 5] = (byte)(vI >> 8 ); + aB[offset + 6] = (byte)(vI >> 16); + aB[offset + 7] = (byte)(vI >> 24); + return aB; + } + + @Benchmark + public byte[] store_B8_I2_offs_nonalloc_unsafe() { + UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 0, vI); + UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 4, vI); + return aB; + } + + /* + @Benchmark + public byte[] store_B8_I2_offs_nonalloc_bale() { + ByteArrayLittleEndian.setInt(aB, offset + 0, vI); + ByteArrayLittleEndian.setInt(aB, offset + 4, vI); + return aB; + } + */ + + @Benchmark + public byte[] store_B8_I2_offs_nonalloc_leapi() { + storeIntLE(aB, offset + 0, vI); + storeIntLE(aB, offset + 4, vI); + return aB; + } + + @Benchmark + public short[] store_S2_con_offs_allocate_direct() { + short[] aS = new short[RANGE]; + aS[offset + 0] = (short)0x0102; + aS[offset + 1] = (short)0x0304; + return aS; + } + + @Benchmark + public short[] store_S2_con_offs_nonalloc_direct() { + aS[offset + 0] = (short)0x0102; + aS[offset + 1] = (short)0x0304; + return aS; + } + + @Benchmark + public short[] store_S4_con_offs_allocate_direct() { + short[] aS = new short[RANGE]; + aS[offset + 0] =
(short)0x0102; + aS[offset + 1] = (short)0x0304; + aS[offset + 2] = (short)0x0506; + aS[offset + 3] = (short)0x0708; + return aS; + } + + @Benchmark + public short[] store_S4_con_offs_nonalloc_direct() { + aS[offset + 0] = (short)0x0102; + aS[offset + 1] = (short)0x0304; + aS[offset + 2] = (short)0x0506; + aS[offset + 3] = (short)0x0708; + return aS; + } + + @Benchmark + public int[] store_I2_con_offs_allocate_direct() { + int[] aI = new int[RANGE]; + aI[offset + 0] = 0x01020304; + aI[offset + 1] = 0x05060708; + return aI; + } + + @Benchmark + public int[] store_I2_con_offs_nonalloc_direct() { + aI[offset + 0] = 0x01020304; + aI[offset + 1] = 0x05060708; + return aI; + } + + @Benchmark + public int[] store_I2_zero_offs_allocate_direct() { + int[] aI = new int[RANGE]; + aI[offset + 0] = 0; + aI[offset + 1] = 0; + return aI; + } + + @Benchmark + public int[] store_I2_zero_offs_nonalloc_direct() { + aI[offset + 0] = 0; + aI[offset + 1] = 0; + return aI; + } + + @Benchmark + public void store_unsafe_B8_L_offs_noalloc_direct() { + UNSAFE.putByte(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 0, (byte)(vL >> 0 )); + UNSAFE.putByte(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 1, (byte)(vL >> 8 )); + UNSAFE.putByte(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 2, (byte)(vL >> 16)); + UNSAFE.putByte(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 3, (byte)(vL >> 24)); + UNSAFE.putByte(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 4, (byte)(vL >> 32)); + UNSAFE.putByte(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 5, (byte)(vL >> 40)); + UNSAFE.putByte(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 6, (byte)(vL >> 48)); + UNSAFE.putByte(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 7, (byte)(vL >> 56)); + } + + @Benchmark + public void store_unsafe_B8_L_offs_noalloc_unsafe() { + UNSAFE.putLongUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 0, vL); + } + + @Benchmark + public void store_unsafe_C4_L_offs_noalloc_direct() { + UNSAFE.putChar(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 0, (char)(vL >> 0 )); + UNSAFE.putChar(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 2, (char)(vL >> 16)); + UNSAFE.putChar(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 4, (char)(vL >> 32)); + UNSAFE.putChar(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 6, (char)(vL >> 48)); + } + + @Benchmark + public void store_unsafe_native_B8_L_offs_noalloc_direct() { + UNSAFE.putByte(null, native_adr + offset + 0, (byte)(vL >> 0 )); + UNSAFE.putByte(null, native_adr + offset + 1, (byte)(vL >> 8 )); + UNSAFE.putByte(null, native_adr + offset + 2, (byte)(vL >> 16)); + UNSAFE.putByte(null, native_adr + offset + 3, (byte)(vL >> 24)); + UNSAFE.putByte(null, native_adr + offset + 4, (byte)(vL >> 32)); + UNSAFE.putByte(null, native_adr + offset + 5, (byte)(vL >> 40)); + UNSAFE.putByte(null, native_adr + offset + 6, (byte)(vL >> 48)); + UNSAFE.putByte(null, native_adr + offset + 7, (byte)(vL >> 56)); + } + + @Benchmark + public void store_unsafe_native_C4_L_offs_noalloc_direct() { + UNSAFE.putChar(null, native_adr + offset + 0, (char)(vL >> 0 )); + UNSAFE.putChar(null, native_adr + offset + 2, (char)(vL >> 16)); + UNSAFE.putChar(null, native_adr + offset + 4, (char)(vL >> 32)); + UNSAFE.putChar(null, native_adr + offset + 6, (char)(vL >> 48)); + } + + @Benchmark + public void store_unsafe_native_B8_L_offs_noalloc_unsafe() { + UNSAFE.putLongUnaligned(null, native_adr + offset + 0, vL); + } + + @Fork(value = 1, jvmArgsPrepend = { + "-XX:+UnlockDiagnosticVMOptions", "-XX:-MergeStores" + }) + public static class MergeStoresDisabled extends MergeStores 
{} +}