From db5e1b1890ab06e1514bfd7356ead51ad4240977 Mon Sep 17 00:00:00 2001 From: Kuai Wei Date: Mon, 30 Dec 2024 10:54:44 +0800 Subject: [PATCH] [JIT] Backport 8318446: C2: optimize stores into primitive arrays by combining values into larger store Summary: include these patches for merge stores optimization 8318446: C2: optimize stores into primitive arrays by combining values into larger store 8319690: [AArch64] C2 compilation hits offset_ok_for_immed: assert "c2 compiler bug" 8335390: C2 MergeStores: wrong result with Unsafe 8331311: C2: Big Endian Port of 8318446: optimize stores into primitive arrays by combining values into larger store 8331085: Crash in MergePrimitiveArrayStores::is_compatible_store() 8331252: C2: MergeStores: handle negative shift values 8331054: C2 MergeStores: assert failed: unexpected basic type after JDK-8318446 and JDK-8329555 8335392: C2 MergeStores: enhanced pointer parsing 8334342: Add MergeStore JMH benchmarks 8226411: C2: Avoid memory barriers around off-heap unsafe accesses Fix is_ConI() query after port 8318446 Fix for comments Testing: CI/CD Reviewers: zhuoren.wz, MaxXSoft Issue: https://github.com/dragonwell-project/dragonwell11/issues/920 Fix for comments Add missing assertion in round_down_power_of_2() --- .../share/compiler/compilerDirectives.hpp | 1 + src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp | 5 +- .../share/gc/shared/c2/barrierSetC2.cpp | 16 +- src/hotspot/share/oops/accessDecorators.hpp | 4 +- src/hotspot/share/opto/addnode.cpp | 4 +- src/hotspot/share/opto/addnode.hpp | 2 +- src/hotspot/share/opto/c2_globals.hpp | 6 + src/hotspot/share/opto/connode.hpp | 5 +- src/hotspot/share/opto/library_call.cpp | 12 +- src/hotspot/share/opto/memnode.cpp | 563 ++++++++ src/hotspot/share/opto/mempointer.cpp | 383 ++++++ src/hotspot/share/opto/mempointer.hpp | 618 +++++++++ src/hotspot/share/opto/noOverflowInt.hpp | 114 ++ src/hotspot/share/opto/node.hpp | 4 + src/hotspot/share/opto/phaseX.cpp | 10 +- .../gtest/opto/test_no_overflow_int.cpp | 175 +++ .../c2/TestMergeStoresNullAdrType.java | 56 + .../c2/TestMergeStoresUnsafeArrayPointer.java | 324 +++++ .../compiler/c2/TestUnalignedAccess.java | 172 +++ .../bench/vm/compiler/MergeStoreBench.java | 1132 +++++++++++++++++ .../bench/vm/compiler/MergeStores.java | 780 ++++++++++++ 21 files changed, 4368 insertions(+), 18 deletions(-) create mode 100644 src/hotspot/share/opto/mempointer.cpp create mode 100644 src/hotspot/share/opto/mempointer.hpp create mode 100644 src/hotspot/share/opto/noOverflowInt.hpp create mode 100644 test/hotspot/gtest/opto/test_no_overflow_int.cpp create mode 100644 test/hotspot/jtreg/compiler/c2/TestMergeStoresNullAdrType.java create mode 100644 test/hotspot/jtreg/compiler/c2/TestMergeStoresUnsafeArrayPointer.java create mode 100644 test/hotspot/jtreg/compiler/c2/TestUnalignedAccess.java create mode 100644 test/micro/org/openjdk/bench/vm/compiler/MergeStoreBench.java create mode 100644 test/micro/org/openjdk/bench/vm/compiler/MergeStores.java diff --git a/src/hotspot/share/compiler/compilerDirectives.hpp b/src/hotspot/share/compiler/compilerDirectives.hpp index e2861bb6044..7e0bc7acb01 100644 --- a/src/hotspot/share/compiler/compilerDirectives.hpp +++ b/src/hotspot/share/compiler/compilerDirectives.hpp @@ -61,6 +61,7 @@ cflags(PrintIntrinsics, bool, PrintIntrinsics, PrintIntrinsics) \ NOT_PRODUCT(cflags(TraceOptoPipelining, bool, TraceOptoPipelining, TraceOptoPipelining)) \ NOT_PRODUCT(cflags(TraceOptoOutput, bool, TraceOptoOutput, TraceOptoOutput)) \ +NOT_PRODUCT(cflags(TraceMergeStores, 
bool, TraceMergeStores, TraceMergeStores)) \ cflags(TraceSpilling, bool, TraceSpilling, TraceSpilling) \ cflags(Vectorize, bool, false, Vectorize) \ cflags(VectorizeDebug, uintx, 0, VectorizeDebug) \ diff --git a/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp b/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp index 8d97939a459..d6e4b363323 100644 --- a/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp +++ b/src/hotspot/share/gc/g1/c2/g1BarrierSetC2.cpp @@ -607,12 +607,15 @@ Node* G1BarrierSetC2::load_at_resolved(C2Access& access, const Type* val_type) c Node* adr = access.addr().node(); Node* obj = access.base(); + bool anonymous = (decorators & C2_UNSAFE_ACCESS) != 0; bool mismatched = (decorators & C2_MISMATCHED) != 0; bool unknown = (decorators & ON_UNKNOWN_OOP_REF) != 0; bool in_heap = (decorators & IN_HEAP) != 0; + bool in_native = (decorators & IN_NATIVE) != 0; bool on_weak = (decorators & ON_WEAK_OOP_REF) != 0; bool is_unordered = (decorators & MO_UNORDERED) != 0; - bool need_cpu_mem_bar = !is_unordered || mismatched || !in_heap; + bool is_mixed = !in_heap && !in_native; + bool need_cpu_mem_bar = !is_unordered || mismatched || is_mixed; Node* offset = adr->is_AddP() ? adr->in(AddPNode::Offset) : kit->top(); Node* load = CardTableBarrierSetC2::load_at_resolved(access, val_type); diff --git a/src/hotspot/share/gc/shared/c2/barrierSetC2.cpp b/src/hotspot/share/gc/shared/c2/barrierSetC2.cpp index 5452756444c..1a8021b4561 100644 --- a/src/hotspot/share/gc/shared/c2/barrierSetC2.cpp +++ b/src/hotspot/share/gc/shared/c2/barrierSetC2.cpp @@ -40,13 +40,15 @@ void* C2Access::barrier_set_state() const { } bool C2Access::needs_cpu_membar() const { - bool mismatched = (_decorators & C2_MISMATCHED) != 0; + bool mismatched = (_decorators & C2_MISMATCHED) != 0; bool is_unordered = (_decorators & MO_UNORDERED) != 0; bool anonymous = (_decorators & C2_UNSAFE_ACCESS) != 0; - bool in_heap = (_decorators & IN_HEAP) != 0; + bool in_heap = (_decorators & IN_HEAP) != 0; + bool in_native = (_decorators & IN_NATIVE) != 0; + bool is_mixed = !in_heap && !in_native; - bool is_write = (_decorators & C2_WRITE_ACCESS) != 0; - bool is_read = (_decorators & C2_READ_ACCESS) != 0; + bool is_write = (_decorators & C2_WRITE_ACCESS) != 0; + bool is_read = (_decorators & C2_READ_ACCESS) != 0; bool is_atomic = is_read && is_write; if (is_atomic) { @@ -60,9 +62,11 @@ bool C2Access::needs_cpu_membar() const { // the barriers get omitted and the unsafe reference begins to "pollute" // the alias analysis of the rest of the graph, either Compile::can_alias // or Compile::must_alias will throw a diagnostic assert.) 
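The key new case is the "mixed" access: the compiler can prove neither IN_HEAP nor IN_NATIVE, so the access may touch either kind of memory at runtime. A minimal sketch of the classification this patch introduces (hypothetical standalone helper; the real checks are inlined in C2Access::needs_cpu_membar below):

    // Mixed = neither IN_HEAP nor IN_NATIVE could be proven, i.e. the base may
    // be a heap oop or null (off-heap) at runtime. Only mixed unsafe accesses
    // still need the conservative CPU membar; unordered, non-mismatched
    // accesses that are provably on-heap or provably off-heap can skip it
    // (JDK-8226411).
    static bool is_mixed_access(DecoratorSet decorators) {
      const bool in_heap   = (decorators & IN_HEAP)   != 0;
      const bool in_native = (decorators & IN_NATIVE) != 0;
      return !in_heap && !in_native;
    }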
- if (!in_heap || !is_unordered || (mismatched && !_addr.type()->isa_aryptr())) { + if (is_mixed || !is_unordered || (mismatched && !_addr.type()->isa_aryptr())) { return true; } + } else { + assert(!is_mixed, "not unsafe"); } return false; @@ -78,7 +82,7 @@ Node* BarrierSetC2::store_at_resolved(C2Access& access, C2AccessValue& val) cons bool requires_atomic_access = (decorators & MO_UNORDERED) == 0; bool in_native = (decorators & IN_NATIVE) != 0; - assert(!in_native, "not supported yet"); + assert(!in_native || (unsafe && !access.is_oop()), "not supported yet"); if (access.type() == T_DOUBLE) { Node* new_val = kit->dstore_rounding(val.node()); diff --git a/src/hotspot/share/oops/accessDecorators.hpp b/src/hotspot/share/oops/accessDecorators.hpp index ab27c7e5240..c6d83c6bbdc 100644 --- a/src/hotspot/share/oops/accessDecorators.hpp +++ b/src/hotspot/share/oops/accessDecorators.hpp @@ -174,11 +174,11 @@ const DecoratorSet ON_DECORATOR_MASK = ON_STRONG_OOP_REF | ON_WEAK_OOP_REF | ON_PHANTOM_OOP_REF | ON_UNKNOWN_OOP_REF; // === Access Location === -// Accesses can take place in, e.g. the heap, old or young generation and different native roots. +// Accesses can take place in, e.g. the heap, old or young generation, different native roots, or native memory off the heap. // The location is important to the GC as it may imply different actions. The following decorators are used: // * IN_HEAP: The access is performed in the heap. Many barriers such as card marking will // be omitted if this decorator is not set. -// * IN_NATIVE: The access is performed in an off-heap data structure pointing into the Java heap. +// * IN_NATIVE: The access is performed in an off-heap data structure. const DecoratorSet IN_HEAP = UCONST64(1) << 19; const DecoratorSet IN_NATIVE = UCONST64(1) << 20; const DecoratorSet IN_DECORATOR_MASK = IN_HEAP | IN_NATIVE; diff --git a/src/hotspot/share/opto/addnode.cpp b/src/hotspot/share/opto/addnode.cpp index bdde4fe8dfe..1450b4d15ea 100644 --- a/src/hotspot/share/opto/addnode.cpp +++ b/src/hotspot/share/opto/addnode.cpp @@ -704,9 +704,9 @@ Node* AddPNode::Ideal_base_and_offset(Node* ptr, PhaseTransform* phase, //------------------------------unpack_offsets---------------------------------- // Collect the AddP offset values into the elements array, giving up // if there are more than length. -int AddPNode::unpack_offsets(Node* elements[], int length) { +int AddPNode::unpack_offsets(Node* elements[], int length) const { int count = 0; - Node* addr = this; + Node const* addr = this; Node* base = addr->in(AddPNode::Base); while (addr->is_AddP()) { if (addr->in(AddPNode::Base) != base) { diff --git a/src/hotspot/share/opto/addnode.hpp b/src/hotspot/share/opto/addnode.hpp index 30319a7150c..1897d013a7a 100644 --- a/src/hotspot/share/opto/addnode.hpp +++ b/src/hotspot/share/opto/addnode.hpp @@ -154,7 +154,7 @@ class AddPNode : public Node { // Collect the AddP offset values into the elements array, giving up // if there are more than length. 
-  int unpack_offsets(Node* elements[], int length);
+  int unpack_offsets(Node* elements[], int length) const;
 
   // Do not match base-ptr edge
   virtual uint match_edge(uint idx) const;
diff --git a/src/hotspot/share/opto/c2_globals.hpp b/src/hotspot/share/opto/c2_globals.hpp
index ea5cd8299cd..363783e72d2 100644
--- a/src/hotspot/share/opto/c2_globals.hpp
+++ b/src/hotspot/share/opto/c2_globals.hpp
@@ -354,6 +354,12 @@
   notproduct(bool, TraceNewVectors, false,                                  \
           "Trace creation of Vector nodes")                                 \
                                                                             \
+  diagnostic(bool, MergeStores, true,                                       \
+          "Optimize stores by combining values into larger store")          \
+                                                                            \
+  develop(bool, TraceMergeStores, false,                                    \
+          "Trace creation of merged stores")                                \
+                                                                            \
   product_pd(bool, OptoBundling,                                            \
           "Generate nops to fill i-cache lines")                            \
                                                                             \
diff --git a/src/hotspot/share/opto/connode.hpp b/src/hotspot/share/opto/connode.hpp
index 1d175461e30..4171df8c175 100644
--- a/src/hotspot/share/opto/connode.hpp
+++ b/src/hotspot/share/opto/connode.hpp
@@ -39,6 +39,7 @@ class ConNode : public TypeNode {
   ConNode( const Type *t ) : TypeNode(t->remove_speculative(),1) {
     init_req(0, (Node*)Compile::current()->root());
     init_flags(Flag_is_Con);
+    init_class_id(Class_Con);
   }
   virtual int Opcode() const;
   virtual uint hash() const;
@@ -53,7 +54,9 @@ class ConNode : public TypeNode {
 // Simple integer constants
 class ConINode : public ConNode {
 public:
-  ConINode( const TypeInt *t ) : ConNode(t) {}
+  ConINode( const TypeInt *t ) : ConNode(t) {
+    init_class_id(Class_ConI);
+  }
   virtual int Opcode() const;
 
   // Factory method:
diff --git a/src/hotspot/share/opto/library_call.cpp b/src/hotspot/share/opto/library_call.cpp
index 9fdc0835f65..d7b7540c75f 100644
--- a/src/hotspot/share/opto/library_call.cpp
+++ b/src/hotspot/share/opto/library_call.cpp
@@ -2201,10 +2201,14 @@ bool LibraryCallKit::inline_unsafe_access(bool is_store, const BasicType type, c
   offset = ConvL2X(offset);
   adr = make_unsafe_address(base, offset, type, kind == Relaxed);
 
-  if (_gvn.type(base)->isa_ptr() != TypePtr::NULL_PTR) {
-    heap_base_oop = base;
-  } else if (type == T_OBJECT) {
-    return false; // off-heap oop accesses are not supported
+  if (_gvn.type(base)->isa_ptr() == TypePtr::NULL_PTR) {
+    if (type != T_OBJECT) {
+      decorators |= IN_NATIVE; // off-heap primitive access
+    } else {
+      return false; // off-heap oop accesses are not supported
+    }
+  } else {
+    heap_base_oop = base; // on-heap or mixed access
   }
 
   // Can base be NULL? Otherwise, always on-heap access.
diff --git a/src/hotspot/share/opto/memnode.cpp b/src/hotspot/share/opto/memnode.cpp
index c33c488588a..bbd82b3deac 100644
--- a/src/hotspot/share/opto/memnode.cpp
+++ b/src/hotspot/share/opto/memnode.cpp
@@ -40,6 +40,7 @@
 #include "opto/machnode.hpp"
 #include "opto/matcher.hpp"
 #include "opto/memnode.hpp"
+#include "opto/mempointer.hpp"
 #include "opto/mulnode.hpp"
 #include "opto/narrowptrnode.hpp"
 #include "opto/phaseX.hpp"
@@ -2561,6 +2562,558 @@ uint StoreNode::hash() const {
   return NO_HASH;
 }
 
+// Link together multiple stores (B/S/C/I) into a longer one.
+//
+// Example: _store = StoreB[i+3]
+//
+//   RangeCheck[i+0]           RangeCheck[i+0]
+//   StoreB[i+0]
+//   RangeCheck[i+3]           RangeCheck[i+3]
+//   StoreB[i+1]         -->   pass:             fail:
+//   StoreB[i+2]               StoreI[i+0]       StoreB[i+0]
+//   StoreB[i+3]
+//
+// The 4 StoreB are merged into a single StoreI node. We have to be careful with RangeCheck[i+3]: before
+// the optimization, if this RangeCheck[i+3] fails, then we execute only StoreB[i+0], and then trap.
After +// the optimization, the new StoreI[i+0] is on the passing path of RangeCheck[i+3], and StoreB[i+0] on the +// failing path. +// +// Note: For normal array stores, every store at first has a RangeCheck. But they can be removed with: +// - RCE (RangeCheck Elimination): the RangeChecks in the loop are hoisted out and before the loop, +// and possibly no RangeChecks remain between the stores. +// - RangeCheck smearing: the earlier RangeChecks are adjusted such that they cover later RangeChecks, +// and those later RangeChecks can be removed. Example: +// +// RangeCheck[i+0] RangeCheck[i+0] <- before first store +// StoreB[i+0] StoreB[i+0] <- first store +// RangeCheck[i+1] --> smeared --> RangeCheck[i+3] <- only RC between first and last store +// StoreB[i+1] StoreB[i+1] <- second store +// RangeCheck[i+2] --> removed +// StoreB[i+2] StoreB[i+2] +// RangeCheck[i+3] --> removed +// StoreB[i+3] StoreB[i+3] <- last store +// +// Thus, it is a common pattern that between the first and last store in a chain +// of adjacent stores there remains exactly one RangeCheck, located between the +// first and the second store (e.g. RangeCheck[i+3]). +// +class MergePrimitiveStores : public StackObj { +private: + PhaseGVN* const _phase; + StoreNode* const _store; + + NOT_PRODUCT( const bool _trace;) + +public: + MergePrimitiveStores(PhaseGVN* phase, StoreNode* store) : + _phase(phase), _store(store) + NOT_PRODUCT( COMMA _trace(Compile::current()->directive()->TraceMergeStoresOption) ) + {} + + StoreNode* run(); + +private: + bool is_compatible_store(const StoreNode* other_store) const; + bool is_adjacent_pair(const StoreNode* use_store, const StoreNode* def_store) const; + bool is_adjacent_input_pair(const Node* n1, const Node* n2, const int memory_size) const; + static bool is_con_RShift(const Node* n, Node const*& base_out, jint& shift_out); + enum CFGStatus { CFG_SuccessNoRangeCheck, CFG_SuccessWithRangeCheck, CFG_Failure }; + static CFGStatus cfg_status_for_pair(const StoreNode* use_store, const StoreNode* def_store); + + class Status { + private: + StoreNode* _found_store; + bool _found_range_check; + + Status(StoreNode* found_store, bool found_range_check) + : _found_store(found_store), _found_range_check(found_range_check) {} + + public: + StoreNode* found_store() const { return _found_store; } + bool found_range_check() const { return _found_range_check; } + static Status make_failure() { return Status(NULL, false); } + + static Status make(StoreNode* found_store, const CFGStatus cfg_status) { + if (cfg_status == CFG_Failure) { + return Status::make_failure(); + } + return Status(found_store, cfg_status == CFG_SuccessWithRangeCheck); + } + +#ifndef PRODUCT + void print_on(outputStream* st) const { + if (_found_store == NULL) { + st->print_cr("None"); + } else { + st->print_cr("Found[%d %s, %s]", _found_store->_idx, _found_store->Name(), + _found_range_check ? 
"RC" : "no-RC"); + } + } +#endif + }; + + Status find_adjacent_use_store(const StoreNode* def_store) const; + Status find_adjacent_def_store(const StoreNode* use_store) const; + Status find_use_store(const StoreNode* def_store) const; + Status find_def_store(const StoreNode* use_store) const; + Status find_use_store_unidirectional(const StoreNode* def_store) const; + Status find_def_store_unidirectional(const StoreNode* use_store) const; + + void collect_merge_list(Node_List& merge_list) const; + Node* make_merged_input_value(const Node_List& merge_list); + StoreNode* make_merged_store(const Node_List& merge_list, Node* merged_input_value); + +#ifndef PRODUCT + bool is_trace_basic() const { + return _trace; + } + + bool is_trace_pointer() const { + return _trace; + } + + bool is_trace_aliasing() const { + return _trace; + } + + bool is_trace_adjacency() const { + return _trace; + } + + bool is_trace_success() const { + return _trace; + } + +#endif + + NOT_PRODUCT( void trace(const Node_List& merge_list, const Node* merged_input_value, const StoreNode* merged_store) const; ) +}; + +StoreNode* MergePrimitiveStores::run() { + // Check for B/S/C/I + int opc = _store->Opcode(); + if (opc != Op_StoreB && opc != Op_StoreC && opc != Op_StoreI) { + return NULL; + } + + NOT_PRODUCT( if (is_trace_basic()) { tty->print("[TraceMergeStores] MergePrimitiveStores::run: "); _store->dump(); }) + + // The _store must be the "last" store in a chain. If we find a use we could merge with + // then that use or a store further down is the "last" store. + Status status_use = find_adjacent_use_store(_store); + NOT_PRODUCT( if (is_trace_basic()) { tty->print("[TraceMergeStores] expect no use: "); status_use.print_on(tty); }) + if (status_use.found_store() != NULL) { + return NULL; + } + + // Check if we can merge with at least one def, so that we have at least 2 stores to merge. + Status status_def = find_adjacent_def_store(_store); + NOT_PRODUCT( if (is_trace_basic()) { tty->print("[TraceMergeStores] expect def: "); status_def.print_on(tty); }) + if (status_def.found_store() == NULL) { + return NULL; + } + + ResourceMark rm; + Node_List merge_list; + collect_merge_list(merge_list); + + Node* merged_input_value = make_merged_input_value(merge_list); + if (merged_input_value == NULL) { return NULL; } + + StoreNode* merged_store = make_merged_store(merge_list, merged_input_value); + + NOT_PRODUCT( if (is_trace_success()) { trace(merge_list, merged_input_value, merged_store); } ) + + return merged_store; +} + +// Check compatibility between _store and other_store. 
+bool MergePrimitiveStores::is_compatible_store(const StoreNode* other_store) const { + int opc = _store->Opcode(); + assert(opc == Op_StoreB || opc == Op_StoreC || opc == Op_StoreI, "precondition"); + + if (other_store == NULL || + _store->Opcode() != other_store->Opcode()) { + return false; + } + + return true; +} + +bool MergePrimitiveStores::is_adjacent_pair(const StoreNode* use_store, const StoreNode* def_store) const { + if (!is_adjacent_input_pair(def_store->in(MemNode::ValueIn), + use_store->in(MemNode::ValueIn), + def_store->memory_size())) { + return false; + } + + ResourceMark rm; +#ifndef PRODUCT + const TraceMemPointer trace(is_trace_pointer(), + is_trace_aliasing(), + is_trace_adjacency()); +#endif + const MemPointer pointer_use(use_store NOT_PRODUCT( COMMA trace )); + const MemPointer pointer_def(def_store NOT_PRODUCT( COMMA trace )); + return pointer_def.is_adjacent_to_and_before(pointer_use); +} + +bool MergePrimitiveStores::is_adjacent_input_pair(const Node* n1, const Node* n2, const int memory_size) const { + // Pattern: [n1 = ConI, n2 = ConI] + if (n1->Opcode() == Op_ConI) { + return n2->Opcode() == Op_ConI; + } + + // Pattern: [n1 = base >> shift, n2 = base >> (shift + memory_size)] +#ifndef VM_LITTLE_ENDIAN + // Pattern: [n1 = base >> (shift + memory_size), n2 = base >> shift] + // Swapping n1 with n2 gives same pattern as on little endian platforms. + swap(n1, n2); +#endif // !VM_LITTLE_ENDIAN + Node const* base_n2; + jint shift_n2; + if (!is_con_RShift(n2, base_n2, shift_n2)) { + return false; + } + if (n1->Opcode() == Op_ConvL2I) { + // look through + n1 = n1->in(1); + } + Node const* base_n1; + jint shift_n1; + if (n1 == base_n2) { + // n1 = base = base >> 0 + base_n1 = n1; + shift_n1 = 0; + } else if (!is_con_RShift(n1, base_n1, shift_n1)) { + return false; + } + int bits_per_store = memory_size * 8; + if (base_n1 != base_n2 || + shift_n1 + bits_per_store != shift_n2 || + shift_n1 % bits_per_store != 0) { + return false; + } + + // both load from same value with correct shift + return true; +} + +// Detect pattern: n = base_out >> shift_out +bool MergePrimitiveStores::is_con_RShift(const Node* n, Node const*& base_out, jint& shift_out) { + assert(n != NULL, "precondition"); + + int opc = n->Opcode(); + if (opc == Op_ConvL2I) { + n = n->in(1); + opc = n->Opcode(); + } + + if ((opc == Op_RShiftI || + opc == Op_RShiftL || + opc == Op_URShiftI || + opc == Op_URShiftL) && + n->in(2)->is_ConI()) { + base_out = n->in(1); + shift_out = n->in(2)->get_int(); + // The shift must be positive: + return shift_out >= 0; + } + return false; +} + +// Check if there is nothing between the two stores, except optionally a RangeCheck leading to an uncommon trap. +MergePrimitiveStores::CFGStatus MergePrimitiveStores::cfg_status_for_pair(const StoreNode* use_store, const StoreNode* def_store) { + assert(use_store->in(MemNode::Memory) == def_store, "use-def relationship"); + + Node* ctrl_use = use_store->in(MemNode::Control); + Node* ctrl_def = def_store->in(MemNode::Control); + if (ctrl_use == NULL || ctrl_def == NULL) { + return CFG_Failure; + } + + if (ctrl_use == ctrl_def) { + // Same ctrl -> no RangeCheck in between. + // Check: use_store must be the only use of def_store. + if (def_store->outcnt() > 1) { + return CFG_Failure; + } + return CFG_SuccessNoRangeCheck; + } + + // Different ctrl -> could have RangeCheck in between. + // Check: 1. def_store only has these uses: use_store and MergeMem for uncommon trap, and + // 2. ctrl separated by RangeCheck. 
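For illustration, the only accepted shape for the RangeCheck case looks roughly like this (control edges on the left, the second memory use of def_store on the right):

    //   ctrl_def                          def_store
    //      |                               |      |
    //   RangeCheck                    use_store  MergeMem
    //    |        \                               |
    //  ctrl_use    other_proj -------------> uncommon trap
    //  (IfProj)
    //      |
    //   use_store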
+ if (def_store->outcnt() != 2) { + return CFG_Failure; // Cannot have exactly these uses: use_store and MergeMem for uncommon trap. + } + int use_store_out_idx = def_store->raw_out(0) == use_store ? 0 : 1; + Node* merge_mem = def_store->raw_out(1 - use_store_out_idx)->isa_MergeMem(); + if (merge_mem == NULL || + merge_mem->outcnt() != 1) { + return CFG_Failure; // Does not have MergeMem for uncommon trap. + } + if (!ctrl_use->is_IfProj() || + !ctrl_use->in(0)->is_RangeCheck() || + ctrl_use->in(0)->outcnt() != 2) { + return CFG_Failure; // Not RangeCheck. + } + ProjNode* other_proj = ctrl_use->as_IfProj()->other_if_proj(); + Node* trap = other_proj->is_uncommon_trap_proj(Deoptimization::Reason_range_check); + if (trap != merge_mem->unique_out() || + ctrl_use->in(0)->in(0) != ctrl_def) { + return CFG_Failure; // Not RangeCheck with merge_mem leading to uncommon trap. + } + + return CFG_SuccessWithRangeCheck; +} + +MergePrimitiveStores::Status MergePrimitiveStores::find_adjacent_use_store(const StoreNode* def_store) const { + Status status_use = find_use_store(def_store); + StoreNode* use_store = status_use.found_store(); + if (use_store != NULL && !is_adjacent_pair(use_store, def_store)) { + return Status::make_failure(); + } + return status_use; +} + +MergePrimitiveStores::Status MergePrimitiveStores::find_adjacent_def_store(const StoreNode* use_store) const { + Status status_def = find_def_store(use_store); + StoreNode* def_store = status_def.found_store(); + if (def_store != NULL && !is_adjacent_pair(use_store, def_store)) { + return Status::make_failure(); + } + return status_def; +} + +MergePrimitiveStores::Status MergePrimitiveStores::find_use_store(const StoreNode* def_store) const { + Status status_use = find_use_store_unidirectional(def_store); + +#ifdef ASSERT + StoreNode* use_store = status_use.found_store(); + if (use_store != NULL) { + Status status_def = find_def_store_unidirectional(use_store); + assert(status_def.found_store() == def_store && + status_def.found_range_check() == status_use.found_range_check(), + "find_use_store and find_def_store must be symmetric"); + } +#endif + + return status_use; +} + +MergePrimitiveStores::Status MergePrimitiveStores::find_def_store(const StoreNode* use_store) const { + Status status_def = find_def_store_unidirectional(use_store); + +#ifdef ASSERT + StoreNode* def_store = status_def.found_store(); + if (def_store != NULL) { + Status status_use = find_use_store_unidirectional(def_store); + assert(status_use.found_store() == use_store && + status_use.found_range_check() == status_def.found_range_check(), + "find_use_store and find_def_store must be symmetric"); + } +#endif + + return status_def; +} + +MergePrimitiveStores::Status MergePrimitiveStores::find_use_store_unidirectional(const StoreNode* def_store) const { + assert(is_compatible_store(def_store), "precondition: must be compatible with _store"); + + for (DUIterator_Fast imax, i = def_store->fast_outs(imax); i < imax; i++) { + StoreNode* use_store = def_store->fast_out(i)->isa_Store(); + if (is_compatible_store(use_store)) { + return Status::make(use_store, cfg_status_for_pair(use_store, def_store)); + } + } + + return Status::make_failure(); +} + +MergePrimitiveStores::Status MergePrimitiveStores::find_def_store_unidirectional(const StoreNode* use_store) const { + assert(is_compatible_store(use_store), "precondition: must be compatible with _store"); + + StoreNode* def_store = use_store->in(MemNode::Memory)->isa_Store(); + if (!is_compatible_store(def_store)) { + return 
Status::make_failure(); + } + + return Status::make(def_store, cfg_status_for_pair(use_store, def_store)); +} + +static int round_down_power_of_2(uint value) { + assert(value > 0, "Invalid value"); + return 1 << log2_uint(value); +} + +void MergePrimitiveStores::collect_merge_list(Node_List& merge_list) const { + // The merged store can be at most 8 bytes. + const uint merge_list_max_size = 8 / _store->memory_size(); + assert(merge_list_max_size >= 2 && + merge_list_max_size <= 8 && + is_power_of_2(merge_list_max_size), + "must be 2, 4 or 8"); + + // Traverse up the chain of adjacent def stores. + StoreNode* current = _store; + merge_list.push(current); + while (current != NULL && merge_list.size() < merge_list_max_size) { + Status status = find_adjacent_def_store(current); + NOT_PRODUCT( if (is_trace_basic()) { tty->print("[TraceMergeStores] find def: "); status.print_on(tty); }) + + current = status.found_store(); + if (current != NULL) { + merge_list.push(current); + + // We can have at most one RangeCheck. + if (status.found_range_check()) { + NOT_PRODUCT( if (is_trace_basic()) { tty->print_cr("[TraceMergeStores] found RangeCheck, stop traversal."); }) + break; + } + } + } + + NOT_PRODUCT( if (is_trace_basic()) { tty->print_cr("[TraceMergeStores] found:"); merge_list.dump(); }) + + // Truncate the merge_list to a power of 2. + const uint pow2size = round_down_power_of_2(merge_list.size()); + assert(pow2size >= 2, "must be merging at least 2 stores"); + while (merge_list.size() > pow2size) { merge_list.pop(); } + + NOT_PRODUCT( if (is_trace_basic()) { tty->print_cr("[TraceMergeStores] truncated:"); merge_list.dump(); }) +} + +// Merge the input values of the smaller stores to a single larger input value. +Node* MergePrimitiveStores::make_merged_input_value(const Node_List& merge_list) { + int new_memory_size = _store->memory_size() * merge_list.size(); + Node* first = merge_list.at(merge_list.size()-1); + Node* merged_input_value = NULL; + if (_store->in(MemNode::ValueIn)->Opcode() == Op_ConI) { + // Pattern: [ConI, ConI, ...] 
-> new constant + jlong con = 0; + jlong bits_per_store = _store->memory_size() * 8; + jlong mask = (((jlong)1) << bits_per_store) - 1; + for (uint i = 0; i < merge_list.size(); i++) { + jlong con_i = merge_list.at(i)->in(MemNode::ValueIn)->get_int(); +#ifdef VM_LITTLE_ENDIAN + con = con << bits_per_store; + con = con | (mask & con_i); +#else // VM_LITTLE_ENDIAN + con_i = (mask & con_i) << (i * bits_per_store); + con = con | con_i; +#endif // VM_LITTLE_ENDIAN + } + merged_input_value = _phase->longcon(con); + } else { + // Pattern: [base >> 24, base >> 16, base >> 8, base] -> base + // | | + // _store first + // + Node* hi = _store->in(MemNode::ValueIn); + Node* lo = first->in(MemNode::ValueIn); +#ifndef VM_LITTLE_ENDIAN + // `_store` and `first` are swapped in the diagram above + swap(hi, lo); +#endif // !VM_LITTLE_ENDIAN + Node const* hi_base; + jint hi_shift; + merged_input_value = lo; + bool is_true = is_con_RShift(hi, hi_base, hi_shift); + assert(is_true, "must detect con RShift"); + if (merged_input_value != hi_base && merged_input_value->Opcode() == Op_ConvL2I) { + // look through + merged_input_value = merged_input_value->in(1); + } + if (merged_input_value != hi_base) { + // merged_input_value is not the base + return NULL; + } + } + + if (_phase->type(merged_input_value)->isa_long() != NULL && new_memory_size <= 4) { + // Example: + // + // long base = ...; + // a[0] = (byte)(base >> 0); + // a[1] = (byte)(base >> 8); + // + merged_input_value = _phase->transform(new ConvL2INode(merged_input_value)); + } + + assert((_phase->type(merged_input_value)->isa_int() != NULL && new_memory_size <= 4) || + (_phase->type(merged_input_value)->isa_long() != NULL && new_memory_size == 8), + "merged_input_value is either int or long, and new_memory_size is small enough"); + + return merged_input_value; +} + +// // +// first_ctrl first_mem first_adr first_ctrl first_mem first_adr // +// | | | | | | // +// | | | | +---------------+ | // +// | | | | | | | // +// | | +---------+ | | +---------------+ // +// | | | | | | | | // +// +--------------+ | | v1 +------------------------------+ | | v1 // +// | | | | | | | | | | | | // +// RangeCheck first_store RangeCheck | | first_store // +// | | | | | | | // +// last_ctrl | +----> unc_trap last_ctrl | | +----> unc_trap // +// | | ===> | | | // +// +--------------+ | a2 v2 | | | // +// | | | | | | | | // +// | second_store | | | // +// | | | | | [v1 v2 ... vn] // +// ... ... | | | | // +// | | | | | v // +// +--------------+ | an vn +--------------+ | | merged_input_value // +// | | | | | | | | // +// last_store (= _store) merged_store // +// // +StoreNode* MergePrimitiveStores::make_merged_store(const Node_List& merge_list, Node* merged_input_value) { + Node* first_store = merge_list.at(merge_list.size()-1); + Node* last_ctrl = _store->in(MemNode::Control); // after (optional) RangeCheck + Node* first_mem = first_store->in(MemNode::Memory); + Node* first_adr = first_store->in(MemNode::Address); + + const TypePtr* new_adr_type = _store->adr_type(); + + int new_memory_size = _store->memory_size() * merge_list.size(); + BasicType bt = T_ILLEGAL; + switch (new_memory_size) { + case 2: bt = T_SHORT; break; + case 4: bt = T_INT; break; + case 8: bt = T_LONG; break; + } + + StoreNode* merged_store = StoreNode::make(*_phase, last_ctrl, first_mem, first_adr, + new_adr_type, merged_input_value, bt, MemNode::unordered); + + // Marking the store mismatched is sufficient to prevent reordering, since array stores + // are all on the same slice. 
Hence, we need no barriers.
+  merged_store->set_mismatched_access();
+
+  // Constants above may now also be packed -> put candidate on worklist
+  _phase->is_IterGVN()->_worklist.push(first_mem);
+
+  return merged_store;
+}
+
+#ifndef PRODUCT
+void MergePrimitiveStores::trace(const Node_List& merge_list, const Node* merged_input_value, const StoreNode* merged_store) const {
+  stringStream ss;
+  ss.print_cr("[TraceMergeStores]: Replace");
+  for (int i = (int)merge_list.size() - 1; i >= 0; i--) {
+    merge_list.at(i)->dump("\n", false, &ss);
+  }
+  ss.print_cr("[TraceMergeStores]: with");
+  merged_input_value->dump("\n", false, &ss);
+  merged_store->dump("\n", false, &ss);
+  tty->print("%s", ss.as_string());
+}
+#endif
+
 //------------------------------Ideal------------------------------------------
 // Change back-to-back Store(, p, x) -> Store(m, p, y) to Store(m, p, x).
 // When a store immediately follows a relevant allocation/initialization,
@@ -2634,6 +3187,16 @@ Node *StoreNode::Ideal(PhaseGVN *phase, bool can_reshape) {
     }
   }
 
+  if (MergeStores && UseUnalignedAccesses) {
+    if (phase->C->post_loop_opts_phase()) {
+      MergePrimitiveStores merge(phase, this);
+      Node* progress = merge.run();
+      if (progress != NULL) { return progress; }
+    } else {
+      phase->C->record_for_post_loop_opts_igvn(this);
+    }
+  }
+
   return NULL; // No further progress
 }
 
diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp
new file mode 100644
index 00000000000..2a1921663d7
--- /dev/null
+++ b/src/hotspot/share/opto/mempointer.cpp
@@ -0,0 +1,383 @@
+/*
+ * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "opto/mempointer.hpp"
+#include "utilities/resourceHash.hpp"
+
+// Recursively parse the pointer expression with a DFS all-path traversal
+// (i.e. with node repetitions), starting at the pointer.
+MemPointerDecomposedForm MemPointerDecomposedFormParser::parse_decomposed_form() {
+  assert(_worklist.is_empty(), "no prior parsing");
+  assert(_summands.is_empty(), "no prior parsing");
+
+  Node* pointer = _mem->in(MemNode::Address);
+
+  // Start with the trivial summand.
+  _worklist.push(MemPointerSummand(pointer, NoOverflowInt(1)));
+
+  // Decompose the summands until only terminal summands remain. This effectively
+  // parses the pointer expression recursively.
+  int traversal_count = 0;
+  while (_worklist.is_nonempty()) {
+    // Bail out if the graph is too complex.
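The budget below is generous: a typical array address is fully decomposed within a handful of iterations. For example (a sketch with simplified node spellings; AddP base edges omitted):

    // adr = AddP(base, AddL(#16, LShiftL(ConvI2L(i), #2)))
    //
    // worklist: [1 * adr]
    //        -> [1 * base, 1 * AddL(#16, LShiftL(ConvI2L(i), #2))]  (AddP decomposed)
    //        -> [1 * base, 1 * #16, 1 * LShiftL(ConvI2L(i), #2)]    (AddL decomposed)
    //        -> [1 * base, 4 * ConvI2L(i)], con = 16                (shift folded into the scale)
    //        -> summands {1 * base, 4 * i}, con = 16                (ConvI2L looked through)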
+ if (traversal_count++ > 1000) { return MemPointerDecomposedForm::make_trivial(pointer); } + parse_sub_expression(_worklist.pop()); + } + + // Bail out if there is a constant overflow. + if (_con.is_NaN()) { return MemPointerDecomposedForm::make_trivial(pointer); } + + // Sorting by variable idx means that all summands with the same variable are consecutive. + // This simplifies the combining of summands with the same variable below. + _summands.sort(MemPointerSummand::cmp_by_variable_idx); + + // Combine summands for the same variable, adding up the scales. + int pos_put = 0; + int pos_get = 0; + while (pos_get < _summands.length()) { + const MemPointerSummand& summand = _summands.at(pos_get++); + Node* variable = summand.variable(); + NoOverflowInt scale = summand.scale(); + // Add up scale of all summands with the same variable. + while (pos_get < _summands.length() && _summands.at(pos_get).variable() == variable) { + MemPointerSummand s = _summands.at(pos_get++); + scale = scale + s.scale(); + } + // Bail out if scale is NaN. + if (scale.is_NaN()) { + return MemPointerDecomposedForm::make_trivial(pointer); + } + // Keep summands with non-zero scale. + if (!scale.is_zero()) { + _summands.at_put(pos_put++, MemPointerSummand(variable, scale)); + } + } + _summands.trunc_to(pos_put); + + return MemPointerDecomposedForm::make(pointer, _summands, _con); +} + +// Parse a sub-expression of the pointer, starting at the current summand. We parse the +// current node, and see if it can be decomposed into further summands, or if the current +// summand is terminal. +void MemPointerDecomposedFormParser::parse_sub_expression(const MemPointerSummand& summand) { + Node* n = summand.variable(); + const NoOverflowInt scale = summand.scale(); + const NoOverflowInt one(1); + + int opc = n->Opcode(); + if (is_safe_to_decompose_op(opc, scale)) { + switch (opc) { + case Op_ConI: + case Op_ConL: + { + // Terminal: add to constant. + NoOverflowInt con = (opc == Op_ConI) ? NoOverflowInt(n->get_int()) + : NoOverflowInt(n->get_long()); + _con = _con + scale * con; + return; + } + case Op_AddP: + case Op_AddL: + case Op_AddI: + { + // Decompose addition. + Node* a = n->in((opc == Op_AddP) ? 2 : 1); + Node* b = n->in((opc == Op_AddP) ? 3 : 2); + _worklist.push(MemPointerSummand(a, scale)); + _worklist.push(MemPointerSummand(b, scale)); + return; + } + case Op_SubL: + case Op_SubI: + { + // Decompose subtraction. + Node* a = n->in(1); + Node* b = n->in(2); + + NoOverflowInt sub_scale = NoOverflowInt(-1) * scale; + + _worklist.push(MemPointerSummand(a, scale)); + _worklist.push(MemPointerSummand(b, sub_scale)); + return; + } + case Op_MulL: + case Op_MulI: + case Op_LShiftL: + case Op_LShiftI: + { + // Only multiplication with constants is allowed: factor * variable + // IGVN already folds constants to in(2). If we find a variable there + // instead, we cannot further decompose this summand, and have to add + // it to the terminal summands. + Node* variable = n->in(1); + Node* con = n->in(2); + if (!con->is_Con()) { break; } + NoOverflowInt factor; + switch (opc) { + case Op_MulL: // variable * con + factor = NoOverflowInt(con->get_long()); + break; + case Op_MulI: // variable * con + factor = NoOverflowInt(con->get_int()); + break; + case Op_LShiftL: // variable << con = variable * (1 << con) + factor = one << NoOverflowInt(con->get_int()); + break; + case Op_LShiftI: // variable << con = variable * (1 << con) + factor = one << NoOverflowInt(con->get_int()); + break; + } + + // Accumulate scale. 
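A small worked instance of this step (values chosen for illustration only):

    // summand = 8 * (i << 1): the LShiftI contributes factor = (1 << 1) = 2,
    // so new_scale = 8 * 2 = 16 and the summand 16 * i is pushed.
    // If the multiplication overflows 32 bits, NoOverflowInt yields NaN, and
    // parse_decomposed_form later falls back to the trivial form.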
+ NoOverflowInt new_scale = scale * factor; + + _worklist.push(MemPointerSummand(variable, new_scale)); + return; + } + case Op_CastII: + // case Op_CastLL: + case Op_CastX2P: + case Op_ConvI2L: + // On 32bit systems we can also look through ConvL2I, since the final result will always + // be truncated back with ConvL2I. On 64bit systems we cannot decompose ConvL2I because + // such int values will eventually be expanded to long with a ConvI2L: + // + // valL = max_jint + 1 + // ConvI2L(ConvL2I(valL)) = ConvI2L(min_jint) = min_jint != max_jint + 1 = valL + // + NOT_LP64( case Op_ConvL2I: ) + { + // Decompose: look through. + Node* a = n->in(1); + _worklist.push(MemPointerSummand(a, scale)); + return; + } + default: + // All other operations cannot be further decomposed. We just add them to the + // terminal summands below. + break; + } + } + + // Default: we could not parse the "summand" further, i.e. it is terminal. + _summands.push(summand); +} + +// Check if the decomposition of operation opc is guaranteed to be safe. +// Please refer to the definition of "safe decomposition" in mempointer.hpp +bool MemPointerDecomposedFormParser::is_safe_to_decompose_op(const int opc, const NoOverflowInt& scale) const { +#ifndef _LP64 + // On 32-bit platforms, the pointer has 32bits, and thus any higher bits will always + // be truncated. Thus, it does not matter if we have int or long overflows. + // Simply put: all decompositions are (SAFE1). + return true; +#else + + switch (opc) { + // These operations are always safe to decompose, i.e. (SAFE1): + case Op_ConI: + case Op_ConL: + case Op_AddP: + case Op_AddL: + case Op_SubL: + case Op_MulL: + case Op_LShiftL: + case Op_CastII: + // case Op_CastLL: + case Op_CastX2P: + case Op_CastPP: + case Op_ConvI2L: + return true; + + // But on 64-bit platforms, these operations are not trivially safe to decompose: + case Op_AddI: // ConvI2L(a + b) != ConvI2L(a) + ConvI2L(b) + case Op_SubI: // ConvI2L(a - b) != ConvI2L(a) - ConvI2L(b) + case Op_MulI: // ConvI2L(a * conI) != ConvI2L(a) * ConvI2L(conI) + case Op_LShiftI: // ConvI2L(a << conI) != ConvI2L(a) << ConvI2L(conI) + break; // Analysis below. + + // All other operations are assumed not safe to decompose, or simply cannot be decomposed + default: + return false; + } + + const TypeAryPtr* ary_ptr_t = _mem->adr_type()->isa_aryptr(); + if (ary_ptr_t != NULL) { + // Array accesses that are not Unsafe always have a RangeCheck which ensures + // that there is no int overflow. And without overflows, all decompositions + // are (SAFE1). + if (!_mem->is_unsafe_access()) { + return true; + } + + // Intuition: In general, the decomposition of AddI, SubI, MulI or LShiftI is not safe, + // because of overflows. But under some conditions, we can prove that such a + // decomposition is (SAFE2). Intuitively, we want to prove that an overflow + // would mean that the pointers have such a large distance, that at least one + // must lie out of bounds. In the proof of the "MemPointer Lemma", we thus + // get a contradiction with the condition that both pointers are in bounds. 
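A concrete instance of the overflow these conditions must rule out (values chosen for illustration):

    // a = max_jint, b = 1, scale = 4:
    //   scale * ConvI2L(a + b)                  = 4 * (jlong)min_jint = -2^33
    //   scale * ConvI2L(a) + scale * ConvI2L(b) = 4 * max_jint + 4    =  2^33
    // The naive decomposition is off by exactly 4 * 2^32, i.e. the
    // "scale * y * 2^32" overflow correction below with y = -1.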
+ // + // We prove that the decomposition of AddI, SubI, MulI (with constant) and ShiftI (with + // constant) is (SAFE2), under the condition: + // + // abs(scale) % array_element_size_in_bytes = 0 + // + // First, we describe how the decomposition works: + // + // mp_i = con + sum(other_summands) + summand + // ------------------------- ------- + // rest scale * ConvI2L(op) + // + // We decompose the summand depending on the op, where we know that there is some + // integer y, such that: + // + // scale * ConvI2L(a + b) = scale * ConvI2L(a) + scale * ConvI2L(b) + scale * y * 2^32 + // scale * ConvI2L(a - b) = scale * ConvI2L(a) - scale * ConvI2L(b) + scale * y * 2^32 + // scale * ConvI2L(a * con) = scale * con * ConvI2L(a) + scale * y * 2^32 + // scale * ConvI2L(a << con) = scale * (1 << con) * ConvI2L(a) + scale * y * 2^32 + // \_______________________/ \_____________________________________/ \______________/ + // before decomposition after decomposition ("new_summands") overflow correction + // + // Thus, for AddI and SubI, we get: + // summand = new_summand1 + new_summand2 + scale * y * 2^32 + // + // mp_{i+1} = con + sum(other_summands) + new_summand1 + new_summand2 + // = con + sum(other_summands) + summand - scale * y * 2^32 + // = mp_i - scale * y * 2^32 + // + // And for MulI and ShiftI we get: + // summand = new_summand + scale * y * 2^32 + // + // mp_{i+1} = con + sum(other_summands) + new_summand + // = con + sum(other_summands) + summand - scale * y * 2^32 + // = mp_i - scale * y * 2^32 + // + // Further: + // abs(scale) % array_element_size_in_bytes = 0 + // implies that there is some integer z, such that: + // z * array_element_size_in_bytes = scale + // + // And hence, with "x = y * z", the decomposition is (SAFE2) under the assumed condition: + // mp_i = mp_{i+1} + scale * y * 2^32 + // = mp_{i+1} + z * array_element_size_in_bytes * y * 2^32 + // = mp_{i+1} + x * array_element_size_in_bytes * 2^32 + // + BasicType array_element_bt = ary_ptr_t->elem()->array_element_basic_type(); + if (is_java_primitive(array_element_bt)) { + NoOverflowInt array_element_size_in_bytes = NoOverflowInt(type2aelembytes(array_element_bt)); + if (scale.is_multiple_of(array_element_size_in_bytes)) { + return true; + } + } + } + + return false; +#endif +} + +// Compute the aliasing between two MemPointerDecomposedForm. We use the "MemPointer Lemma" to +// prove that the computed aliasing also applies for the underlying pointers. Note that the +// condition (S0) is already given, because the MemPointerDecomposedForm is always constructed +// using only safe decompositions. +// +// Pre-Condition: +// We assume that both pointers are in-bounds of their respective memory object. If this does +// not hold, for example, with the use of Unsafe, then we would already have undefined behavior, +// and we are allowed to do anything. 
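As a worked example of the computation that follows (a sketch; 16 is the assumed int-array base offset, as in the examples in mempointer.hpp):

    // p1 = a[i]   parses to  mp1 = 1 * a_base + 4 * i + 16
    // p2 = a[i+1] parses to  mp2 = 1 * a_base + 4 * i + 20
    //
    // All summands match, the distance 20 - 16 = 4 is a valid jint (not NaN,
    // not min_jint), so the result is Always(4): p2 always lies exactly
    // 4 bytes after p1.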
+MemPointerAliasing MemPointerDecomposedForm::get_aliasing_with(const MemPointerDecomposedForm& other
+                                                               NOT_PRODUCT( COMMA const TraceMemPointer& trace) ) const {
+#ifndef PRODUCT
+  if (trace.is_trace_aliasing()) {
+    tty->print_cr("MemPointerDecomposedForm::get_aliasing_with:");
+    print_on(tty);
+    other.print_on(tty);
+  }
+#endif
+
+  // "MemPointer Lemma" condition (S3): check if all summands are the same:
+  for (uint i = 0; i < SUMMANDS_SIZE; i++) {
+    const MemPointerSummand s1 = summands_at(i);
+    const MemPointerSummand s2 = other.summands_at(i);
+    if (s1 != s2) {
+#ifndef PRODUCT
+      if (trace.is_trace_aliasing()) {
+        tty->print_cr("  -> Aliasing unknown, differ on summand %d.", i);
+      }
+#endif
+      return MemPointerAliasing::make_unknown();
+    }
+  }
+
+  // "MemPointer Lemma" condition (S2): check that the constants do not differ too much:
+  const NoOverflowInt distance = other.con() - con();
+  // We must check that: abs(distance) < 2^31
+  // However, this is only false if: distance = min_jint
+  if (distance.is_NaN() || distance.value() == min_jint) {
+#ifndef PRODUCT
+    if (trace.is_trace_aliasing()) {
+      tty->print("  -> Aliasing unknown, bad distance: ");
+      distance.print_on(tty);
+      tty->cr();
+    }
+#endif
+    return MemPointerAliasing::make_unknown();
+  }
+
+  // "MemPointer Lemma" condition (S1):
+  // Given that all summands are the same, we know that both pointers point into the
+  // same memory object. With the Pre-Condition, we know that both pointers are in
+  // bounds of that same memory object.
+
+  // Hence, all 4 conditions of the "MemPointer Lemma" are established, and hence
+  // we know that the distance between the underlying pointers is equal to the distance
+  // we computed for the MemPointers:
+  //   p_other - p_this = distance = other.con - this.con
+#ifndef PRODUCT
+  if (trace.is_trace_aliasing()) {
+    tty->print_cr("  -> Aliasing always, distance = %d.", distance.value());
+  }
+#endif
+  return MemPointerAliasing::make_always(distance.value());
+}
+
+bool MemPointer::is_adjacent_to_and_before(const MemPointer& other) const {
+  const MemPointerDecomposedForm& s1 = decomposed_form();
+  const MemPointerDecomposedForm& s2 = other.decomposed_form();
+  const MemPointerAliasing aliasing = s1.get_aliasing_with(s2 NOT_PRODUCT( COMMA _trace ));
+  const jint size = mem()->memory_size();
+  const bool is_adjacent = aliasing.is_always_at_distance(size);
+
+#ifndef PRODUCT
+  if (_trace.is_trace_adjacency()) {
+    tty->print("Adjacent: %s, because size = %d and aliasing = ",
+               is_adjacent ? "true" : "false", size);
+    aliasing.print_on(tty);
+    tty->cr();
+  }
+#endif
+
+  return is_adjacent;
+}
diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp
new file mode 100644
index 00000000000..6da90eb1a09
--- /dev/null
+++ b/src/hotspot/share/opto/mempointer.hpp
@@ -0,0 +1,618 @@
+/*
+ * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef SHARE_OPTO_MEMPOINTER_HPP +#define SHARE_OPTO_MEMPOINTER_HPP + +#include "opto/memnode.hpp" +#include "opto/noOverflowInt.hpp" + +// The MemPointer is a shared facility to parse pointers and check the aliasing of pointers, +// e.g. checking if two stores are adjacent. +// +// ----------------------------------------------------------------------------------------- +// +// Intuition and Examples: +// We parse / decompose pointers into a linear form: +// +// pointer = SUM(scale_i * variable_i) + con +// +// where SUM() adds all "scale_i * variable_i" for each i together. +// +// The con and scale_i are compile-time constants (NoOverflowInt), and the variable_i are +// compile-time variables (C2 nodes). +// +// For the MemPointer, we do not explicitly track the base address. For Java heap pointers, the +// base address is just a variable in a summand with scale == 1. For native memory (C heap) +// pointers, the base address is null, and is hence implicitly a zero constant. +// +// +// Example 1: byte array access: +// +// array[i] +// +// pointer = array_base + ARRAY_BYTE_BASE_OFFSET + 1 * i +// = 1 * array_base + ARRAY_BYTE_BASE_OFFSET + 1 * i +// -------------------- ---------------------- -------------------- +// = scale_0 * variable_0 + con + scale_1 * variable_1 +// +// +// Example 2: int array access +// +// array[5 + i + 3 * j] +// +// pointer = array_base + ARRAY_INT_BASE_OFFSET + 4 * 5 + 4 * i + 4 * 3 * j +// = 1 * array_base + ARRAY_INT_BASE_OFFSET + 20 + 4 * i + 12 * j +// -------------------- ----------------------------- -------------------- -------------------- +// = scale_0 * variable_0 + con + scale_1 * variable_1 + scale_2 * variable_2 +// +// +// Example 3: Unsafe with int array +// +// UNSAFE.getInt(array, ARRAY_INT_BASE_OFFSET + 4 * i); +// +// pointer = array_base + ARRAY_INT_BASE_OFFSET + 4 * i +// = 1 * array_base + ARRAY_INT_BASE_OFFSET + 4 * i +// -------------------- --------------------- -------------------- +// = scale_0 * variable_0 + con + scale_1 * variable_1 +// +// +// Example 4: Unsafe with native memory address +// +// long address; +// UNSAFE.getInt(null, address + 4 * i); +// +// pointer = address + 4 * i +// = 1 * address + 0 + 4 * i +// -------------------- --- -------------------- +// = scale_0 * variable_0 + con + scale_1 * variable_1 +// +// +// Example 5: MemorySegment with byte array as backing type +// +// byte[] array = new byte[1000]; +// MemorySegment ms = MemorySegment.ofArray(array); +// assert ms.heapBase().get() == array: "array is base"; +// assert ms.address() == 0: "zero offset from base"; +// byte val = ms.get(ValueLayout.JAVA_BYTE, i); +// +// pointer = ms.heapBase() + ARRAY_BYTE_BASE_OFFSET + ms.address() + i +// = 1 * array_base + ARRAY_BYTE_BASE_OFFSET + 0 + 1 * i +// ----------------------- ------------------------------------- -------------------- +// = scale_0 * variable_0 + con + scale_1 * variable_1 +// +// +// Example 6: MemorySegment with native memory +// +// MemorySegment ms = Arena.ofAuto().allocate(1000, 1); +// assert ms.heapBase().isEmpty(): "null base"; +// assert ms.address() != 0: "non-zero native memory 
address"; +// short val = ms.get(ValueLayout.JAVA_SHORT, 2L * i); +// +// pointer = ms.heapBase() + ms.address() + 2 i +// = 0 + 1 * ms.address() + 2 * i +// ------------ ---------------------- -------------------- +// = con scale_0 * variable_0 + scale_1 * variable_1 +// +// +// Example 7: Non-linear access to int array +// +// array[5 + i + j * k] +// +// pointer = array_base + ARRAY_INT_BASE_OFFSET + 4 * 5 + 4 * i + 4 * j * k +// = 1 * array_base + ARRAY_INT_BASE_OFFSET + 20 + 4 * i + 4 * j * k +// -------------------- ----------------------------- -------------------- -------------------- +// = scale_0 * variable_0 + con + scale_1 * variable_1 + scale_2 * variable_2 +// +// Note: we simply stop parsing once a term is not linear. We keep "j * k" as its own variable. +// +// +// Example 8: Unsafe with native memory address, non-linear access +// +// UNSAFE.getInt(null, i * j); +// +// pointer = i * j +// = 0 + 1 * i * j +// --- -------------------- +// = con + scale_0 * variable_0 +// +// Note: we can always parse a pointer into its trivial linear form: +// +// pointer = 0 + 1 * pointer. +// +// ----------------------------------------------------------------------------------------- +// +// MemPointerDecomposedForm: +// When the pointer is parsed, it is decomposed into a SUM of summands plus a constant: +// +// pointer = SUM(summands) + con +// +// Where each summand_i in summands has the form: +// +// summand_i = scale_i * variable_i +// +// Hence, the full decomposed form is: +// +// pointer = SUM(scale_i * variable_i) + con +// +// Note: the scale_i are compile-time constants (NoOverflowInt), and the variable_i are +// compile-time variables (C2 nodes). +// On 64-bit systems, this decomposed form is computed with long-add/mul, on 32-bit systems +// it is computed with int-add/mul. +// +// MemPointerAliasing: +// The decomposed form allows us to determine the aliasing between two pointers easily. For +// example, if two pointers are identical, except for their constant: +// +// pointer1 = SUM(summands) + con1 +// pointer2 = SUM(summands) + con2 +// +// then we can easily compute the distance between the pointers (distance = con2 - con1), +// and determine if they are adjacent. +// +// MemPointerDecomposedFormParser: +// Any pointer can be parsed into this (default / trivial) decomposed form: +// +// pointer = 1 * pointer + 0 +// scale_0 * variable_0 + con +// +// However, this is not particularly useful to compute aliasing. We would like to decompose +// the pointer as far as possible, i.e. extract as many summands and add up the constants to +// a single constant. +// +// Example (normal int-array access): +// pointer1 = array[i + 0] = array_base + array_int_base_offset + 4L * ConvI2L(i + 0) +// pointer2 = array[i + 1] = array_base + array_int_base_offset + 4L * ConvI2L(i + 1) +// +// At first, computing the aliasing is not immediately straight-forward in the general case because +// the distance is hidden inside the ConvI2L. We can convert this (with array_int_base_offset = 16) +// into these decomposed forms: +// +// pointer1 = 1L * array_base + 4L * i + 16L +// pointer2 = 1L * array_base + 4L * i + 20L +// +// This allows us to easily see that these two pointers are adjacent (distance = 4). +// +// Hence, in MemPointerDecomposedFormParser::parse_decomposed_form, we start with the pointer as +// a trivial summand. A summand can either be decomposed further or it is terminal (cannot +// be decomposed further). 
We decompose the summands recursively until all remaining summands
+//   are terminal, see MemPointerDecomposedFormParser::parse_sub_expression. This effectively parses
+//   the pointer expression recursively.
+//
+// -----------------------------------------------------------------------------------------
+//
+//   We have to be careful on 64-bit systems with ConvI2L: decomposing its input is not
+//   correct in general, overflows may not be preserved in the decomposed form:
+//
+//     AddI:    ConvI2L(a +  b)    != ConvI2L(a) +  ConvI2L(b)
+//     SubI:    ConvI2L(a -  b)    != ConvI2L(a) -  ConvI2L(b)
+//     MulI:    ConvI2L(a *  conI) != ConvI2L(a) *  ConvI2L(conI)
+//     LShiftI: ConvI2L(a << conI) != ConvI2L(a) << ConvI2L(conI)
+//
+//   If we want to prove the correctness of MemPointerAliasing, we need some guarantees
+//   that the MemPointers adequately represent the underlying pointers, such that we can
+//   compute the aliasing based on the summands and constants.
+//
+// -----------------------------------------------------------------------------------------
+//
+//   Below, we will formulate a "MemPointer Lemma" that helps us to prove the correctness of
+//   the MemPointerAliasing computations. To prove the "MemPointer Lemma", we need to define
+//   the idea of a "safe decomposition", and then prove that all the decompositions we apply
+//   are such "safe decompositions".
+//
+//
+//   Definition: Safe decomposition
+//     Trivial decomposition:
+//       (SAFE0) The trivial decomposition from p to mp_0 = 0 + 1 * p is always safe.
+//
+//     Non-trivial decomposition:
+//       We decompose summand in:
+//         mp_i     = con + summand + SUM(other_summands)
+//       resulting in:      +-------------------------+
+//         mp_{i+1} = con + dec_con + SUM(dec_summands) + SUM(other_summands)
+//                  = new_con + SUM(new_summands)
+//       where mp_i means that the original pointer p was decomposed i times.
+//
+//       We call a non-trivial decomposition safe if either:
+//       (SAFE1) No matter the values of the summand variables:
+//                 mp_i = mp_{i+1}
+//
+//       (SAFE2) The pointer is on an array with a known array_element_size_in_bytes,
+//               and there is an integer x, such that:
+//                 mp_i = mp_{i+1} + x * array_element_size_in_bytes * 2^32
+//
+//               Note: if "x = 0", we have "mp_i = mp_{i+1}", and if "x != 0", then mp_i and mp_{i+1}
+//                     have a distance at least twice as large as the array size, and so
+//                     at least one of mp_i or mp_{i+1} must be out of bounds of the array.
+//
+//   MemPointer Lemma:
+//     Given two pointers p1 and p2, and their respective MemPointers mp1 and mp2.
+//     If these conditions hold:
+//       (S0) mp1 and mp2 are constructed only with safe decompositions (SAFE0, SAFE1, SAFE2)
+//            from p1 and p2, respectively.
+//       (S1) Both p1 and p2 are within the bounds of the same memory object.
+//       (S2) The constants do not differ too much: abs(mp1.con - mp2.con) < 2^31.
+//       (S3) All summands of mp1 and mp2 are identical (i.e. only the constants are possibly different).
+//
+//     then the pointer difference between p1 and p2 is identical to the difference between
+//     mp1 and mp2:
+//       p1 - p2 = mp1 - mp2
+//
+//     Note: MemPointerDecomposedForm::get_aliasing_with relies on this MemPointer Lemma to
+//           prove the correctness of its aliasing computation between two MemPointers.
+//
+//
+//     Note: MemPointerDecomposedFormParser::is_safe_to_decompose_op checks that all
+//           decompositions we apply are safe.
+//
+//
+//   Proof of the "MemPointer Lemma":
+//     Assume (S0-S3) and show that
+//       p1 - p2 = mp1 - mp2
+//
+//     We make a case distinction over the types of decompositions used in the construction of mp1 and mp2.
+// +// Trivial Case: Only trivial (SAFE0) decompositions were used: +// mp1 = 0 + 1 * p1 = p1 +// mp2 = 0 + 1 * p2 = p2 +// => +// p1 - p2 = mp1 - mp2 +// +// Unsafe Case: We apply at least one unsafe decomposition: +// This is a contradiction to (S0) and we are done. +// +// Case 1: Only decomposition of type (SAFE0) and (SAFE1) are used: +// We make an induction proof over the decompositions from p1 to mp1, starting with +// the trivial decomposition (SAFE0): +// mp1_0 = 0 + 1 * p1 = p1 +// Then for the i-th non-trivial decomposition (SAFE1) we know that +// mp1_i = mp1_{i+1} +// and hence, after the n-th non-trivial decomposition from p1: +// p1 = mp1_0 = mp1_i = mp1_n = mp1 +// Analogously, we can prove: +// p2 = mp2 +// +// p1 = mp1 +// p2 = mp2 +// => +// p1 - p2 = mp1 - mp2 +// +// Case 2: At least one decomposition of type (SAFE2) and no unsafe decomposition is used. +// Given we have (SAFE2) decompositions, we know that we are operating on an array of +// known array_element_size_in_bytes. We can weaken the guarantees from (SAFE1) +// decompositions to the same guarantee as (SAFE2) decompositions. Hence all applied +// non-trivial decompositions satisfy: +// mp1_i = mp1_{i+1} + x1_i * array_element_size_in_bytes * 2^32 +// where x1_i = 0 for (SAFE1) decompositions. +// +// We make an induction proof over the decompositions from p1 to mp1, starting with +// the trivial decomposition (SAFE0): +// mp1_0 = 0 + 1 * p1 = p1 +// Then for the i-th non-trivial decomposition (SAFE1) or (SAFE2), we know that +// mp1_i = mp1_{i+1} + x1_i * array_element_size_in_bytes * 2^32 +// and hence, if mp1 was decomposed with n non-trivial decompositions (SAFE1) or (SAFE2) from p1: +// p1 = mp1 + x1 * array_element_size_in_bytes * 2^32 +// where +// x1 = SUM(x1_i) +// Analogously, we can prove: +// p2 = mp2 + x2 * array_element_size_in_bytes * 2^32 +// +// And hence, with x = x1 - x2 we have: +// p1 - p2 = mp1 - mp2 + x * array_element_size_in_bytes * 2^32 +// +// If "x = 0", then it follows: +// p1 - p2 = mp1 - mp2 +// +// If "x != 0", then: +// abs(p1 - p2) = abs(mp1 - mp2 + x * array_element_size_in_bytes * 2^32) +// >= abs(x * array_element_size_in_bytes * 2^32) - abs(mp1 - mp2) +// -- apply x != 0 -- +// >= array_element_size_in_bytes * 2^32 - abs(mp1 - mp2) +// -- apply (S3) -- +// = array_element_size_in_bytes * 2^32 - abs(mp1.con - mp2.con) +// -- apply (S2) -- +// > array_element_size_in_bytes * 2^32 - 2^31 +// -- apply array_element_size_in_bytes > 0 -- +// >= array_element_size_in_bytes * 2^31 +// >= max_possible_array_size_in_bytes +// >= array_size_in_bytes +// +// This shows that p1 and p2 have a distance greater than the array size, and hence at least one of the two +// pointers must be out of bounds. This contradicts our assumption (S1) and we are done. + + +#ifndef PRODUCT +class TraceMemPointer : public StackObj { +private: + const bool _is_trace_pointer; + const bool _is_trace_aliasing; + const bool _is_trace_adjacency; + +public: + TraceMemPointer(const bool is_trace_pointer, + const bool is_trace_aliasing, + const bool is_trace_adjacency) : + _is_trace_pointer( is_trace_pointer), + _is_trace_aliasing( is_trace_aliasing), + _is_trace_adjacency(is_trace_adjacency) + {} + + bool is_trace_pointer() const { return _is_trace_pointer; } + bool is_trace_aliasing() const { return _is_trace_aliasing; } + bool is_trace_adjacency() const { return _is_trace_adjacency; } +}; +#endif + +// Class to represent aliasing between two MemPointer. 
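+// For example (illustrative): for two stores to the same int array, "array[i]" and
+// "array[i+1]" alias at a constant distance of 4 bytes (Always(4)), while "array[i]"
+// and "array[j]" have Unknown aliasing.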
+class MemPointerAliasing {
+public:
+  enum Aliasing {
+    Unknown, // Distance unknown.
+             //   Example: two "int[]" with different variable index offsets.
+             //            e.g. "array[i]  vs  array[j]".
+             //            e.g. "array1[i] vs array2[j]".
+    Always}; // Constant distance = p1 - p2.
+             //   Example: The same address expression, except for a constant offset
+             //            e.g. "array[i] vs array[i+1]".
+private:
+  const Aliasing _aliasing;
+  const jint _distance;
+
+  MemPointerAliasing(const Aliasing aliasing, const jint distance) :
+    _aliasing(aliasing),
+    _distance(distance)
+  {
+    assert(_distance != min_jint, "given by condition (S2) of MemPointer Lemma");
+  }
+
+public:
+  static MemPointerAliasing make_unknown() {
+    return MemPointerAliasing(Unknown, 0);
+  }
+
+  static MemPointerAliasing make_always(const jint distance) {
+    return MemPointerAliasing(Always, distance);
+  }
+
+  // Use case: exact aliasing and adjacency.
+  bool is_always_at_distance(const jint distance) const {
+    return _aliasing == Always && _distance == distance;
+  }
+
+#ifndef PRODUCT
+  void print_on(outputStream* st) const {
+    switch(_aliasing) {
+      case Unknown: st->print("Unknown");               break;
+      case Always:  st->print("Always(%d)", _distance); break;
+      default: ShouldNotReachHere();
+    }
+  }
+#endif
+};
+
+// Summand of a MemPointerDecomposedForm:
+//
+//   summand = scale * variable
+//
+// where variable is a C2 node.
+class MemPointerSummand : public StackObj {
+private:
+  Node* _variable;
+  NoOverflowInt _scale;
+
+public:
+  MemPointerSummand() :
+    _variable(NULL),
+    _scale(NoOverflowInt::make_NaN()) {}
+  MemPointerSummand(Node* variable, const NoOverflowInt& scale) :
+    _variable(variable),
+    _scale(scale)
+  {
+    assert(_variable != NULL, "must have variable");
+    assert(!_scale.is_zero(), "non-zero scale");
+  }
+
+  Node* variable() const { return _variable; }
+  NoOverflowInt scale() const { return _scale; }
+
+  static int cmp_by_variable_idx(MemPointerSummand* p1, MemPointerSummand* p2) {
+    if (p1->variable() == NULL) {
+      return (p2->variable() == NULL) ? 0 : 1;
+    } else if (p2->variable() == NULL) {
+      return -1;
+    }
+
+    return p1->variable()->_idx - p2->variable()->_idx;
+  }
+
+  friend bool operator==(const MemPointerSummand a, const MemPointerSummand b) {
+    // Both "null" -> equal.
+    if (a.variable() == NULL && b.variable() == NULL) { return true; }
+
+    // Same variable and scale?
+    if (a.variable() != b.variable()) { return false; }
+    return a.scale() == b.scale();
+  }
+
+  friend bool operator!=(const MemPointerSummand a, const MemPointerSummand b) {
+    return !(a == b);
+  }
+
+#ifndef PRODUCT
+  void print_on(outputStream* st) const {
+    st->print("Summand[");
+    _scale.print_on(st);
+    st->print(" * [%d %s]]", _variable->_idx, _variable->Name());
+  }
+#endif
+};
+
+// Decomposed form of the pointer sub-expression of "pointer".
+//
+//   pointer = SUM(summands) + con
+//
+class MemPointerDecomposedForm : public StackObj {
+private:
+  // We limit the number of summands to 10. This is just a best guess, and not at this
+  // point supported by evidence. But I think it is reasonable: usually, a pointer
+  // contains a base pointer (e.g. array pointer or null for native memory) and a few
+  // variables. It should be rare that we have more than 9 variables.
+  static const int SUMMANDS_SIZE = 10;
+
+  Node* _pointer; // pointer node associated with this (sub)pointer
+
+  MemPointerSummand _summands[SUMMANDS_SIZE];
+  NoOverflowInt _con;
+
+public:
+  // Empty
+  MemPointerDecomposedForm() : _pointer(NULL), _con(NoOverflowInt::make_NaN()) {}
+
+private:
+  // Default / trivial: pointer = 0 + 1 * pointer
+  MemPointerDecomposedForm(Node* pointer) : _pointer(pointer), _con(NoOverflowInt(0)) {
+    assert(pointer != NULL, "pointer must be non-null");
+    _summands[0] = MemPointerSummand(pointer, NoOverflowInt(1));
+  }
+
+  MemPointerDecomposedForm(Node* pointer, const GrowableArray<MemPointerSummand>& summands, const NoOverflowInt& con)
+    : _pointer(pointer), _con(con) {
+    assert(!_con.is_NaN(), "non-NaN constant");
+    assert(summands.length() <= SUMMANDS_SIZE, "summands must fit");
+    for (int i = 0; i < summands.length(); i++) {
+      MemPointerSummand s = summands.at(i);
+      assert(s.variable() != NULL, "variable cannot be null");
+      assert(!s.scale().is_NaN(), "non-NaN scale");
+      _summands[i] = s;
+    }
+  }
+
+public:
+  static MemPointerDecomposedForm make_trivial(Node* pointer) {
+    return MemPointerDecomposedForm(pointer);
+  }
+
+  static MemPointerDecomposedForm make(Node* pointer, const GrowableArray<MemPointerSummand>& summands, const NoOverflowInt& con) {
+    if (summands.length() <= SUMMANDS_SIZE) {
+      return MemPointerDecomposedForm(pointer, summands, con);
+    } else {
+      return MemPointerDecomposedForm::make_trivial(pointer);
+    }
+  }
+
+  MemPointerAliasing get_aliasing_with(const MemPointerDecomposedForm& other
+                                       NOT_PRODUCT( COMMA const TraceMemPointer& trace) ) const;
+
+  const MemPointerSummand summands_at(const uint i) const {
+    assert(i < SUMMANDS_SIZE, "in bounds");
+    return _summands[i];
+  }
+
+  const NoOverflowInt con() const { return _con; }
+
+#ifndef PRODUCT
+  void print_on(outputStream* st) const {
+    if (_pointer == NULL) {
+      st->print_cr("MemPointerDecomposedForm empty.");
+      return;
+    }
+    st->print("MemPointerDecomposedForm[%d %s: con = ", _pointer->_idx, _pointer->Name());
+    _con.print_on(st);
+    for (int i = 0; i < SUMMANDS_SIZE; i++) {
+      const MemPointerSummand& summand = _summands[i];
+      if (summand.variable() != NULL) {
+        st->print(", ");
+        summand.print_on(st);
+      }
+    }
+    st->print_cr("]");
+  }
+#endif
+};
+
+class MemPointerDecomposedFormParser : public StackObj {
+private:
+  const MemNode* _mem;
+
+  // Internal data-structures for parsing.
+  NoOverflowInt _con;
+  GrowableArray<MemPointerSummand> _worklist;
+  GrowableArray<MemPointerSummand> _summands;
+
+  // Resulting decomposed-form.
+  MemPointerDecomposedForm _decomposed_form;
+
+public:
+  MemPointerDecomposedFormParser(const MemNode* mem) : _mem(mem), _con(NoOverflowInt(0)) {
+    _decomposed_form = parse_decomposed_form();
+  }
+
+  const MemPointerDecomposedForm decomposed_form() const { return _decomposed_form; }
+
+private:
+  MemPointerDecomposedForm parse_decomposed_form();
+  void parse_sub_expression(const MemPointerSummand& summand);
+
+  bool is_safe_to_decompose_op(const int opc, const NoOverflowInt& scale) const;
+};
+
+// Facility to parse the pointer of a Load or Store, so that aliasing between two such
+// memory operations can be determined (e.g. adjacency).
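+//
+// Usage sketch (illustrative; in debug builds a TraceMemPointer must additionally be
+// passed to the constructor):
+//
+//   MemPointer p1(store1);
+//   MemPointer p2(store2);
+//   if (p1.is_adjacent_to_and_before(p2)) { /* store2 writes directly after store1 */ }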
+class MemPointer : public StackObj { +private: + const MemNode* _mem; + const MemPointerDecomposedForm _decomposed_form; + + NOT_PRODUCT( const TraceMemPointer& _trace; ) + +public: + MemPointer(const MemNode* mem NOT_PRODUCT( COMMA const TraceMemPointer& trace)) : + _mem(mem), + _decomposed_form(init_decomposed_form(_mem)) + NOT_PRODUCT( COMMA _trace(trace) ) + { +#ifndef PRODUCT + if (_trace.is_trace_pointer()) { + tty->print_cr("MemPointer::MemPointer:"); + tty->print("mem: "); mem->dump(); + _mem->in(MemNode::Address)->dump(); + _decomposed_form.print_on(tty); + } +#endif + } + + const MemNode* mem() const { return _mem; } + const MemPointerDecomposedForm decomposed_form() const { return _decomposed_form; } + bool is_adjacent_to_and_before(const MemPointer& other) const; + +private: + static const MemPointerDecomposedForm init_decomposed_form(const MemNode* mem) { + assert(mem->is_Store(), "only stores are supported"); + ResourceMark rm; + MemPointerDecomposedFormParser parser(mem); + return parser.decomposed_form(); + } +}; + +#endif // SHARE_OPTO_MEMPOINTER_HPP diff --git a/src/hotspot/share/opto/noOverflowInt.hpp b/src/hotspot/share/opto/noOverflowInt.hpp new file mode 100644 index 00000000000..227f815deb9 --- /dev/null +++ b/src/hotspot/share/opto/noOverflowInt.hpp @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef SHARE_OPTO_NOOVERFLOWINT_HPP +#define SHARE_OPTO_NOOVERFLOWINT_HPP + +#include "utilities/ostream.hpp" + +// Wrapper around jint, which detects overflow. +// If any operation overflows, then it returns a NaN. +class NoOverflowInt { +private: + bool _is_NaN; // overflow, uninitialized, etc. + jint _value; + +public: + // Default: NaN. + NoOverflowInt() : _is_NaN(true), _value(0) {} + + // Create from jlong (or jint) -> NaN if overflows jint. 
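+  // e.g. NoOverflowInt((jlong)max_jint + 1) is NaN, while NoOverflowInt(42) has value 42.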
+ NoOverflowInt(jlong value) : _is_NaN(true), _value(0) { + jint trunc = (jint)value; + if ((jlong)trunc == value) { + _is_NaN = false; + _value = trunc; + } + } + + static NoOverflowInt make_NaN() { return NoOverflowInt(); } + + bool is_NaN() const { return _is_NaN; } + jint value() const { assert(!is_NaN(), "NaN not allowed"); return _value; } + bool is_zero() const { return !is_NaN() && value() == 0; } + + friend NoOverflowInt operator+(const NoOverflowInt& a, const NoOverflowInt& b) { + if (a.is_NaN()) { return a; } + if (b.is_NaN()) { return b; } + return NoOverflowInt((jlong)a.value() + (jlong)b.value()); + } + + friend NoOverflowInt operator-(const NoOverflowInt& a, const NoOverflowInt& b) { + if (a.is_NaN()) { return a; } + if (b.is_NaN()) { return b; } + return NoOverflowInt((jlong)a.value() - (jlong)b.value()); + } + + friend NoOverflowInt operator*(const NoOverflowInt& a, const NoOverflowInt& b) { + if (a.is_NaN()) { return a; } + if (b.is_NaN()) { return b; } + return NoOverflowInt((jlong)a.value() * (jlong)b.value()); + } + + friend NoOverflowInt operator<<(const NoOverflowInt& a, const NoOverflowInt& b) { + if (a.is_NaN()) { return a; } + if (b.is_NaN()) { return b; } + jint shift = b.value(); + if (shift < 0 || shift > 31) { return make_NaN(); } + return NoOverflowInt((jlong)a.value() << shift); + } + + friend bool operator==(const NoOverflowInt& a, const NoOverflowInt& b) { + if (a.is_NaN()) { return false; } + if (b.is_NaN()) { return false; } + return a.value() == b.value(); + } + + NoOverflowInt abs() const { + if (is_NaN()) { return *this; } + if (value() >= 0) { return *this; } + return NoOverflowInt(0) - *this; + } + + bool is_multiple_of(const NoOverflowInt& other) const { + NoOverflowInt a = this->abs(); + NoOverflowInt b = other.abs(); + if (a.is_NaN()) { return false; } + if (b.is_NaN()) { return false; } + if (b.is_zero()) { return false; } + return a.value() % b.value() == 0; + } + +#ifndef PRODUCT + void print_on(outputStream* st) const { + if (is_NaN()) { + st->print("NaN"); + } else { + st->print("%d", value()); + } + } +#endif +}; + +#endif // SHARE_OPTO_NOOVERFLOWINT_HPP diff --git a/src/hotspot/share/opto/node.hpp b/src/hotspot/share/opto/node.hpp index 1a8b0d0296f..2ddd824798c 100644 --- a/src/hotspot/share/opto/node.hpp +++ b/src/hotspot/share/opto/node.hpp @@ -61,6 +61,7 @@ class CmpNode; class CodeBuffer; class ConstraintCastNode; class ConNode; +class ConINode; class CompareAndSwapNode; class CompareAndExchangeNode; class CountedLoopNode; @@ -689,6 +690,8 @@ class Node { #if INCLUDE_SHENANDOAHGC DEFINE_CLASS_ID(ShenandoahBarrier, Type, 7) #endif + DEFINE_CLASS_ID(Con, Type, 8) + DEFINE_CLASS_ID(ConI, Con, 0) DEFINE_CLASS_ID(Proj, Node, 3) DEFINE_CLASS_ID(CatchProj, Proj, 0) @@ -825,6 +828,7 @@ class Node { DEFINE_CLASS_QUERY(CatchProj) DEFINE_CLASS_QUERY(CheckCastPP) DEFINE_CLASS_QUERY(CastII) + DEFINE_CLASS_QUERY(ConI) DEFINE_CLASS_QUERY(ConstraintCast) DEFINE_CLASS_QUERY(ClearArray) DEFINE_CLASS_QUERY(CMove) diff --git a/src/hotspot/share/opto/phaseX.cpp b/src/hotspot/share/opto/phaseX.cpp index 9cf53dc10e0..bcce06fb259 100644 --- a/src/hotspot/share/opto/phaseX.cpp +++ b/src/hotspot/share/opto/phaseX.cpp @@ -2230,7 +2230,15 @@ void PhasePeephole::print_statistics() { //------------------------------set_req_X-------------------------------------- void Node::set_req_X( uint i, Node *n, PhaseIterGVN *igvn ) { assert( is_not_dead(n), "can not use dead node"); - assert( igvn->hash_find(this) != this, "Need to remove from hash before changing edges" ); 
+#ifdef ASSERT + if (igvn->hash_find(this) == this) { + tty->print_cr("Need to remove from hash before changing edges"); + this->dump(1); + tty->print_cr("Set at i = %d", i); + n->dump(); + assert(false, "Need to remove from hash before changing edges"); + } +#endif Node *old = in(i); set_req(i, n); diff --git a/test/hotspot/gtest/opto/test_no_overflow_int.cpp b/test/hotspot/gtest/opto/test_no_overflow_int.cpp new file mode 100644 index 00000000000..7b4b4259bb8 --- /dev/null +++ b/test/hotspot/gtest/opto/test_no_overflow_int.cpp @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "opto/noOverflowInt.hpp" +#include "unittest.hpp" + +static void check_jlong(const jlong val) { + const NoOverflowInt x(val); + + if (val > max_jint || min_jint > val) { + ASSERT_TRUE(x.is_NaN()); + } else { + ASSERT_FALSE(x.is_NaN()); + ASSERT_EQ(x.value(), val); + } +} + +TEST_VM(opto, NoOverflowInt_check_jlong) { + jlong start = (jlong)min_jint - 10000LL; + jlong end = (jlong)max_jint + 10000LL; + for (jlong i = start; i < end; i+= 1000LL) { + check_jlong(i); + } + + check_jlong((jlong)min_jint - 1LL); + check_jlong((jlong)min_jint); + check_jlong((jlong)min_jint + 1LL); + check_jlong((jlong)max_jint - 1LL); + check_jlong((jlong)max_jint); + check_jlong((jlong)max_jint + 1LL); + + const NoOverflowInt nan; + ASSERT_TRUE(nan.is_NaN()); +} + +TEST_VM(opto, NoOverflowInt_add_sub) { + const NoOverflowInt nan; + const NoOverflowInt zero(0); + const NoOverflowInt one(1); + const NoOverflowInt two(2); + const NoOverflowInt big(1 << 30); + + ASSERT_EQ((one + two).value(), 3); + ASSERT_EQ((one - two).value(), -1); + ASSERT_TRUE((nan + one).is_NaN()); + ASSERT_TRUE((one + nan).is_NaN()); + ASSERT_TRUE((nan + nan).is_NaN()); + ASSERT_TRUE((nan - one).is_NaN()); + ASSERT_TRUE((one - nan).is_NaN()); + ASSERT_TRUE((nan - nan).is_NaN()); + + ASSERT_EQ((big + one).value(), (1 << 30) + 1); + ASSERT_TRUE((big + big).is_NaN()); + ASSERT_EQ((big - one).value(), (1 << 30) - 1); + ASSERT_EQ((big - big).value(), 0); + + ASSERT_EQ((big - one + big).value(), max_jint); + ASSERT_EQ((zero - big - big).value(), min_jint); + ASSERT_TRUE((zero - big - big - one).is_NaN()); +} + +TEST_VM(opto, NoOverflowInt_mul) { + const NoOverflowInt nan; + const NoOverflowInt zero(0); + const NoOverflowInt one(1); + const NoOverflowInt two(2); + const NoOverflowInt big(1 << 30); + + ASSERT_EQ((one * two).value(), 2); + ASSERT_TRUE((nan * one).is_NaN()); + ASSERT_TRUE((one * nan).is_NaN()); + 
ASSERT_TRUE((nan * nan).is_NaN());
+
+  ASSERT_EQ((big * one).value(), (1 << 30));
+  ASSERT_EQ((one * big).value(), (1 << 30));
+  ASSERT_EQ((big * zero).value(), 0);
+  ASSERT_EQ((zero * big).value(), 0);
+  ASSERT_TRUE((big * big).is_NaN());
+  ASSERT_TRUE((big * two).is_NaN());
+
+  ASSERT_EQ(((big - one) * two).value(), max_jint - 1);
+  ASSERT_EQ(((one - big) * two).value(), min_jint + 2);
+  ASSERT_EQ(((zero - big) * two).value(), min_jint);
+  ASSERT_TRUE(((big + one) * two).is_NaN());
+  ASSERT_TRUE(((zero - big - one) * two).is_NaN());
+}
+
+TEST_VM(opto, NoOverflowInt_lshift) {
+  const NoOverflowInt nan;
+  const NoOverflowInt zero(0);
+  const NoOverflowInt one(1);
+  const NoOverflowInt two(2);
+  const NoOverflowInt big(1 << 30);
+
+  for (int i = 0; i < 31; i++) {
+    ASSERT_EQ((one << NoOverflowInt(i)).value(), 1LL << i);
+  }
+  for (int i = 31; i < 1000; i++) {
+    ASSERT_TRUE((one << NoOverflowInt(i)).is_NaN());
+  }
+  for (int i = -1000; i < 0; i++) {
+    ASSERT_TRUE((one << NoOverflowInt(i)).is_NaN());
+  }
+
+  ASSERT_EQ((NoOverflowInt(3) << NoOverflowInt(2)).value(), 3 * 4);
+  ASSERT_EQ((NoOverflowInt(11) << NoOverflowInt(5)).value(), 11 * 32);
+  ASSERT_EQ((NoOverflowInt(-13) << NoOverflowInt(4)).value(), -13 * 16);
+}
+
+TEST_VM(opto, NoOverflowInt_misc) {
+  const NoOverflowInt nan;
+  const NoOverflowInt zero(0);
+  const NoOverflowInt one(1);
+  const NoOverflowInt two(2);
+  const NoOverflowInt big(1 << 30);
+
+  // operator==
+  ASSERT_FALSE(nan == nan);
+  ASSERT_FALSE(nan == zero);
+  ASSERT_FALSE(zero == nan);
+  ASSERT_TRUE(zero == zero);
+  ASSERT_TRUE(one == one);
+  ASSERT_TRUE((one + two) == (two + one));
+  ASSERT_TRUE((big + two) == (two + big));
+  ASSERT_FALSE((big + big) == (big + big));
+  ASSERT_TRUE((big - one + big) == (big - one + big));
+
+  // abs
+  // Use a jlong loop variable: the bound 2^31 does not fit in an int.
+  for (jlong i = 0; i < ((jlong)1 << 31); i += 1024) {
+    ASSERT_EQ(NoOverflowInt(i).abs().value(), i);
+    ASSERT_EQ(NoOverflowInt(-i).abs().value(), i);
+  }
+  ASSERT_EQ(NoOverflowInt(max_jint).abs().value(), max_jint);
+  ASSERT_EQ(NoOverflowInt(min_jint + 1).abs().value(), max_jint);
+  ASSERT_TRUE(NoOverflowInt(min_jint).abs().is_NaN());
+  ASSERT_TRUE(NoOverflowInt(nan).abs().is_NaN());
+
+  // is_multiple_of
+  ASSERT_TRUE(one.is_multiple_of(one));
+  ASSERT_FALSE(one.is_multiple_of(nan));
+  ASSERT_FALSE(nan.is_multiple_of(one));
+  ASSERT_FALSE(nan.is_multiple_of(nan));
+  for (jlong i = 0; i < ((jlong)1 << 31); i += 1023) {
+    ASSERT_TRUE(NoOverflowInt(i).is_multiple_of(one));
+    ASSERT_TRUE(NoOverflowInt(-i).is_multiple_of(one));
+    ASSERT_FALSE(NoOverflowInt(i).is_multiple_of(zero));
+    ASSERT_FALSE(NoOverflowInt(-i).is_multiple_of(zero));
+  }
+  ASSERT_TRUE(NoOverflowInt(33 * 7).is_multiple_of(NoOverflowInt(33)));
+  ASSERT_TRUE(NoOverflowInt(13 * 5).is_multiple_of(NoOverflowInt(5)));
+  ASSERT_FALSE(NoOverflowInt(7).is_multiple_of(NoOverflowInt(5)));
+}
+
diff --git a/test/hotspot/jtreg/compiler/c2/TestMergeStoresNullAdrType.java b/test/hotspot/jtreg/compiler/c2/TestMergeStoresNullAdrType.java
new file mode 100644
index 00000000000..f267c14a733
--- /dev/null
+++ b/test/hotspot/jtreg/compiler/c2/TestMergeStoresNullAdrType.java
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package compiler.c2; + +/* + * @test + * @bug 8318446 8331085 + * @summary Test merge stores, when "adr_type() == nullptr" because of TOP somewhere in the address. + * @run main/othervm -XX:CompileCommand=compileonly,compiler.c2.TestMergeStoresNullAdrType::test + * -XX:-TieredCompilation -Xcomp + * compiler.c2.TestMergeStoresNullAdrType + * @run main compiler.c2.TestMergeStoresNullAdrType + */ + +public class TestMergeStoresNullAdrType { + static int arr[] = new int[100]; + + static void test() { + boolean b = false; + for (int k = 269; k > 10; --k) { + b = b; + int j = 6; + while ((j -= 3) > 0) { + if (b) { + } else { + arr[j] >>= 2; + } + } + } + } + + public static void main(String[] args) { + test(); + } +} diff --git a/test/hotspot/jtreg/compiler/c2/TestMergeStoresUnsafeArrayPointer.java b/test/hotspot/jtreg/compiler/c2/TestMergeStoresUnsafeArrayPointer.java new file mode 100644 index 00000000000..3b65272c3c7 --- /dev/null +++ b/test/hotspot/jtreg/compiler/c2/TestMergeStoresUnsafeArrayPointer.java @@ -0,0 +1,324 @@ +/* + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + * @test + * @bug 8335390 + * @summary Test merge stores for some Unsafe store address patterns. 
+ * @modules java.base/jdk.internal.misc + * @requires vm.bits == 64 + * @requires os.maxMemory > 8G + * @run main/othervm -XX:CompileCommand=compileonly,compiler.c2.TestMergeStoresUnsafeArrayPointer::test* + * -Xbatch + * -Xmx8g + * compiler.c2.TestMergeStoresUnsafeArrayPointer + * @run main/othervm -Xmx8g + * compiler.c2.TestMergeStoresUnsafeArrayPointer + */ + +package compiler.c2; +import jdk.internal.misc.Unsafe; + +public class TestMergeStoresUnsafeArrayPointer { + static final Unsafe UNSAFE = Unsafe.getUnsafe(); + + // We allocate a big int array of length: + static final int SIZE = (1 << 30) + 100; + + // This gives us a memory region of 4x as many bytes: + static final long BYTE_SIZE = 4L * SIZE; // = 1L << 32 + 400L + + // We set an "anchor" in the middle of this memory region, in bytes: + static final long ANCHOR = BYTE_SIZE / 2; + + static int four = 4; + static int max_int = Integer.MAX_VALUE; + static int min_int = Integer.MIN_VALUE; + static int val_2_to_30 = (1 << 30); + static int large_by_53 = (int)((1L << 31) / 53L + 1L); + + public static void main(String[] args) { + System.out.println("Allocate big array of SIZE = " + SIZE); + int[] big = new int[SIZE]; + + // Each test is executed a few times, so that we can see the difference between + // interpreter and compiler. + int errors = 0; + + long val = 0; + System.out.println("test1"); + for (int i = 0; i < 100_000; i++) { + testClear(big); + test1(big, ANCHOR); + long sum = testSum(big); + if (i == 0) { + val = sum; + } else { + if (sum != val) { + System.out.println("ERROR: test1 had wrong value: " + val + " != " + sum); + errors++; + break; + } + } + } + + val = 0; + System.out.println("test2"); + for (int i = 0; i < 100_000; i++) { + testClear(big); + test2(big, ANCHOR); + long sum = testSum(big); + if (i == 0) { + val = sum; + } else { + if (sum != val) { + System.out.println("ERROR: test2 had wrong value: " + val + " != " + sum); + errors++; + break; + } + } + } + + val = 0; + System.out.println("test3"); + for (int i = 0; i < 100_000; i++) { + testClear(big); + test3(big, ANCHOR); + long sum = testSum(big); + if (i == 0) { + val = sum; + } else { + if (sum != val) { + System.out.println("ERROR: test3 had wrong value: " + val + " != " + sum); + errors++; + break; + } + } + } + + val = 0; + System.out.println("test4"); + for (int i = 0; i < 100_000; i++) { + testClear(big); + test4(big, ANCHOR); + long sum = testSum(big); + if (i == 0) { + val = sum; + } else { + if (sum != val) { + System.out.println("ERROR: test4 had wrong value: " + val + " != " + sum); + errors++; + break; + } + } + } + + val = 0; + System.out.println("test5"); + for (int i = 0; i < 100_000; i++) { + testClear(big); + test5(big, ANCHOR); + long sum = testSum(big); + if (i == 0) { + val = sum; + } else { + if (sum != val) { + System.out.println("ERROR: test5 had wrong value: " + val + " != " + sum); + errors++; + break; + } + } + } + + val = 0; + System.out.println("test6"); + for (int i = 0; i < 100_000; i++) { + testClear(big); + test6(big, ANCHOR); + long sum = testSum(big); + if (i == 0) { + val = sum; + } else { + if (sum != val) { + System.out.println("ERROR: test6 had wrong value: " + val + " != " + sum); + errors++; + break; + } + } + } + + val = 0; + System.out.println("test7"); + for (int i = 0; i < 100_000; i++) { + testClear(big); + test7(big, ANCHOR); + long sum = testSum(big); + if (i == 0) { + val = sum; + } else { + if (sum != val) { + System.out.println("ERROR: test7 had wrong value: " + val + " != " + sum); + errors++; + break; + 
                }
+            }
+        }
+
+        // No result verification here. We only want to make sure we do not hit asserts.
+        System.out.println("test8 and test9");
+        for (int i = 0; i < 100_000; i++) {
+            test8a(big, ANCHOR);
+            test8b(big, ANCHOR);
+            test8c(big, ANCHOR);
+            test8d(big, ANCHOR);
+            test9a(big, ANCHOR);
+            test9b(big, ANCHOR);
+            test9c(big, ANCHOR);
+        }
+
+        if (errors > 0) {
+            throw new RuntimeException("ERRORS: " + errors);
+        }
+        System.out.println("PASSED");
+    }
+
+    // Only clear and sum over relevant parts of array to make the test fast.
+    static void testClear(int[] a) {
+        for (int j = 0               ; j < 100;              j++) { a[j] = j; }
+        for (int j = a.length/2 - 100; j < a.length/2 + 100; j++) { a[j] = j; }
+        for (int j = a.length - 100  ; j < a.length + 0;     j++) { a[j] = j; }
+    }
+
+    static long testSum(int[] a) {
+        long sum = 0;
+        for (int j = 0               ; j < 100;              j++) { sum += a[j]; }
+        for (int j = a.length/2 - 100; j < a.length/2 + 100; j++) { sum += a[j]; }
+        for (int j = a.length - 100  ; j < a.length + 0;     j++) { sum += a[j]; }
+        return sum;
+    }
+
+    // Reference: expected to merge.
+    static void test1(int[] a, long anchor) {
+        long base = UNSAFE.ARRAY_INT_BASE_OFFSET + anchor;
+        UNSAFE.putInt(a, base + 0, 0x42424242);
+        UNSAFE.putInt(a, base + 4, 0x66666666);
+    }
+
+    // Test: if MergeStores is applied this can lead to wrong results
+    static void test2(int[] a, long anchor) {
+        long base = UNSAFE.ARRAY_INT_BASE_OFFSET + anchor;
+        UNSAFE.putInt(a, base + 0 + (long)(four + Integer.MAX_VALUE), 0x42424242);
+        UNSAFE.putInt(a, base + Integer.MAX_VALUE + (long)(four + 4), 0x66666666);
+    }
+
+    // Test: if MergeStores is applied this can lead to wrong results
+    //       -> AddI needs overflow check.
+    static void test3(int[] a, long anchor) {
+        long base = UNSAFE.ARRAY_INT_BASE_OFFSET + anchor;
+        UNSAFE.putInt(a, base + (long)(max_int + 0), 0x42424242);
+        UNSAFE.putInt(a, base + (long)(max_int + 4), 0x66666666);
+    }
+
+    // Test: "min_int - four" cannot be parsed further, but would not make a difference here.
+    static void test4(int[] a, long anchor) {
+        long base = UNSAFE.ARRAY_INT_BASE_OFFSET + anchor;
+        UNSAFE.putInt(a, base + (long)(min_int - four) + 0, 0x42424242);
+        UNSAFE.putInt(a, base + (long)(min_int - four) + 4, 0x66666666);
+    }
+
+    // Test: if MergeStores is applied this can lead to wrong results
+    //       -> SubI needs overflow check.
+    static void test5(int[] a, long anchor) {
+        long base = UNSAFE.ARRAY_INT_BASE_OFFSET + anchor;
+        UNSAFE.putInt(a, base + (long)(min_int) - (long)(four) + 0, 0x42424242); // no overflow
+        UNSAFE.putInt(a, base + (long)(min_int - four) + 4, 0x66666666); // overflow
+    }
+
+    // Test: if MergeStores is applied this can lead to wrong results
+    //       -> LShiftI needs overflow check.
+    static void test6(int[] a, long anchor) {
+        long base = UNSAFE.ARRAY_INT_BASE_OFFSET + anchor;
+        UNSAFE.putInt(a, base + (long)(2 * val_2_to_30) + 0, 0x42424242); // overflow
+        UNSAFE.putInt(a, base + 2L * (long)(val_2_to_30) + 4, 0x66666666); // no overflow
+    }
+
+    // Test: if MergeStores is applied this can lead to wrong results
+    //       -> MulI needs overflow check.
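+    //       ("53 * large_by_53" overflows the int range, while "53L * (long)large_by_53"
+    //       does not, so the two stores in test7 are in fact not adjacent.)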
+ static void test7(int[] a, long anchor) { + long base = UNSAFE.ARRAY_INT_BASE_OFFSET + anchor; + UNSAFE.putInt(a, base + (long)(53 * large_by_53) + 0, 0x42424242); // overflow + UNSAFE.putInt(a, base + 53L * (long)(large_by_53) + 4, 0x66666666); // no overflow + } + + // Test: check if large distance leads to assert + static void test8a(int[] a, long anchor) { + long base = UNSAFE.ARRAY_INT_BASE_OFFSET + anchor; + UNSAFE.putByte(a, base + (1L << 11) + 0, (byte)42); + UNSAFE.putByte(a, base + (1L << 11) + (1L << 30), (byte)11); + } + + // Test: check if large distance leads to assert + static void test8b(int[] a, long anchor) { + long base = UNSAFE.ARRAY_INT_BASE_OFFSET + anchor; + UNSAFE.putByte(a, base + (1L << 11) + (1L << 30), (byte)11); + UNSAFE.putByte(a, base + (1L << 11) + 0, (byte)42); + } + + // Test: check if large distance leads to assert + static void test8c(int[] a, long anchor) { + long base = UNSAFE.ARRAY_INT_BASE_OFFSET + anchor; + UNSAFE.putByte(a, base - (1L << 11) - 0, (byte)42); + UNSAFE.putByte(a, base - (1L << 11) - (1L << 30), (byte)11); + } + + // Test: check if large distance leads to assert + static void test8d(int[] a, long anchor) { + long base = UNSAFE.ARRAY_INT_BASE_OFFSET + anchor; + UNSAFE.putByte(a, base - (1L << 11) - (1L << 30), (byte)11); + UNSAFE.putByte(a, base - (1L << 11) - 0, (byte)42); + } + + // Test: check if large distance leads to assert + // case: bad distance: NaN + static void test9a(int[] a, long anchor) { + long base = UNSAFE.ARRAY_INT_BASE_OFFSET + anchor; + UNSAFE.putByte(a, base - 100, (byte)42); + UNSAFE.putByte(a, base - 100 + (1L << 31), (byte)11); + } + + // Test: check if large distance leads to assert + // case: just before NaN, it is still a valid distance for MemPointer aliasing. + static void test9b(int[] a, long anchor) { + long base = UNSAFE.ARRAY_INT_BASE_OFFSET + anchor; + UNSAFE.putByte(a, base - 100, (byte)42); + UNSAFE.putByte(a, base - 100 + (1L << 31) - 1, (byte)11); + } + + // Test: check if large distance leads to assert + // case: constant too large + static void test9c(int[] a, long anchor) { + long base = UNSAFE.ARRAY_INT_BASE_OFFSET + anchor; + UNSAFE.putByte(a, base, (byte)42); + UNSAFE.putByte(a, base + (1L << 31), (byte)11); + } +} diff --git a/test/hotspot/jtreg/compiler/c2/TestUnalignedAccess.java b/test/hotspot/jtreg/compiler/c2/TestUnalignedAccess.java new file mode 100644 index 00000000000..d05dbad4a73 --- /dev/null +++ b/test/hotspot/jtreg/compiler/c2/TestUnalignedAccess.java @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2024, Arm Limited. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package compiler.c2; + +import jdk.internal.misc.Unsafe; +import jdk.test.lib.Asserts; + +/** + * @test TestUnalignedAccess + * @summary AArch64: C2 compilation hits offset_ok_for_immed: assert "c2 compiler bug". + * @bug 8319690 + * @library /test/lib + * @modules java.base/jdk.internal.misc + * @run main/othervm compiler.c2.TestUnalignedAccess + * @run main/othervm -Xcomp -XX:-TieredCompilation -Xmx1g + * -XX:CompileCommand=compileonly,compiler.c2.TestUnalignedAccess*:: + * compiler.c2.TestUnalignedAccess + */ + +public class TestUnalignedAccess { + + public static final int LEN = 2040; + + static final Unsafe UNSAFE = Unsafe.getUnsafe(); + static void sink(int x) {} + + public static long lseed = 1; + public static int iseed = 2; + public static short sseed = 3; + public static byte bseed = 4; + public static long lres = lseed; + public static int ires = iseed; + public static short sres = sseed; + public static byte bres = bseed; + + public static class TestLong { + + private static final byte[] BYTES = new byte[LEN]; + private static final long rawdata = 0xbeef; + private static final long data; + + static { + sink(2); + // Signed immediate byte offset: range -256 to 255 + // Positive immediate byte offset: a multiple of 8 in the range 0 to 32760 + // Other immediate byte offsets can't be encoded in the instruction field. + + // 1030 can't be encoded as "base + offset" mode into the instruction field. + UNSAFE.putLongUnaligned(BYTES, 1030, rawdata); + lres += UNSAFE.getLongUnaligned(BYTES, 1030); + // 127 can be encoded into simm9 field. + UNSAFE.putLongUnaligned(BYTES, 127, lres); + lres += UNSAFE.getLongUnaligned(BYTES, 127); + // 1096 can be encoded into uimm12 field. + UNSAFE.putLongUnaligned(BYTES, 1096, lres); + data = UNSAFE.getLongUnaligned(BYTES, 1096); + } + + } + + public static class TestInt { + + private static final byte[] BYTES = new byte[LEN]; + private static final int rawdata = 0xbeef; + private static final int data; + static { + sink(2); + // Signed immediate byte offset: range -256 to 255 + // Positive immediate byte offset, a multiple of 4 in the range 0 to 16380 + // Other immediate byte offsets can't be encoded in the instruction field. + + // 274 can't be encoded as "base + offset" mode into the instruction field. + UNSAFE.putIntUnaligned(BYTES, 274, rawdata); + ires += UNSAFE.getIntUnaligned(BYTES, 274); + // 255 can be encoded into simm9 field. + UNSAFE.putIntUnaligned(BYTES, 255, ires); + ires += UNSAFE.getIntUnaligned(BYTES, 255); + // 528 can be encoded into uimm12 field. + UNSAFE.putIntUnaligned(BYTES, 528, ires); + data = UNSAFE.getIntUnaligned(BYTES, 528); + } + + } + + public static class TestShort { + + private static final byte[] BYTES = new byte[LEN]; + private static final short rawdata = (short)0xbeef; + private static final short data; + static { + sink(2); + // Signed immediate byte offset: range -256 to 255 + // Positive immediate byte offset: a multiple of 2 in the range 0 to 8190 + // Other immediate byte offsets can't be encoded in the instruction field. + + // 257 can't be encoded as "base + offset" mode into the instruction field. + UNSAFE.putShortUnaligned(BYTES, 257, rawdata); + sres = (short) (sres + UNSAFE.getShortUnaligned(BYTES, 257)); + // 253 can be encoded into simm9 field. 
+            UNSAFE.putShortUnaligned(BYTES, 253, sres);
+            sres = (short) (sres + UNSAFE.getShortUnaligned(BYTES, 253));
+            // 272 can be encoded into uimm12 field.
+            UNSAFE.putShortUnaligned(BYTES, 272, sres);
+            data = UNSAFE.getShortUnaligned(BYTES, 272);
+        }
+
+    }
+
+    public static class TestByte {
+
+        private static final byte[] BYTES = new byte[LEN];
+        private static final byte rawdata = (byte)0x3f;
+        private static final byte data;
+        static {
+            sink(2);
+            // Signed immediate byte offset: range -256 to 255
+            // Positive immediate byte offset: range 0 to 4095
+            // Other immediate byte offsets can't be encoded in the instruction field.
+
+            // 272 can be encoded into uimm12 field (it is out of the simm9 range).
+            UNSAFE.putByte(BYTES, 272, rawdata);
+            bres = (byte) (bres + UNSAFE.getByte(BYTES, 272));
+            // 53 can be encoded into simm9 field.
+            UNSAFE.putByte(BYTES, 53, bres);
+            bres = (byte) (bres + UNSAFE.getByte(BYTES, 53));
+            // 1027 can be encoded into uimm12 field.
+            UNSAFE.putByte(BYTES, 1027, bres);
+            data = UNSAFE.getByte(BYTES, 1027);
+        }
+
+    }
+
+    static void test() {
+        TestLong ta = new TestLong();
+        Asserts.assertEquals(ta.data, (ta.rawdata + lseed) * 2, "putUnaligned long failed!");
+
+        TestInt tb = new TestInt();
+        Asserts.assertEquals(tb.data, (tb.rawdata + iseed) * 2, "putUnaligned int failed!");
+
+        TestShort tc = new TestShort();
+        Asserts.assertEquals(tc.data, (short) (((short) (tc.rawdata + sseed)) * 2), "putUnaligned short failed!");
+
+        TestByte td = new TestByte();
+        Asserts.assertEquals(td.data, (byte) (((byte) (td.rawdata + bseed)) * 2), "put byte failed!");
+    }
+
+    public static void main(String[] strArr) {
+        test();
+    }
+}
diff --git a/test/micro/org/openjdk/bench/vm/compiler/MergeStoreBench.java b/test/micro/org/openjdk/bench/vm/compiler/MergeStoreBench.java
new file mode 100644
index 00000000000..26c8287c4de
--- /dev/null
+++ b/test/micro/org/openjdk/bench/vm/compiler/MergeStoreBench.java
@@ -0,0 +1,1132 @@
+/*
+ * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2024, Alibaba Group Holding Limited. All Rights Reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */ +package org.openjdk.bench.vm.compiler; + +import org.openjdk.jmh.annotations.*; +import org.openjdk.jmh.infra.Blackhole; +import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.options.Options; +import org.openjdk.jmh.runner.options.OptionsBuilder; + +import java.lang.invoke.MethodHandles; +import java.lang.invoke.VarHandle; +import java.lang.reflect.Field; +import java.nio.ByteOrder; +import java.util.Random; +import java.util.concurrent.TimeUnit; + +import jdk.internal.misc.Unsafe; + +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@State(Scope.Thread) +@Warmup(iterations = 10, time = 500, timeUnit = TimeUnit.MILLISECONDS) +@Measurement(iterations = 5, time = 1000, timeUnit = TimeUnit.MILLISECONDS) +@Fork(value = 3, jvmArgsAppend = {"--add-exports", "java.base/jdk.internal.misc=ALL-UNNAMED"}) +public class MergeStoreBench { + private static final Unsafe UNSAFE = Unsafe.getUnsafe(); + + final static VarHandle INT_L = MethodHandles.byteArrayViewVarHandle(int[].class , ByteOrder.LITTLE_ENDIAN); + final static VarHandle INT_B = MethodHandles.byteArrayViewVarHandle(int[].class , ByteOrder.BIG_ENDIAN); + final static VarHandle LONG_L = MethodHandles.byteArrayViewVarHandle(long[].class, ByteOrder.LITTLE_ENDIAN); + final static VarHandle LONG_B = MethodHandles.byteArrayViewVarHandle(long[].class, ByteOrder.BIG_ENDIAN); + final static VarHandle CHAR_L = MethodHandles.byteArrayViewVarHandle(char[].class, ByteOrder.LITTLE_ENDIAN); + final static VarHandle CHAR_B = MethodHandles.byteArrayViewVarHandle(char[].class, ByteOrder.BIG_ENDIAN); + + final static int NUMBERS = 8192; + + final byte[] bytes4 = new byte[NUMBERS * 4]; + final byte[] bytes8 = new byte[NUMBERS * 8]; + final int [] ints = new int [NUMBERS ]; + final long[] longs = new long[NUMBERS ]; + final char[] chars = new char[NUMBERS ]; + + @Setup + public void setup() { + Random r = new Random(); + for (int i = 0; i < ints.length; i++) { + ints[i] = r.nextInt(); + INT_L.set(bytes4, i * 4, i); + } + + for (int i = 0; i < longs.length; i++) { + longs[i] = r.nextLong(); + LONG_L.set(bytes8, i * 8, i); + } + } + + /* + * The names of these cases have the following `B/L/V/U` suffixes, which are: + * ``` + * B BigEndian + * L LittleEndian + * V VarHandle + * U Unsafe + * R ReverseBytes + * C Unsafe.getChar & putChar + * S Unsafe.getShort & putShort + * ``` + */ + + @Benchmark + public void getIntB(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + sum += getIntB(bytes4, i * 4); + } + BH.consume(sum); + } + + @Benchmark + public void getIntBU(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + sum += getIntBU(bytes4, i * 4); + } + BH.consume(sum); + } + + @Benchmark + public void getIntBV(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + sum += (int) INT_B.get(bytes4, i * 4); + } + BH.consume(sum); + } + + @Benchmark + public void getIntL(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + sum += getIntL(bytes4, i * 4); + } + BH.consume(sum); + } + + @Benchmark + public void getIntLU(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + sum += getIntLU(bytes4, i * 4); + } + BH.consume(sum); + } + + @Benchmark + public void getIntLV(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + sum += (int) INT_L.get(bytes4, i * 4); + } + BH.consume(sum); + } + + @Benchmark + public void getIntRB(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + sum += 
getIntRB(bytes4, i * 4); + } + BH.consume(sum); + } + + @Benchmark + public void getIntRBU(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + sum += getIntRBU(bytes4, i * 4); + } + BH.consume(sum); + } + + @Benchmark + public void getIntRL(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + sum += getIntRL(bytes4, i * 4); + } + BH.consume(sum); + } + + @Benchmark + public void getIntRLU(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + sum += getIntRLU(bytes4, i * 4); + } + BH.consume(sum); + } + + @Benchmark + public void getIntRU(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + sum += Integer.reverseBytes( + UNSAFE.getInt(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET + i * 4)); + } + BH.consume(sum); + } + + @Benchmark + public void getIntU(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + sum += UNSAFE.getInt(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET + i * 4); + } + BH.consume(sum); + } + + @Benchmark + public void setIntB(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + int v = ints[i]; + setIntB(bytes4, i * 4, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setIntBU(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + int v = ints[i]; + setIntBU(bytes4, i * 4, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setIntBV(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + int v = ints[i]; + INT_B.set(bytes4, i * 4, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setIntL(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + int v = ints[i]; + setIntL(bytes4, i * 4, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setIntLU(Blackhole BH) { + int sum = 0; + for (int i = 0; i < ints.length; i++) { + int v = ints[i]; + setIntLU(bytes4, i * 4, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setIntLV(Blackhole BH) { + long sum = 0; + for (int i = 0; i < ints.length; i++) { + int v = ints[i]; + INT_L.set(bytes4, i * 4, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setIntRB(Blackhole BH) { + long sum = 0; + for (int i = 0; i < ints.length; i++) { + int v = ints[i]; + setIntRB(bytes4, i * 4, ints[i]); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setIntRBU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < ints.length; i++) { + int v = ints[i]; + setIntRBU(bytes4, i * 4, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setIntRL(Blackhole BH) { + long sum = 0; + for (int i = 0; i < ints.length; i++) { + int v = ints[i]; + setIntRL(bytes4, i * 4, ints[i]); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setIntRLU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < ints.length; i++) { + int v = ints[i]; + setIntRLU(bytes4, i * 4, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setIntRU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < ints.length; i++) { + int v = ints[i]; + v = Integer.reverseBytes(v); + UNSAFE.putInt(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET + i * 4, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setIntU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < ints.length; i++) { + int v = ints[i]; + UNSAFE.putInt(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET + i * 4, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public 
void getLongB(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + sum += getLongB(bytes8, i * 8); + } + BH.consume(sum); + } + + @Benchmark + public void getLongBU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + sum += getLongBU(bytes8, i * 8); + } + BH.consume(sum); + } + + @Benchmark + public void getLongBV(Blackhole BH) { + long sum = 0; + for (int i = 0; i < ints.length; i++) { + sum += (long) LONG_B.get(bytes8, i * 8); + } + BH.consume(sum); + } + + @Benchmark + public void getLongL(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + sum += getLongL(bytes8, i * 8); + } + BH.consume(sum); + } + + @Benchmark + public void getLongLU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + sum += getLongLU(bytes8, i * 8); + } + BH.consume(sum); + } + + @Benchmark + public void getLongLV(Blackhole BH) { + long sum = 0; + for (int i = 0; i < ints.length; i++) { + sum += (long) LONG_L.get(bytes8, i * 8); + } + BH.consume(sum); + } + + @Benchmark + public void getLongRB(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + sum += getLongRB(bytes8, i * 8); + } + BH.consume(sum); + } + + @Benchmark + public void getLongRBU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + sum += getLongRBU(bytes8, i * 8); + } + BH.consume(sum); + } + + @Benchmark + public void getLongRL(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + sum += getLongRL(bytes8, i * 8); + } + BH.consume(sum); + } + + @Benchmark + public void getLongRLU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + sum += getLongRLU(bytes8, i * 8); + } + BH.consume(sum); + } + + @Benchmark + public void getLongRU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + sum += Long.reverseBytes( + UNSAFE.getLong(bytes8, Unsafe.ARRAY_BYTE_BASE_OFFSET + i * 8)); + } + BH.consume(sum); + } + + @Benchmark + public void getLongU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + sum += UNSAFE.getLong(bytes8, Unsafe.ARRAY_BYTE_BASE_OFFSET + i * 8); + } + BH.consume(sum); + } + + @Benchmark + public void setLongB(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + long v = longs[i]; + setLongB(bytes8, i * 8, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setLongBU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + long v = longs[i]; + setLongBU(bytes8, i * 8, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setLongBV(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + long v = longs[i]; + LONG_B.set(bytes8, i * 8, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setLongL(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + long v = longs[i]; + setLongL(bytes8, i * 8, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setLongLU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + long v = longs[i]; + setLongLU(bytes8, i * 8, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setLongLV(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + long v = longs[i]; + LONG_L.set(bytes8, i * 8, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setLongRB(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + long v = 
longs[i]; + setLongRB(bytes8, i * 8, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setLongRBU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + long v = longs[i]; + setLongRBU(bytes8, i * 8, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setLongRL(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + long v = longs[i]; + setLongRL(bytes8, i * 8, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setLongRLU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + long v = longs[i]; + setLongRLU(bytes8, i * 8, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setLongRU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + long v = longs[i]; + v = Long.reverseBytes(v); + UNSAFE.putLong(bytes8, Unsafe.ARRAY_BYTE_BASE_OFFSET + i * 8, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void setLongU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + long v = longs[i]; + UNSAFE.putLong(bytes8, Unsafe.ARRAY_BYTE_BASE_OFFSET + i * 8, v); + sum += v; + } + BH.consume(sum); + } + + @Benchmark + public void getCharB(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + char c = getCharB(bytes4, i); + sum += c; + } + BH.consume(sum); + } + + @Benchmark + public void getCharBV(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + char c = (char) CHAR_B.get(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET + i * 2); + sum += c; + } + BH.consume(sum); + } + + @Benchmark + public void getCharBU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + char c = getCharBU(bytes4, i); + sum += c; + } + BH.consume(sum); + } + + @Benchmark + public void getCharL(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + char c = getCharL(bytes4, i); + sum += c; + } + BH.consume(sum); + } + @Benchmark + public void getCharLU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + char c = getCharLU(bytes4, i); + sum += c; + } + BH.consume(sum); + } + + + @Benchmark + public void getCharLV(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + char c = (char) CHAR_L.get(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET + i * 2); + sum += c; + } + BH.consume(sum); + } + + @Benchmark + public void getCharC(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + char c = UNSAFE.getChar(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET + i * 2); + sum += c; + } + BH.consume(sum); + } + + @Benchmark + public void setCharBS(Blackhole BH) { + long sum = 0; + for (int i = 0; i < chars.length; i++) { + char c = chars[i]; + putShortB(bytes4, i * 2, c); + sum += c; + } + BH.consume(sum); + } + + @Benchmark + public void setCharBV(Blackhole BH) { + long sum = 0; + for (int i = 0; i < chars.length; i++) { + char c = chars[i]; + CHAR_B.set(bytes4, i * 2, c); + sum += c; + } + BH.consume(sum); + } + + @Benchmark + public void setCharLS(Blackhole BH) { + long sum = 0; + for (int i = 0; i < chars.length; i++) { + char c = chars[i]; + putShortL(bytes4, i * 2, c); + sum += c; + } + BH.consume(sum); + } + + @Benchmark + public void setCharLV(Blackhole BH) { + long sum = 0; + for (int i = 0; i < chars.length; i++) { + char c = chars[i]; + CHAR_L.set(bytes4, i * 2, c); + sum += c; + } + BH.consume(sum); + } + + @Benchmark + public void setCharC(Blackhole BH) { + long sum = 0; + for (int i = 0; i < 
chars.length; i++) { + char c = chars[i]; + UNSAFE.putChar(bytes4, Unsafe.ARRAY_BYTE_BASE_OFFSET + i * 2, c); + sum += c; + } + BH.consume(sum); + } + + /* + * putChars4*: tests whether four adjacent constant char stores ('n', 'u', 'l', 'l') + * can be merged into a single larger store + */ + @Benchmark + public void putChars4B(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + putChars4B(bytes8, i * 4); + sum += longs[i]; + } + BH.consume(sum); + } + + @Benchmark + public void putChars4BU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + putChars4BU(bytes8, i * 4); + sum += longs[i]; + } + BH.consume(sum); + } + + @Benchmark + public void putChars4BV(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + putChars4BV(bytes8, i * 4); + sum += longs[i]; + } + BH.consume(sum); + } + + @Benchmark + public void putChars4L(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + putChars4L(bytes8, i * 4); + sum += longs[i]; + } + BH.consume(sum); + } + + @Benchmark + public void putChars4LU(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + putChars4LU(bytes8, i * 4); + sum += longs[i]; + } + BH.consume(sum); + } + + @Benchmark + public void putChars4LV(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + putChars4LV(bytes8, i * 4); + sum += longs[i]; + } + BH.consume(sum); + } + + @Benchmark + public void putChars4C(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + putChars4C(bytes8, i * 4); + sum += longs[i]; + } + BH.consume(sum); + } + + @Benchmark + public void putChars4S(Blackhole BH) { + long sum = 0; + for (int i = 0; i < longs.length; i++) { + putChars4S(bytes8, i * 4); + sum += longs[i]; + } + BH.consume(sum); + } + + static int getIntB(byte[] array, int offset) { + return ((array[offset ] & 0xff) << 24) + | ((array[offset + 1] & 0xff) << 16) + | ((array[offset + 2] & 0xff) << 8) + | ((array[offset + 3] & 0xff) ); + } + + static int getIntBU(byte[] array, int offset) { + final long address = Unsafe.ARRAY_BYTE_BASE_OFFSET + offset; + return ((UNSAFE.getByte(array, address ) & 0xff) << 24) + | ((UNSAFE.getByte(array, address + 1) & 0xff) << 16) + | ((UNSAFE.getByte(array, address + 2) & 0xff) << 8) + | ((UNSAFE.getByte(array, address + 3) & 0xff) ); + } + + static int getIntL(byte[] array, int offset) { + return ((array[offset ] & 0xff) ) + | ((array[offset + 1] & 0xff) << 8) + | ((array[offset + 2] & 0xff) << 16) + | ((array[offset + 3] & 0xff) << 24); + } + + static int getIntRB(byte[] array, int offset) { + return Integer.reverseBytes(getIntB(array, offset)); + } + + static int getIntRBU(byte[] array, int offset) { + return Integer.reverseBytes(getIntBU(array, offset)); + } + + static int getIntRL(byte[] array, int offset) { + return Integer.reverseBytes(getIntL(array, offset)); + } + + static int getIntRLU(byte[] array, int offset) { + return Integer.reverseBytes(getIntLU(array, offset)); + } + + static void setIntB(byte[] array, int offset, int value) { + array[offset ] = (byte) (value >> 24); + array[offset + 1] = (byte) (value >> 16); + array[offset + 2] = (byte) (value >> 8); + array[offset + 3] = (byte) (value ); + } + + static void setIntBU(byte[] array, int offset, int value) { + final long address = Unsafe.ARRAY_BYTE_BASE_OFFSET + offset; + UNSAFE.putByte(array, address , (byte) (value >> 24)); + UNSAFE.putByte(array, address + 1, (byte) (value >> 16)); + UNSAFE.putByte(array, address + 2, (byte) (value >> 8)); + UNSAFE.putByte(array, address + 3, (byte) (value )); +
} + + public static void setIntL(byte[] array, int offset, int value) { + array[offset ] = (byte) value; + array[offset + 1] = (byte) (value >> 8); + array[offset + 2] = (byte) (value >> 16); + array[offset + 3] = (byte) (value >> 24); + } + + public static void setIntLU(byte[] array, int offset, int value) { + final long address = Unsafe.ARRAY_BYTE_BASE_OFFSET + offset; + UNSAFE.putByte(array, address , (byte) value ); + UNSAFE.putByte(array, address + 1, (byte) (value >> 8)); + UNSAFE.putByte(array, address + 2, (byte) (value >> 16)); + UNSAFE.putByte(array, address + 3, (byte) (value >> 24)); + } + + public static void setIntRL(byte[] array, int offset, int value) { + value = Integer.reverseBytes(value); + setIntL(array, offset, value); + } + + public static void setIntRLU(byte[] array, int offset, int value) { + value = Integer.reverseBytes(value); + setIntLU(array, offset, value); + } + + public static void setIntRB(byte[] array, int offset, int value) { + value = Integer.reverseBytes(value); + setIntB(array, offset, value); + } + + public static void setIntRBU(byte[] array, int offset, int value) { + value = Integer.reverseBytes(value); + setIntBU(array, offset, value); + } + + static long getLongB(byte[] array, int offset) { + return (((long) array[offset ] & 0xff) << 56) + | (((long) array[offset + 1] & 0xff) << 48) + | (((long) array[offset + 2] & 0xff) << 40) + | (((long) array[offset + 3] & 0xff) << 32) + | (((long) array[offset + 4] & 0xff) << 24) + | (((long) array[offset + 5] & 0xff) << 16) + | (((long) array[offset + 6] & 0xff) << 8) + | (((long) array[offset + 7] & 0xff) ); + } + + static long getLongBU(byte[] array, int offset) { + final long address = Unsafe.ARRAY_BYTE_BASE_OFFSET + offset; + return (((long)(UNSAFE.getByte(array, address) & 0xff)) << 56) + | (((long)(UNSAFE.getByte(array, address + 1) & 0xff)) << 48) + | (((long)(UNSAFE.getByte(array, address + 2) & 0xff)) << 40) + | (((long)(UNSAFE.getByte(array, address + 3) & 0xff)) << 32) + | (((long)(UNSAFE.getByte(array, address + 4) & 0xff)) << 24) + | (((long)(UNSAFE.getByte(array, address + 5) & 0xff)) << 16) + | (((long)(UNSAFE.getByte(array, address + 6) & 0xff)) << 8) + | (((long)(UNSAFE.getByte(array, address + 7) & 0xff)) ); + } + + public static long getLongL(byte[] array, int offset) { + return (((long) array[offset ] & 0xff) ) + | (((long) array[offset + 1] & 0xff) << 8) + | (((long) array[offset + 2] & 0xff) << 16) + | (((long) array[offset + 3] & 0xff) << 24) + | (((long) array[offset + 4] & 0xff) << 32) + | (((long) array[offset + 5] & 0xff) << 40) + | (((long) array[offset + 6] & 0xff) << 48) + | (((long) array[offset + 7] & 0xff) << 56); + } + + static long getLongLU(byte[] array, int offset) { + final long address = Unsafe.ARRAY_BYTE_BASE_OFFSET + offset; + return (((long)(UNSAFE.getByte(array, address ) & 0xff)) ) + | (((long)(UNSAFE.getByte(array, address + 1) & 0xff)) << 8) + | (((long)(UNSAFE.getByte(array, address + 2) & 0xff)) << 16) + | (((long)(UNSAFE.getByte(array, address + 3) & 0xff)) << 24) + | (((long)(UNSAFE.getByte(array, address + 4) & 0xff)) << 32) + | (((long)(UNSAFE.getByte(array, address + 5) & 0xff)) << 40) + | (((long)(UNSAFE.getByte(array, address + 6) & 0xff)) << 48) + | (((long)(UNSAFE.getByte(array, address + 7) & 0xff)) << 56); + } + + // The "R" variants load the value and then reverse its byte order, like the getIntR* helpers above. + static long getLongRB(byte[] array, int offset) { + return Long.reverseBytes(getLongB(array, offset)); + } + + static long getLongRBU(byte[] array, int offset) { + return Long.reverseBytes(getLongBU(array, offset)); + } + + static long getLongRL(byte[] array, int offset) { + return Long.reverseBytes(getLongL(array, offset)); + } + + static long getLongRLU(byte[] array, int offset) { + return Long.reverseBytes(getLongLU(array, offset)); + } + + static void setLongB(byte[] array, int offset, long value) { + array[offset] = (byte) (value >> 56); + array[offset + 1] = (byte) (value >> 48); + array[offset + 2] = (byte) (value >> 40); + array[offset + 3] = (byte) (value >> 32); + array[offset + 4] = (byte) (value >> 24); + array[offset + 5] = (byte) (value >> 16); + array[offset + 6] = (byte) (value >> 8); + array[offset + 7] = (byte) (value ); + } + + public static void setLongL(byte[] array, int offset, long value) { + array[offset] = (byte) value ; + array[offset + 1] = (byte) (value >> 8 ); + array[offset + 2] = (byte) (value >> 16); + array[offset + 3] = (byte) (value >> 24); + array[offset + 4] = (byte) (value >> 32); + array[offset + 5] = (byte) (value >> 40); + array[offset + 6] = (byte) (value >> 48); + array[offset + 7] = (byte) (value >> 56); + } + + public static void setLongRL(byte[] array, int offset, long value) { + value = Long.reverseBytes(value); + setLongL(array, offset, value); + } + + public static void setLongRLU(byte[] array, int offset, long value) { + value = Long.reverseBytes(value); + setLongLU(array, offset, value); + } + + public static void setLongRB(byte[] array, int offset, long value) { + value = Long.reverseBytes(value); + setLongB(array, offset, value); + } + + public static void setLongRBU(byte[] array, int offset, long value) { + value = Long.reverseBytes(value); + setLongBU(array, offset, value); + } + + public static void setLongBU(byte[] array, int offset, long value) { + final long address = Unsafe.ARRAY_BYTE_BASE_OFFSET + offset; + UNSAFE.putByte(array, address , (byte) (value >> 56)); + UNSAFE.putByte(array, address + 1, (byte) (value >> 48)); + UNSAFE.putByte(array, address + 2, (byte) (value >> 40)); + UNSAFE.putByte(array, address + 3, (byte) (value >> 32)); + UNSAFE.putByte(array, address + 4, (byte) (value >> 24)); + UNSAFE.putByte(array, address + 5, (byte) (value >> 16)); + UNSAFE.putByte(array, address + 6, (byte) (value >> 8)); + UNSAFE.putByte(array, address + 7, (byte) value ); + } + + public static void setLongLU(byte[] array, int offset, long value) { + final long address = Unsafe.ARRAY_BYTE_BASE_OFFSET + offset; + UNSAFE.putByte(array, address , (byte) value ); + UNSAFE.putByte(array, address + 1, (byte) (value >> 8)); + UNSAFE.putByte(array, address + 2, (byte) (value >> 16)); + UNSAFE.putByte(array, address + 3, (byte) (value >> 24)); + UNSAFE.putByte(array, address + 4, (byte) (value >> 32)); + UNSAFE.putByte(array, address + 5, (byte) (value >> 40)); + UNSAFE.putByte(array, address + 6, (byte) (value >> 48)); + UNSAFE.putByte(array, address + 7, (byte) (value >> 56)); + } + + public static int getIntLU(byte[] array, int offset) { + final long address = Unsafe.ARRAY_BYTE_BASE_OFFSET + offset; + return ((UNSAFE.getByte(array, address ) & 0xff) ) + | ((UNSAFE.getByte(array, address + 1) & 0xff) << 8) + | ((UNSAFE.getByte(array, address + 2) & 0xff) << 16) + | ((UNSAFE.getByte(array, address + 3) & 0xff) << 24); + } + + public static char getCharB(byte[] val, int index) { + index <<= 1; + return (char)(((val[index ] & 0xff) << 8) + | ((val[index + 1] & 0xff))); + } + + public static char getCharBR(byte[] val, int index) { + return Character.reverseBytes(getCharB(val, index)); + } + + public static char getCharL(byte[] val, int index) { + index <<= 1; + return (char)(((val[index ] & 0xff)) + | ((val[index + 1] & 0xff) << 8)); + } + + public static
char getCharLR(byte[] val, int index) { + return Character.reverseBytes(getCharL(val, index)); + } + + public static char getCharBU(byte[] array, int offset) { + final long address = Unsafe.ARRAY_BYTE_BASE_OFFSET + (offset << 1); + return (char) (((UNSAFE.getByte(array, address ) & 0xff) << 8) + | ((UNSAFE.getByte(array, address + 1) & 0xff) )); + } + + public static char getCharLU(byte[] array, int offset) { + final long address = Unsafe.ARRAY_BYTE_BASE_OFFSET + (offset << 1); + return (char) (((UNSAFE.getByte(array, address ) & 0xff) ) + | ((UNSAFE.getByte(array, address + 1) & 0xff) << 8)); + } + + public void putChars4B(byte[] bytes, int offset) { + char c0 = 'n', c1 = 'u', c2 = 'l', c3 = 'l'; + putShortB(bytes, offset , c0); + putShortB(bytes, offset + 1, c1); + putShortB(bytes, offset + 2, c2); + putShortB(bytes, offset + 3, c3); + } + + public void putChars4BU(byte[] bytes, int offset) { + char c0 = 'n', c1 = 'u', c2 = 'l', c3 = 'l'; + putShortBU(bytes, offset , c0); + putShortBU(bytes, offset + 1, c1); + putShortBU(bytes, offset + 2, c2); + putShortBU(bytes, offset + 3, c3); + } + + public void putChars4BV(byte[] bytes, int offset) { + char c0 = 'n', c1 = 'u', c2 = 'l', c3 = 'l'; + offset <<= 1; + CHAR_B.set(bytes, offset , c0); + CHAR_B.set(bytes, offset + 2, c1); + CHAR_B.set(bytes, offset + 4, c2); + CHAR_B.set(bytes, offset + 6, c3); + } + + public void putChars4L(byte[] bytes, int offset) { + char c0 = 'n', c1 = 'u', c2 = 'l', c3 = 'l'; + putShortL(bytes, offset , c0); + putShortL(bytes, offset + 1, c1); + putShortL(bytes, offset + 2, c2); + putShortL(bytes, offset + 3, c3); + } + + public void putChars4LV(byte[] bytes, int offset) { + char c0 = 'n', c1 = 'u', c2 = 'l', c3 = 'l'; + offset <<= 1; + CHAR_L.set(bytes, offset , c0); + CHAR_L.set(bytes, offset + 2, c1); + CHAR_L.set(bytes, offset + 4, c2); + CHAR_L.set(bytes, offset + 6, c3); + } + + public void putChars4LU(byte[] bytes, int offset) { + char c0 = 'n', c1 = 'u', c2 = 'l', c3 = 'l'; + putShortLU(bytes, offset , c0); + putShortLU(bytes, offset + 1, c1); + putShortLU(bytes, offset + 2, c2); + putShortLU(bytes, offset + 3, c3); + } + + public void putChars4C(byte[] bytes, int offset) { + char c0 = 'n', c1 = 'u', c2 = 'l', c3 = 'l'; + final long address = Unsafe.ARRAY_BYTE_BASE_OFFSET + (offset << 1); + UNSAFE.putChar(bytes, address , c0); + UNSAFE.putChar(bytes, address + 2, c1); + UNSAFE.putChar(bytes, address + 4, c2); + UNSAFE.putChar(bytes, address + 6, c3); + } + + public void putChars4S(byte[] bytes, int offset) { + char c0 = 'n', c1 = 'u', c2 = 'l', c3 = 'l'; + final long address = Unsafe.ARRAY_BYTE_BASE_OFFSET + (offset << 1); + UNSAFE.putShort(bytes, address , (short) c0); + UNSAFE.putShort(bytes, address + 2, (short) c1); + UNSAFE.putShort(bytes, address + 4, (short) c2); + UNSAFE.putShort(bytes, address + 6, (short) c3); + } + + private static void putShortB(byte[] val, int index, int c) { + index <<= 1; + val[index ] = (byte)(c >> 8); + val[index + 1] = (byte)(c ); + } + + public static void putShortBU(byte[] array, int offset, int c) { + final long address = Unsafe.ARRAY_BYTE_BASE_OFFSET + (offset << 1); + UNSAFE.putByte(array, address , (byte) (c >> 8)); + UNSAFE.putByte(array, address + 1, (byte) (c )); + } + + private static void putShortL(byte[] val, int index, int c) { + index <<= 1; + val[index ] = (byte)(c ); + val[index + 1] = (byte)(c >> 8); + } + + public static void putShortLU(byte[] array, int offset, int c) { + final long address = Unsafe.ARRAY_BYTE_BASE_OFFSET + (offset << 1); + 
UNSAFE.putByte(array, address , (byte) (c )); + UNSAFE.putByte(array, address + 1, (byte) (c >> 8)); + } +} diff --git a/test/micro/org/openjdk/bench/vm/compiler/MergeStores.java b/test/micro/org/openjdk/bench/vm/compiler/MergeStores.java new file mode 100644 index 00000000000..4db148b454c --- /dev/null +++ b/test/micro/org/openjdk/bench/vm/compiler/MergeStores.java @@ -0,0 +1,780 @@ +/* + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package org.openjdk.bench.vm.compiler; + +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Warmup; + + +import jdk.internal.misc.Unsafe; +// import jdk.internal.util.ByteArrayLittleEndian; +import java.util.concurrent.TimeUnit; + +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.NANOSECONDS) +@Warmup(iterations = 2, time = 1) +@Measurement(iterations = 3, time = 1) +@Fork(value = 1, jvmArgs = { + "--add-exports", "java.base/jdk.internal.misc=ALL-UNNAMED", + "--add-exports", "java.base/jdk.internal.util=ALL-UNNAMED"}) +@State(Scope.Thread) +public class MergeStores { + + public static final int RANGE = 100; + + static Unsafe UNSAFE = Unsafe.getUnsafe(); + + @Param("1") + public static short vS; + + @Param("1") + public static int vI; + + @Param("1") + public static long vL; + + public static int offset = 5; + public static byte[] aB = new byte[RANGE]; + public static short[] aS = new short[RANGE]; + public static int[] aI = new int[RANGE]; + public static long native_adr = UNSAFE.allocateMemory(RANGE * 8); + + // ------------------------------------------- + // ------- Little-Endian API ---------- + // ------------------------------------------- + + // Store a short LE into an array using store bytes in an array + static void storeShortLE(byte[] bytes, int offset, short value) { + storeBytes(bytes, offset, (byte)(value >> 0), + (byte)(value >> 8)); + } + + // Store an int LE into an array using store bytes in an array + static void storeIntLE(byte[] bytes, int offset, int value) { + storeBytes(bytes,
offset, (byte)(value >> 0 ), + (byte)(value >> 8 ), + (byte)(value >> 16), + (byte)(value >> 24)); + } + + // Store a long LE into an array using store bytes in an array + static void storeLongLE(byte[] bytes, int offset, long value) { + storeBytes(bytes, offset, (byte)(value >> 0 ), + (byte)(value >> 8 ), + (byte)(value >> 16), + (byte)(value >> 24), + (byte)(value >> 32), + (byte)(value >> 40), + (byte)(value >> 48), + (byte)(value >> 56)); + } + + // Store 2 bytes into an array + static void storeBytes(byte[] bytes, int offset, byte b0, byte b1) { + bytes[offset + 0] = b0; + bytes[offset + 1] = b1; + } + + // Store 4 bytes into an array + static void storeBytes(byte[] bytes, int offset, byte b0, byte b1, byte b2, byte b3) { + bytes[offset + 0] = b0; + bytes[offset + 1] = b1; + bytes[offset + 2] = b2; + bytes[offset + 3] = b3; + } + + // Store 8 bytes into an array + static void storeBytes(byte[] bytes, int offset, byte b0, byte b1, byte b2, byte b3, + byte b4, byte b5, byte b6, byte b7) { + bytes[offset + 0] = b0; + bytes[offset + 1] = b1; + bytes[offset + 2] = b2; + bytes[offset + 3] = b3; + bytes[offset + 4] = b4; + bytes[offset + 5] = b5; + bytes[offset + 6] = b6; + bytes[offset + 7] = b7; + } + + // -------------------------------- BENCHMARKS -------------------------------- + + @Benchmark + public void baseline() { + } + + @Benchmark + public byte[] baseline_allocate() { + byte[] aB = new byte[RANGE]; + return aB; + } + + @Benchmark + public byte[] store_B2_con_adr0_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[0] = (byte)0x01; + aB[1] = (byte)0x02; + return aB; + } + + @Benchmark + public byte[] store_B2_con_adr1_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[1] = (byte)0x01; + aB[2] = (byte)0x02; + return aB; + } + + @Benchmark + public byte[] store_B2_con_offs_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[offset + 0] = (byte)0x01; + aB[offset + 1] = (byte)0x02; + return aB; + } + + @Benchmark + public byte[] store_B2_con_offs_allocate_unsafe() { + byte[] aB = new byte[RANGE]; + UNSAFE.putShortUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, (short)0x0201); + return aB; + } + + /* + @Benchmark + public byte[] store_B2_con_offs_allocate_bale() { + byte[] aB = new byte[RANGE]; + ByteArrayLittleEndian.setShort(aB, offset, (short)0x0201); + return aB; + } + */ + + @Benchmark + public byte[] store_B2_con_offs_allocate_leapi() { + byte[] aB = new byte[RANGE]; + storeShortLE(aB, offset, (short)0x0201); + return aB; + } + + @Benchmark + public byte[] store_B2_con_offs_nonalloc_direct() { + aB[offset + 0] = (byte)0x01; + aB[offset + 1] = (byte)0x02; + return aB; + } + + @Benchmark + public byte[] store_B2_con_offs_nonalloc_unsafe() { + UNSAFE.putShortUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, (short)0x0201); + return aB; + } + + /* + @Benchmark + public byte[] store_B2_con_offs_nonalloc_bale() { + ByteArrayLittleEndian.setShort(aB, offset, (short)0x0201); + return aB; + } + */ + + @Benchmark + public byte[] store_B2_con_offs_nonalloc_leapi() { + storeShortLE(aB, offset, (short)0x0201); + return aB; + } + + @Benchmark + public byte[] store_B2_S_offs_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[offset + 0] = (byte)(vS >> 0 ); + aB[offset + 1] = (byte)(vS >> 8 ); + return aB; + } + + @Benchmark + public byte[] store_B2_S_offs_allocate_unsafe() { + byte[] aB = new byte[RANGE]; + UNSAFE.putShortUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vS); + return aB; + } + + /* + @Benchmark + public byte[] store_B2_S_offs_allocate_bale() {
+ byte[] aB = new byte[RANGE]; + ByteArrayLittleEndian.setShort(aB, offset, vS); + return aB; + } + */ + + @Benchmark + public byte[] store_B2_S_offs_allocate_leapi() { + byte[] aB = new byte[RANGE]; + storeShortLE(aB, offset, vS); + return aB; + } + + @Benchmark + public byte[] store_B2_S_offs_nonalloc_direct() { + aB[offset + 0] = (byte)(vS >> 0 ); + aB[offset + 1] = (byte)(vS >> 8 ); + return aB; + } + + @Benchmark + public byte[] store_B2_S_offs_nonalloc_unsafe() { + UNSAFE.putShortUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vS); + return aB; + } + + /* + @Benchmark + public byte[] store_B2_S_offs_nonalloc_bale() { + ByteArrayLittleEndian.setShort(aB, offset, vS); + return aB; + } + */ + + @Benchmark + public byte[] store_B2_S_offs_nonalloc_leapi() { + storeShortLE(aB, offset, vS); + return aB; + } + + @Benchmark + public byte[] store_B4_con_adr0_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[0] = (byte)0x01; + aB[1] = (byte)0x02; + aB[2] = (byte)0x03; + aB[3] = (byte)0x04; + return aB; + } + + @Benchmark + public byte[] store_B4_con_adr1_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[1] = (byte)0x01; + aB[2] = (byte)0x02; + aB[3] = (byte)0x03; + aB[4] = (byte)0x04; + return aB; + } + + @Benchmark + public byte[] store_B4_con_offs_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[offset + 0] = (byte)0x01; + aB[offset + 1] = (byte)0x02; + aB[offset + 2] = (byte)0x03; + aB[offset + 3] = (byte)0x04; + return aB; + } + + @Benchmark + public byte[] store_B4_con_offs_allocate_unsafe() { + byte[] aB = new byte[RANGE]; + UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, 0x04030201); + return aB; + } + + /* + @Benchmark + public byte[] store_B4_con_offs_allocate_bale() { + byte[] aB = new byte[RANGE]; + ByteArrayLittleEndian.setInt(aB, offset, 0x04030201); + return aB; + } + */ + + @Benchmark + public byte[] store_B4_con_offs_allocate_leapi() { + byte[] aB = new byte[RANGE]; + storeIntLE(aB, offset, 0x04030201); + return aB; + } + + @Benchmark + public byte[] store_B4_con_offs_nonalloc_direct() { + aB[offset + 0] = (byte)0x01; + aB[offset + 1] = (byte)0x02; + aB[offset + 2] = (byte)0x03; + aB[offset + 3] = (byte)0x04; + return aB; + } + + @Benchmark + public byte[] store_B4_con_offs_nonalloc_unsafe() { + UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, 0x04030201); + return aB; + } + + /* + @Benchmark + public byte[] store_B4_con_offs_nonalloc_bale() { + ByteArrayLittleEndian.setInt(aB, offset, 0x04030201); + return aB; + } + */ + + @Benchmark + public byte[] store_B4_con_offs_nonalloc_leapi() { + storeIntLE(aB, offset, 0x04030201); + return aB; + } + + @Benchmark + public byte[] store_B4_I_offs_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[offset + 0] = (byte)(vI >> 0 ); + aB[offset + 1] = (byte)(vI >> 8 ); + aB[offset + 2] = (byte)(vI >> 16); + aB[offset + 3] = (byte)(vI >> 24); + return aB; + } + + @Benchmark + public byte[] store_B4_I_offs_allocate_unsafe() { + byte[] aB = new byte[RANGE]; + UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vI); + return aB; + } + + /* + @Benchmark + public byte[] store_B4_I_offs_allocate_bale() { + byte[] aB = new byte[RANGE]; + ByteArrayLittleEndian.setInt(aB, offset, vI); + return aB; + } + */ + + @Benchmark + public byte[] store_B4_I_offs_allocate_leapi() { + byte[] aB = new byte[RANGE]; + storeIntLE(aB, offset, vI); + return aB; + } + + @Benchmark + public byte[] store_B4_I_offs_nonalloc_direct() { + aB[offset + 0] = (byte)(vI >> 0 ); + aB[offset + 1] = 
(byte)(vI >> 8 ); + aB[offset + 2] = (byte)(vI >> 16); + aB[offset + 3] = (byte)(vI >> 24); + return aB; + } + + @Benchmark + public byte[] store_B4_I_offs_nonalloc_unsafe() { + UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vI); + return aB; + } + + /* + @Benchmark + public byte[] store_B4_I_offs_nonalloc_bale() { + ByteArrayLittleEndian.setInt(aB, offset, vI); + return aB; + } + */ + + @Benchmark + public byte[] store_B4_I_offs_nonalloc_leapi() { + storeIntLE(aB, offset, vI); + return aB; + } + + @Benchmark + public byte[] store_B8_con_adr0_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[0] = (byte)0x01; + aB[1] = (byte)0x02; + aB[2] = (byte)0x03; + aB[3] = (byte)0x04; + aB[4] = (byte)0x05; + aB[5] = (byte)0x06; + aB[6] = (byte)0x07; + aB[7] = (byte)0x08; + return aB; + } + + @Benchmark + public byte[] store_B8_con_adr1_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[1] = (byte)0x01; + aB[2] = (byte)0x02; + aB[3] = (byte)0x03; + aB[4] = (byte)0x04; + aB[5] = (byte)0x05; + aB[6] = (byte)0x06; + aB[7] = (byte)0x07; + aB[8] = (byte)0x08; + return aB; + } + + @Benchmark + public byte[] store_B8_con_offs_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[offset + 0] = (byte)0x01; + aB[offset + 1] = (byte)0x02; + aB[offset + 2] = (byte)0x03; + aB[offset + 3] = (byte)0x04; + aB[offset + 4] = (byte)0x05; + aB[offset + 5] = (byte)0x06; + aB[offset + 6] = (byte)0x07; + aB[offset + 7] = (byte)0x08; + return aB; + } + + @Benchmark + public byte[] store_B8_con_offs_allocate_unsafe() { + byte[] aB = new byte[RANGE]; + UNSAFE.putLongUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, 0x0807060504030201L); + return aB; + } + + /* + @Benchmark + public byte[] store_B8_con_offs_allocate_bale() { + byte[] aB = new byte[RANGE]; + ByteArrayLittleEndian.setLong(aB, offset, 0x0807060504030201L); + return aB; + } + */ + + @Benchmark + public byte[] store_B8_con_offs_allocate_leapi() { + byte[] aB = new byte[RANGE]; + storeLongLE(aB, offset, 0x0807060504030201L); + return aB; + } + + @Benchmark + public byte[] store_B8_con_offs_nonalloc_direct() { + aB[offset + 0] = (byte)0x01; + aB[offset + 1] = (byte)0x02; + aB[offset + 2] = (byte)0x03; + aB[offset + 3] = (byte)0x04; + aB[offset + 4] = (byte)0x05; + aB[offset + 5] = (byte)0x06; + aB[offset + 6] = (byte)0x07; + aB[offset + 7] = (byte)0x08; + return aB; + } + + @Benchmark + public byte[] store_B8_con_offs_nonalloc_unsafe() { + UNSAFE.putLongUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, 0x0807060504030201L); + return aB; + } + + /* + @Benchmark + public byte[] store_B8_con_offs_nonalloc_bale() { + ByteArrayLittleEndian.setLong(aB, offset, 0x0807060504030201L); + return aB; + } + */ + + @Benchmark + public byte[] store_B8_con_offs_nonalloc_leapi() { + storeLongLE(aB, offset, 0x0807060504030201L); + return aB; + } + + @Benchmark + public byte[] store_B8_L_offs_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[offset + 0] = (byte)(vL >> 0 ); + aB[offset + 1] = (byte)(vL >> 8 ); + aB[offset + 2] = (byte)(vL >> 16); + aB[offset + 3] = (byte)(vL >> 24); + aB[offset + 4] = (byte)(vL >> 32); + aB[offset + 5] = (byte)(vL >> 40); + aB[offset + 6] = (byte)(vL >> 48); + aB[offset + 7] = (byte)(vL >> 56); + return aB; + } + + @Benchmark + public byte[] store_B8_L_offs_allocate_unsafe() { + byte[] aB = new byte[RANGE]; + UNSAFE.putLongUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vL); + return aB; + } + + /* + @Benchmark + public byte[] store_B8_L_offs_allocate_bale() { + byte[] aB = new byte[RANGE]; + 
ByteArrayLittleEndian.setLong(aB, offset, vL); + return aB; + } + */ + + @Benchmark + public byte[] store_B8_L_offs_allocate_leapi() { + byte[] aB = new byte[RANGE]; + storeLongLE(aB, offset, vL); + return aB; + } + + @Benchmark + public byte[] store_B8_L_offs_nonalloc_direct() { + aB[offset + 0] = (byte)(vL >> 0 ); + aB[offset + 1] = (byte)(vL >> 8 ); + aB[offset + 2] = (byte)(vL >> 16); + aB[offset + 3] = (byte)(vL >> 24); + aB[offset + 4] = (byte)(vL >> 32); + aB[offset + 5] = (byte)(vL >> 40); + aB[offset + 6] = (byte)(vL >> 48); + aB[offset + 7] = (byte)(vL >> 56); + return aB; + } + + @Benchmark + public byte[] store_B8_L_offs_nonalloc_unsafe() { + UNSAFE.putLongUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vL); + return aB; + } + + /* + @Benchmark + public byte[] store_B8_L_offs_nonalloc_bale() { + ByteArrayLittleEndian.setLong(aB, offset, vL); + return aB; + } + */ + + @Benchmark + public byte[] store_B8_L_offs_nonalloc_leapi() { + storeLongLE(aB, offset, vL); + return aB; + } + + @Benchmark + public byte[] store_B8_I2_offs_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[offset + 0] = (byte)(vI >> 0 ); + aB[offset + 1] = (byte)(vI >> 8 ); + aB[offset + 2] = (byte)(vI >> 16); + aB[offset + 3] = (byte)(vI >> 24); + aB[offset + 4] = (byte)(vI >> 0 ); + aB[offset + 5] = (byte)(vI >> 8 ); + aB[offset + 6] = (byte)(vI >> 16); + aB[offset + 7] = (byte)(vI >> 24); + return aB; + } + + @Benchmark + public byte[] store_B8_I2_offs_allocate_unsafe() { + byte[] aB = new byte[RANGE]; + UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 0, vI); + UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 4, vI); + return aB; + } + + /* + @Benchmark + public byte[] store_B8_I2_offs_allocate_bale() { + byte[] aB = new byte[RANGE]; + ByteArrayLittleEndian.setInt(aB, offset + 0, vI); + ByteArrayLittleEndian.setInt(aB, offset + 4, vI); + return aB; + } + */ + + @Benchmark + public byte[] store_B8_I2_offs_allocate_leapi() { + byte[] aB = new byte[RANGE]; + storeIntLE(aB, offset + 0, vI); + storeIntLE(aB, offset + 4, vI); + return aB; + } + + @Benchmark + public byte[] store_B8_I2_offs_nonalloc_direct() { + aB[offset + 0] = (byte)(vI >> 0 ); + aB[offset + 1] = (byte)(vI >> 8 ); + aB[offset + 2] = (byte)(vI >> 16); + aB[offset + 3] = (byte)(vI >> 24); + aB[offset + 4] = (byte)(vI >> 0 ); + aB[offset + 5] = (byte)(vI >> 8 ); + aB[offset + 6] = (byte)(vI >> 16); + aB[offset + 7] = (byte)(vI >> 24); + return aB; + } + + @Benchmark + public byte[] store_B8_I2_offs_nonalloc_unsafe() { + UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 0, vI); + UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 4, vI); + return aB; + } + + /* + @Benchmark + public byte[] store_B8_I2_offs_nonalloc_bale() { + ByteArrayLittleEndian.setInt(aB, offset + 0, vI); + ByteArrayLittleEndian.setInt(aB, offset + 4, vI); + return aB; + } + */ + + @Benchmark + public byte[] store_B8_I2_offs_nonalloc_leapi() { + storeIntLE(aB, offset + 0, vI); + storeIntLE(aB, offset + 4, vI); + return aB; + } + + @Benchmark + public short[] store_S2_con_offs_allocate_direct() { + short[] aS = new short[RANGE]; + aS[offset + 0] = (short)0x0102; + aS[offset + 1] = (short)0x0304; + return aS; + } + + @Benchmark + public short[] store_S2_con_offs_nonalloc_direct() { + aS[offset + 0] = (short)0x0102; + aS[offset + 1] = (short)0x0304; + return aS; + } + + @Benchmark + public short[] store_S4_con_offs_allocate_direct() { + short[] aS = new short[RANGE]; + aS[offset + 0] =
(short)0x0102; + aS[offset + 1] = (short)0x0304; + aS[offset + 2] = (short)0x0506; + aS[offset + 3] = (short)0x0708; + return aS; + } + + @Benchmark + public short[] store_S4_con_offs_nonalloc_direct() { + aS[offset + 0] = (short)0x0102; + aS[offset + 1] = (short)0x0304; + aS[offset + 2] = (short)0x0506; + aS[offset + 3] = (short)0x0708; + return aS; + } + + @Benchmark + public int[] store_I2_con_offs_allocate_direct() { + int[] aI = new int[RANGE]; + aI[offset + 0] = 0x01020304; + aI[offset + 1] = 0x05060708; + return aI; + } + + @Benchmark + public int[] store_I2_con_offs_nonalloc_direct() { + aI[offset + 0] = 0x01020304; + aI[offset + 1] = 0x05060708; + return aI; + } + + @Benchmark + public int[] store_I2_zero_offs_allocate_direct() { + int[] aI = new int[RANGE]; + aI[offset + 0] = 0; + aI[offset + 1] = 0; + return aI; + } + + @Benchmark + public int[] store_I2_zero_offs_nonalloc_direct() { + aI[offset + 0] = 0; + aI[offset + 1] = 0; + return aI; + } + + @Benchmark + public void store_unsafe_B8_L_offs_noalloc_direct() { + UNSAFE.putByte(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 0, (byte)(vL >> 0 )); + UNSAFE.putByte(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 1, (byte)(vL >> 8 )); + UNSAFE.putByte(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 2, (byte)(vL >> 16)); + UNSAFE.putByte(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 3, (byte)(vL >> 24)); + UNSAFE.putByte(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 4, (byte)(vL >> 32)); + UNSAFE.putByte(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 5, (byte)(vL >> 40)); + UNSAFE.putByte(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 6, (byte)(vL >> 48)); + UNSAFE.putByte(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 7, (byte)(vL >> 56)); + } + + @Benchmark + public void store_unsafe_B8_L_offs_noalloc_unsafe() { + UNSAFE.putLongUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 0, vL); + } + + @Benchmark + public void store_unsafe_C4_L_offs_noalloc_direct() { + UNSAFE.putChar(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 0, (char)(vL >> 0 )); + UNSAFE.putChar(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 2, (char)(vL >> 16)); + UNSAFE.putChar(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 4, (char)(vL >> 32)); + UNSAFE.putChar(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 6, (char)(vL >> 48)); + } + + @Benchmark + public void store_unsafe_native_B8_L_offs_noalloc_direct() { + UNSAFE.putByte(null, native_adr + offset + 0, (byte)(vL >> 0 )); + UNSAFE.putByte(null, native_adr + offset + 1, (byte)(vL >> 8 )); + UNSAFE.putByte(null, native_adr + offset + 2, (byte)(vL >> 16)); + UNSAFE.putByte(null, native_adr + offset + 3, (byte)(vL >> 24)); + UNSAFE.putByte(null, native_adr + offset + 4, (byte)(vL >> 32)); + UNSAFE.putByte(null, native_adr + offset + 5, (byte)(vL >> 40)); + UNSAFE.putByte(null, native_adr + offset + 6, (byte)(vL >> 48)); + UNSAFE.putByte(null, native_adr + offset + 7, (byte)(vL >> 56)); + } + + @Benchmark + public void store_unsafe_native_C4_L_offs_noalloc_direct() { + UNSAFE.putChar(null, native_adr + offset + 0, (char)(vL >> 0 )); + UNSAFE.putChar(null, native_adr + offset + 2, (char)(vL >> 16)); + UNSAFE.putChar(null, native_adr + offset + 4, (char)(vL >> 32)); + UNSAFE.putChar(null, native_adr + offset + 6, (char)(vL >> 48)); + } + + @Benchmark + public void store_unsafe_native_B8_L_offs_noalloc_unsafe() { + UNSAFE.putLongUnaligned(null, native_adr + offset + 0, vL); + } + + @Fork(value = 1, jvmArgsPrepend = { + "-XX:+UnlockDiagnosticVMOptions", "-XX:-MergeStores" + }) + public static class MergeStoresDisabled extends MergeStores 
{} +}