Skip to content

Commit

Permalink
[Backport] Part of 8252848: Optimize small primitive arrayCopy operations through partial inlining using AVX-512 masked instructions
Browse files: Browse the repository at this point in the history

Summary: Introduce part of JDK-8252848. In Dragonwell 11, we do not want to optimize small primitive arrayCopy operations through partial inlining using AVX-512 masked instructions, so C2_MacroAssembler::genmask(Register dst, Register len, Register temp) has an empty body.

Test Plan: ci jtreg

Reviewed-by: JoshuaZhuwj

Issue: #701
  • Loading branch information
JinZhonghui authored Oct 30, 2023
1 parent e41131b commit 6a0328a
Show file tree
Hide file tree
Showing 16 changed files with 275 additions and 5 deletions.
16 changes: 16 additions & 0 deletions src/hotspot/cpu/x86/assembler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2815,6 +2815,22 @@ void Assembler::evmovdqub(XMMRegister dst, KRegister mask, Address src, bool mer
emit_operand(dst, src);
}

// Masked EVEX-encoded byte vector store: VMOVDQU8 [dst]{k<mask>}, src.
// Writes only the byte lanes of `src` selected by the opmask register `mask`
// to memory at `dst`. `merge` selects merge-masking (unselected destination
// bytes in memory are left untouched) instead of the default zeroing context.
// `vector_len` is the EVEX vector-length encoding (128/256/512 bits).
void Assembler::evmovdqub(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
// Byte-granularity masking needs AVX512BW, and AVX512VL for sub-512-bit lengths.
assert(VM_Version::supports_avx512vlbw(), "");
assert(src != xnoreg, "sanity");
InstructionMark im(this);
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
attributes.set_embedded_opmask_register_specifier(mask);
attributes.set_is_evex_instruction();
if (merge) {
// Merge-masking: drop the EVEX zeroing (clear) context so unselected lanes survive.
attributes.reset_is_clear_context();
}
// F2-prefixed 0F 7F is the store (memory-destination) form of VMOVDQU8.
vex_prefix(dst, 0, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
emit_int8(0x7F);
emit_operand(src, dst);
}

void Assembler::evmovdquw(XMMRegister dst, Address src, bool merge, int vector_len) {
assert(VM_Version::supports_evex(), "");
InstructionMark im(this);
Expand Down
1 change: 1 addition & 0 deletions src/hotspot/cpu/x86/assembler_x86.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1542,6 +1542,7 @@ class Assembler : public AbstractAssembler {
void evmovdqub(XMMRegister dst, Address src, bool merge, int vector_len);
void evmovdqub(XMMRegister dst, XMMRegister src, bool merge, int vector_len);
void evmovdqub(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
void evmovdqub(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
void evmovdquw(Address dst, XMMRegister src, bool merge, int vector_len);
void evmovdquw(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
void evmovdquw(XMMRegister dst, Address src, bool merge, int vector_len);
Expand Down
25 changes: 25 additions & 0 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -987,6 +987,22 @@ void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMReg
reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

// Generate in `dst` a 64-bit lane mask covering `len` elements, for use as a
// LoadVectorMasked/StoreVectorMasked mask operand.
// Intentionally a no-op in this backport: only part of JDK-8252848 is
// introduced, and Dragonwell 11 does not perform partial inlining of small
// primitive arraycopy via AVX-512 masked instructions. The block comment
// below preserves the upstream implementation for reference; per the commit
// message it is expected to change with JDK-8261553 and JDK-8262355.
// NOTE(review): if VectorMaskGen nodes were ever matched, vmask_gen would
// emit nothing and leave `dst` undefined — confirm the partial-inlining path
// stays disabled while this body is empty.
void C2_MacroAssembler::genmask(Register dst, Register len, Register temp) {
/* Not introduce full 8252848, will be changed in JDK-8261553 and JDK-8262355
if (ArrayCopyPartialInlineSize <= 32) {
mov64(dst, 1);
shlxq(dst, dst, len);
decq(dst);
} else {
mov64(dst, -1);
movq(temp, len);
negptr(temp);
addptr(temp, 64);
shrxq(dst, dst, temp);
}
*/
}
#endif // _LP64

void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
Expand Down Expand Up @@ -1033,6 +1049,15 @@ void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, X
reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
}

// C2 wrapper: masked unaligned vector load — only lanes selected by `kmask`
// are loaded from `src` into `dst`. Delegates to MacroAssembler::evmovdqu.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
}

// C2 wrapper: masked unaligned vector store — only lanes selected by `kmask`
// are written from `src` to memory at `dst`. Delegates to MacroAssembler::evmovdqu.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
}


void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
XMMRegister dst, XMMRegister src,
XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
Expand Down
4 changes: 4 additions & 0 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@
void evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len);
void evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len);

void evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len);
void evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len);

// extract
void extract(BasicType typ, Register dst, XMMRegister src, int idx);
XMMRegister get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex);
Expand All @@ -90,6 +93,7 @@
void reduceI(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
#ifdef _LP64
void reduceL(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
void genmask(Register dst, Register len, Register temp);
#endif // _LP64

// dst = reduce(op, src2) using vtmp as temps
Expand Down
49 changes: 49 additions & 0 deletions src/hotspot/cpu/x86/macroAssembler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10578,6 +10578,55 @@ void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len
bind(done);
}


// Masked unaligned vector load, dispatched on element type: loads only the
// lanes of `src` selected by `kmask` into `dst`, using the element-width
// variant of VMOVDQU (8/16/32/64-bit). The merge flag is false for all
// widths, i.e. the zero-masking form is used on loads.
// Aborts via fatal() on a BasicType with no vector-move mapping.
void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
  if (type == T_BYTE || type == T_BOOLEAN) {
    evmovdqub(dst, kmask, src, false, vector_len);
  } else if (type == T_CHAR || type == T_SHORT) {
    evmovdquw(dst, kmask, src, false, vector_len);
  } else if (type == T_INT || type == T_FLOAT) {
    evmovdqul(dst, kmask, src, false, vector_len);
  } else if (type == T_LONG || type == T_DOUBLE) {
    evmovdquq(dst, kmask, src, false, vector_len);
  } else {
    fatal("Unexpected type argument %s", type2name(type));
  }
}

// Masked unaligned vector store, dispatched on element type: writes only the
// lanes of `src` selected by `kmask` to memory at `dst`, using the
// element-width variant of VMOVDQU (8/16/32/64-bit). The merge flag is true
// for all widths — unselected memory lanes are left untouched.
// Aborts via fatal() on a BasicType with no vector-move mapping.
void MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
  if (type == T_BYTE || type == T_BOOLEAN) {
    evmovdqub(dst, kmask, src, true, vector_len);
  } else if (type == T_CHAR || type == T_SHORT) {
    evmovdquw(dst, kmask, src, true, vector_len);
  } else if (type == T_INT || type == T_FLOAT) {
    evmovdqul(dst, kmask, src, true, vector_len);
  } else if (type == T_LONG || type == T_DOUBLE) {
    evmovdquq(dst, kmask, src, true, vector_len);
  } else {
    fatal("Unexpected type argument %s", type2name(type));
  }
}

Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
switch (cond) {
// Note some conditions are synonyms for others
Expand Down
4 changes: 4 additions & 0 deletions src/hotspot/cpu/x86/macroAssembler_x86.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1129,10 +1129,14 @@ class MacroAssembler: public Assembler {
void vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg = rscratch1);

// AVX512 Unaligned
void evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len);
void evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len);

void evmovdqub(Address dst, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdqub(dst, src, merge, vector_len); }
void evmovdqub(XMMRegister dst, Address src, bool merge, int vector_len) { Assembler::evmovdqub(dst, src, merge, vector_len); }
void evmovdqub(XMMRegister dst, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdqub(dst, src, merge, vector_len); }
void evmovdqub(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len) { Assembler::evmovdqub(dst, mask, src, merge, vector_len); }
void evmovdqub(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdqub(dst, mask, src, merge, vector_len); }
void evmovdqub(XMMRegister dst, KRegister mask, AddressLiteral src, bool merge, int vector_len, Register scratch_reg);

void evmovdquw(Address dst, XMMRegister src, bool merge, int vector_len) { Assembler::evmovdquw(dst, src, merge, vector_len); }
Expand Down
65 changes: 65 additions & 0 deletions src/hotspot/cpu/x86/x86.ad
Original file line number Diff line number Diff line change
Expand Up @@ -1624,6 +1624,13 @@ const bool Matcher::match_rule_supported(int opcode) {
return false;
}
break;
case Op_VectorMaskGen:
case Op_LoadVectorMasked:
case Op_StoreVectorMasked:
if (UseAVX < 3) {
return false;
}
break;
case Op_SqrtF:
if (UseSSE < 1) {
return false;
Expand Down Expand Up @@ -1692,6 +1699,16 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
return false;
}
break;
case Op_VectorMaskGen:
case Op_LoadVectorMasked:
case Op_StoreVectorMasked:
if (!VM_Version::supports_avx512bw()) {
return false;
}
if ((size_in_bits != 512) && !VM_Version::supports_avx512vl()) {
return false;
}
break;
case Op_CMoveVD:
if (vlen != 4) {
return false; // implementation limitation (only vcmov4D_reg is present)
Expand Down Expand Up @@ -7887,3 +7904,51 @@ instruct vpternlog_mem(vec dst, vec src2, memory src3, immU8 func) %{
%}
ins_pipe( pipe_slow );
%}

#ifdef _LP64
// ---------------------------------- Masked Block Copy ------------------------------------

// Masked vector load: load into $dst only the lanes selected by the 64-bit
// integer lane mask in GPR $mask.
// NOTE(review): clobbers the fixed opmask register k2 (this backport has no
// KRegister allocation) — confirm no surrounding code relies on k2.
instruct vmasked_load64(vec dst, memory mem, rRegL mask) %{
match(Set dst (LoadVectorMasked mem mask));
format %{ "vector_masked_load $dst, $mem, $mask \t! vector masked copy" %}
ins_encode %{
// Element type and vector length are taken from this node's vector type.
BasicType elmType = this->bottom_type()->is_vect()->element_basic_type();
int vector_len = vector_length_encoding(this);
__ kmovql(k2, $mask$$Register);
__ evmovdqu(elmType, k2, $dst$$XMMRegister, $mem$$Address, vector_len);
%}
ins_pipe( pipe_slow );
%}

// Generate a 64-bit lane mask in $dst for $len elements (variable length).
// NOTE(review): C2_MacroAssembler::genmask is an empty stub in this backport,
// so this pattern currently emits no code and leaves $dst undefined; it is
// presumably unreachable while arraycopy partial inlining stays disabled —
// confirm before enabling VectorMaskGen generation.
instruct vmask_gen(rRegL dst, rRegL len, rRegL tempLen) %{
match(Set dst (VectorMaskGen len));
effect(TEMP_DEF dst, TEMP tempLen);
format %{ "vector_mask_gen $len \t! vector mask generator" %}
ins_encode %{
__ genmask($dst$$Register, $len$$Register, $tempLen$$Register);
%}
ins_pipe( pipe_slow );
%}

// Generate a 64-bit lane mask in $dst for a compile-time constant length:
// loads the constant ((1 << len) - 1), computed as all-ones >> (64 - len).
// Assumes 1 <= $len$$constant <= 64; a zero constant would shift by 64,
// which is undefined in C++ — TODO confirm the matcher never produces len 0.
instruct vmask_gen_imm(rRegL dst, immL len) %{
match(Set dst (VectorMaskGen len));
format %{ "vector_mask_gen $len \t! vector mask generator" %}
ins_encode %{
__ mov64($dst$$Register, (0xFFFFFFFFFFFFFFFFUL >> (64 -$len$$constant)));
%}
ins_pipe( pipe_slow );
%}

// Masked vector store: write to $mem only the lanes of $src selected by the
// 64-bit integer lane mask in GPR $mask (src and mask arrive as a Binary
// pair, see the Op_StoreVectorMasked rewrite in matcher.cpp).
// NOTE(review): clobbers the fixed opmask register k2 (no KRegister
// allocation in this backport) — confirm no surrounding code relies on k2.
instruct vmasked_store64(memory mem, vec src, rRegL mask) %{
match(Set mem (StoreVectorMasked mem (Binary src mask)));
format %{ "vector_masked_store $mem, $src, $mask \t! vector masked store" %}
ins_encode %{
// Element type/length come from the source vector operand, not this store node.
const MachNode* src_node = static_cast<const MachNode*>(this->in(this->operand_index($src)));
BasicType elmType = src_node->bottom_type()->is_vect()->element_basic_type();
int vector_len = vector_length_encoding(src_node);
__ kmovql(k2, $mask$$Register);
__ evmovdqu(elmType, k2, $mem$$Address, $src$$XMMRegister, vector_len);
%}
ins_pipe( pipe_slow );
%}
#endif // _LP64
2 changes: 2 additions & 0 deletions src/hotspot/share/adlc/forms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,7 @@ Form::DataType Form::is_load_from_memory(const char *opType) const {
if( strcmp(opType,"LoadS")==0 ) return Form::idealS;
if( strcmp(opType,"LoadVector")==0 ) return Form::idealV;
if( strcmp(opType,"LoadVectorGather")==0 ) return Form::idealV;
if( strcmp(opType,"LoadVectorMasked")==0 ) return Form::idealV;
assert( strcmp(opType,"Load") != 0, "Must type Loads" );
return Form::none;
}
Expand All @@ -284,6 +285,7 @@ Form::DataType Form::is_store_to_memory(const char *opType) const {
if( strcmp(opType,"StoreNKlass")==0) return Form::idealNKlass;
if( strcmp(opType,"StoreVector")==0 ) return Form::idealV;
if( strcmp(opType,"StoreVectorScatter")==0 ) return Form::idealV;
if( strcmp(opType,"StoreVectorMasked")==0 ) return Form::idealV;
assert( strcmp(opType,"Store") != 0, "Must type Stores" );
return Form::none;
}
Expand Down
7 changes: 4 additions & 3 deletions src/hotspot/share/adlc/formssel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -779,8 +779,9 @@ bool InstructForm::captures_bottom_type(FormDict &globals) const {
!strcmp(_matrule->_rChild->_opType,"ShenandoahCompareAndExchangeN") ||
#endif
!strcmp(_matrule->_rChild->_opType,"StrInflatedCopy") ||
!strcmp(_matrule->_rChild->_opType,"VectorMaskGen")||
!strcmp(_matrule->_rChild->_opType,"CompareAndExchangeP") ||
!strcmp(_matrule->_rChild->_opType,"CompareAndExchangeN"))) return true;
!strcmp(_matrule->_rChild->_opType,"CompareAndExchangeN"))) return true;
else if ( is_ideal_load() == Form::idealP ) return true;
else if ( is_ideal_store() != Form::none ) return true;

Expand Down Expand Up @@ -3511,7 +3512,7 @@ int MatchNode::needs_ideal_memory_edge(FormDict &globals) const {
"StoreB","StoreC","Store" ,"StoreFP",
"LoadI", "LoadL", "LoadP" ,"LoadN", "LoadD" ,"LoadF" ,
"LoadB" , "LoadUB", "LoadUS" ,"LoadS" ,"Load" ,
"StoreVector", "LoadVector", "LoadVectorGather", "StoreVectorScatter",
"StoreVector", "LoadVector", "LoadVectorGather", "StoreVectorScatter", "LoadVectorMasked", "StoreVectorMasked",
"LoadRange", "LoadKlass", "LoadNKlass", "LoadL_unaligned", "LoadD_unaligned",
"LoadPLocked",
"StorePConditional", "StoreIConditional", "StoreLConditional",
Expand Down Expand Up @@ -4195,7 +4196,7 @@ bool MatchRule::is_vector() const {
"VectorRearrange","VectorLoadShuffle", "VectorLoadConst",
"VectorCastB2X", "VectorCastS2X", "VectorCastI2X",
"VectorCastL2X", "VectorCastF2X", "VectorCastD2X",
"VectorMaskWrapper", "VectorMaskCmp", "VectorReinterpret",
"VectorMaskWrapper", "VectorMaskCmp", "VectorReinterpret","LoadVectorMasked","StoreVectorMasked",
"FmaVD", "FmaVF","PopCountVI",
// Next are not supported currently.
"PackB","PackS","PackI","PackL","PackF","PackD","Pack2L","Pack2D",
Expand Down
3 changes: 3 additions & 0 deletions src/hotspot/share/opto/classes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,9 @@ macro(LoadVector)
macro(LoadVectorGather)
macro(StoreVector)
macro(StoreVectorScatter)
macro(LoadVectorMasked)
macro(StoreVectorMasked)
macro(VectorMaskGen)
macro(Pack)
macro(PackB)
macro(PackS)
Expand Down
3 changes: 3 additions & 0 deletions src/hotspot/share/opto/compile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3743,6 +3743,9 @@ void Compile::final_graph_reshaping_impl( Node *n, Final_Reshape_Counts &frc) {
case Op_StoreVector:
case Op_LoadVectorGather:
case Op_StoreVectorScatter:
case Op_VectorMaskGen:
case Op_LoadVectorMasked:
case Op_StoreVectorMasked:
break;

case Op_AddReductionVI:
Expand Down
1 change: 1 addition & 0 deletions src/hotspot/share/opto/lcm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -718,6 +718,7 @@ void PhaseCFG::adjust_register_pressure(Node* n, Block* block, intptr_t* recalc_
case Op_StoreN:
case Op_StoreVector:
case Op_StoreVectorScatter:
case Op_StoreVectorMasked:
case Op_StoreNKlass:
for (uint k = 1; k < m->req(); k++) {
Node *in = m->in(k);
Expand Down
7 changes: 7 additions & 0 deletions src/hotspot/share/opto/matcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2209,6 +2209,7 @@ void Matcher::find_shared( Node *n ) {
case Op_FmaVD:
case Op_FmaVF:
case Op_MacroLogicV:
case Op_LoadVectorMasked:
case Op_ThreadRefetch: // This must be added, otherwise we couldn't match the ThreadRefetchNode.
set_shared(n); // Force result into register (it will be anyways)
break;
Expand Down Expand Up @@ -2379,6 +2380,12 @@ void Matcher::find_shared( Node *n ) {
n->del_req(3);
break;
}
case Op_StoreVectorMasked: {
Node* pair = new BinaryNode(n->in(3), n->in(4));
n->set_req(3, pair);
n->del_req(4);
break;
}
case Op_LoopLimit: {
Node *pair1 = new BinaryNode(n->in(1),n->in(2));
n->set_req(1,pair1);
Expand Down
8 changes: 6 additions & 2 deletions src/hotspot/share/opto/node.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,8 @@ class TypeNode;
class UnlockNode;
class VectorNode;
class LoadVectorNode;
class LoadVectorMaskedNode;
class StoreVectorMaskedNode;
class LoadVectorGatherNode;
class StoreVectorNode;
class StoreVectorScatterNode;
Expand Down Expand Up @@ -699,13 +701,15 @@ class Node {
DEFINE_CLASS_ID(Parm, Proj, 4)
DEFINE_CLASS_ID(MachProj, Proj, 5)

DEFINE_CLASS_ID(Mem, Node, 4)
DEFINE_CLASS_ID(Load, Mem, 0)
DEFINE_CLASS_ID(Mem, Node, 4)
DEFINE_CLASS_ID(Load, Mem, 0)
DEFINE_CLASS_ID(LoadVector, Load, 0)
DEFINE_CLASS_ID(LoadVectorGather, LoadVector, 0)
DEFINE_CLASS_ID(LoadVectorMasked, LoadVector, 1)
DEFINE_CLASS_ID(Store, Mem, 1)
DEFINE_CLASS_ID(StoreVector, Store, 0)
DEFINE_CLASS_ID(StoreVectorScatter, StoreVector, 0)
DEFINE_CLASS_ID(StoreVectorMasked, StoreVector, 1)
DEFINE_CLASS_ID(LoadStore, Mem, 2)
DEFINE_CLASS_ID(LoadStoreConditional, LoadStore, 0)
DEFINE_CLASS_ID(CompareAndSwap, LoadStoreConditional, 0)
Expand Down
Loading

0 comments on commit 6a0328a

Please sign in to comment.