Skip to content

Commit

Permalink
[Backport] 8262355: Support for AVX-512 opmask register allocation.
Browse files Browse the repository at this point in the history
Summary: 8262355: Support for AVX-512 opmask register allocation.
	Add VectorMaskGen, LoadVectorMasked and StoreVectorMasked related code.

Test Plan: ci jtreg

Reviewed-by: JoshuaZhuwj

Issue: #607
  • Loading branch information
JinZhonghui committed Oct 27, 2023
1 parent e81c508 commit c85ad35
Show file tree
Hide file tree
Showing 37 changed files with 1,883 additions and 233 deletions.
32 changes: 32 additions & 0 deletions src/hotspot/cpu/x86/assembler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2495,6 +2495,22 @@ void Assembler::kmovwl(KRegister dst, Address src) {
emit_operand((Register)dst, src);
}

// Emits the memory-destination form of kmovw: a VEX prefix built from the
// destination address and the source opmask encoding, opcode byte 0x91, then
// the memory operand. Asserts EVEX support.
void Assembler::kmovwl(Address dst, KRegister src) {
  assert(VM_Version::supports_evex(), "");
  InstructionMark mark(this);
  InstructionAttr attrs(AVX_128bit,
                        /* vex_w */ false,
                        /* legacy_mode */ true,
                        /* no_mask_reg */ true,
                        /* uses_vl */ false);
  const int src_enc = src->encoding();
  vex_prefix(dst, 0, src_enc, VEX_SIMD_NONE, VEX_OPCODE_0F, &attrs);
  emit_int8((unsigned char)0x91);
  // The opmask register is passed through the general-register operand
  // emitter; only its encoding is relevant for the ModRM reg field.
  emit_operand((Register)src, dst);
}

// Emits the register-to-register form of kmovw: VEX prefix, opcode byte 0x90
// and a register-direct ModRM byte (0xC0 | encode).
//
// The 16-bit opmask move belongs to the AVX-512 foundation instructions, so
// the precondition is EVEX support -- the same check used by the
// kmovwl(Address, KRegister) store form -- rather than AVX512BW. Requiring
// AVX512BW here would trip the assert on hardware that has opmask registers
// but lacks the byte/word extension.
void Assembler::kmovwl(KRegister dst, KRegister src) {
  assert(VM_Version::supports_evex(), "");
  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
  int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F, &attributes);
  emit_int16((unsigned char)0x90, (0xC0 | encode));
}

void Assembler::kmovdl(KRegister dst, Register src) {
assert(VM_Version::supports_avx512bw(), "");
InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
Expand Down Expand Up @@ -2815,6 +2831,22 @@ void Assembler::evmovdqub(XMMRegister dst, KRegister mask, Address src, bool mer
emit_operand(dst, src);
}

// Emits the store form of evmovdqub (opcode 0x7F under the F2 0F prefix) with
// `mask` installed as the embedded opmask register.
//
//   dst        - destination memory operand
//   mask       - opmask register recorded in the instruction attributes
//   src        - source vector register; must not be xnoreg
//   merge      - when true, the clear-context attribute is reset before the
//                prefix is emitted (presumably selecting merge-masking
//                instead of zeroing -- confirm against InstructionAttr)
//   vector_len - vector length passed to the instruction attributes
void Assembler::evmovdqub(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
  assert(VM_Version::supports_avx512vlbw(), "");
  assert(src != xnoreg, "sanity");
  InstructionMark mark(this);
  InstructionAttr attrs(vector_len,
                        /* vex_w */ false,
                        /* legacy_mode */ false,
                        /* no_mask_reg */ false,
                        /* uses_vl */ true);
  attrs.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
  attrs.set_embedded_opmask_register_specifier(mask);
  attrs.set_is_evex_instruction();
  if (merge) {
    attrs.reset_is_clear_context();
  }
  const int src_enc = src->encoding();
  vex_prefix(dst, 0, src_enc, VEX_SIMD_F2, VEX_OPCODE_0F, &attrs);
  emit_int8(0x7F);
  emit_operand(src, dst);
}

void Assembler::evmovdquw(XMMRegister dst, Address src, bool merge, int vector_len) {
assert(VM_Version::supports_evex(), "");
InstructionMark im(this);
Expand Down
3 changes: 3 additions & 0 deletions src/hotspot/cpu/x86/assembler_x86.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1494,6 +1494,8 @@ class Assembler : public AbstractAssembler {
// Opmask (KRegister) move overloads. The operand types select the form:
// a general-purpose Register, a memory Address, or another KRegister may
// appear as source or destination.
// NOTE(review): the w/d/q infix presumably denotes 16/32/64-bit mask widths;
// confirm against the instruction reference.
void kmovwl(KRegister dst, Register src);
void kmovwl(KRegister dst, Address src);
void kmovwl(Register dst, KRegister src);
// Store of an opmask register to memory.
void kmovwl(Address dst, KRegister src);
// Opmask register to opmask register move.
void kmovwl(KRegister dst, KRegister src);
void kmovdl(KRegister dst, Register src);
void kmovdl(Register dst, KRegister src);
void kmovql(KRegister dst, KRegister src);
Expand Down Expand Up @@ -1542,6 +1544,7 @@ class Assembler : public AbstractAssembler {
// evmovdqub / evmovdquw overloads. Each takes a `merge` flag and a
// `vector_len`; the overloads with a KRegister parameter additionally take
// an explicit opmask register.
void evmovdqub(XMMRegister dst, Address src, bool merge, int vector_len);
void evmovdqub(XMMRegister dst, XMMRegister src, bool merge, int vector_len);
void evmovdqub(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
// Masked store form: vector register to memory under `mask`.
void evmovdqub(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
void evmovdquw(Address dst, XMMRegister src, bool merge, int vector_len);
void evmovdquw(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
void evmovdquw(XMMRegister dst, Address src, bool merge, int vector_len);
Expand Down
28 changes: 19 additions & 9 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -988,10 +988,11 @@ void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMReg
reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

void C2_MacroAssembler::genmask(Register dst, Register len, Register temp) {
assert(ArrayCopyPartialInlineSize <= 64,"");
mov64(dst, -1L);
bzhiq(dst, dst, len);
// Generates an opmask in `dst` from the count in `len`.
//   dst  - opmask register receiving the result
//   len  - general-purpose register used as the bzhiq index operand
//   temp - scratch general-purpose register; overwritten by this routine
// NOTE(review): `temp` is written before `len` is read, so the two are
// presumably expected to be distinct registers -- confirm at the call sites.
void C2_MacroAssembler::genmask(KRegister dst, Register len, Register temp) {
// The size assert stays disabled: per the original note, JDK-8261553 has not
// been introduced in this code base.
// assert(ArrayCopyPartialInlineSize <= 64,""); JDK-8261553 not introduced
// Start from an all-ones 64-bit value ...
mov64(temp, -1L);
// ... apply bzhiq with `len` as the index (presumably clearing the bits at
// positions >= len so that only the low `len` bits stay set -- confirm) ...
bzhiq(temp, temp, len);
// ... and transfer the result from the scratch register into the opmask.
kmovql(dst, temp);
}
#endif // _LP64

Expand Down Expand Up @@ -1039,6 +1040,15 @@ void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, X
reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
}

// Load form (memory -> vector register) taking an opmask register.
// Forwards all arguments unchanged to the MacroAssembler overload of the
// same name; no additional work is done here.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len) {
MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
}

// Store form (vector register -> memory) taking an opmask register.
// Forwards all arguments unchanged to the MacroAssembler overload of the
// same name; no additional work is done here.
void C2_MacroAssembler::evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len) {
MacroAssembler::evmovdqu(type, kmask, dst, src, vector_len);
}


void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
XMMRegister dst, XMMRegister src,
XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
Expand Down Expand Up @@ -1240,7 +1250,8 @@ void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask
}
}

void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,
XMMRegister vtmp1, XMMRegister vtmp2, KRegister mask) {
switch(vlen) {
case 4:
assert(vtmp1 != xnoreg, "required.");
Expand Down Expand Up @@ -1278,14 +1289,13 @@ void C2_MacroAssembler::vectortest(int bt, int vlen, XMMRegister src1, XMMRegist
break;
case 64:
{
KRegister ktemp = k2; // Use a hardcoded temp due to no k register allocation.
assert((vtmp1 == xnoreg) && (vtmp2 == xnoreg), "required.");
evpcmpeqb(ktemp, src1, src2, Assembler::AVX_512bit);
evpcmpeqb(mask, src1, src2, Assembler::AVX_512bit);
if (bt == BoolTest::ne) {
ktestql(ktemp, ktemp);
ktestql(mask, mask);
} else {
assert(bt == BoolTest::overflow, "required");
kortestql(ktemp, ktemp);
kortestql(mask, mask);
}
}
break;
Expand Down
7 changes: 5 additions & 2 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@
void evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len);
void evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len);

// Vector moves taking an opmask register, keyed by element BasicType:
// the first overload loads from memory into a vector register, the second
// stores a vector register to memory.
void evmovdqu(BasicType type, KRegister kmask, XMMRegister dst, Address src, int vector_len);
void evmovdqu(BasicType type, KRegister kmask, Address dst, XMMRegister src, int vector_len);

// extract
void extract(BasicType typ, Register dst, XMMRegister src, int idx);
XMMRegister get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex);
Expand All @@ -75,7 +78,7 @@

// vector test
void vectortest(int bt, int vlen, XMMRegister src1, XMMRegister src2,
XMMRegister vtmp1 = xnoreg, XMMRegister vtmp2 = xnoreg);
XMMRegister vtmp1 = xnoreg, XMMRegister vtmp2 = xnoreg, KRegister mask = knoreg);

// blend
void evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch = rscratch1);
Expand All @@ -90,7 +93,7 @@
void reduceI(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
#ifdef _LP64
void reduceL(int opcode, int vlen, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2);
void genmask(Register dst, Register len, Register temp);
void genmask(KRegister dst, Register len, Register temp);
#endif // _LP64

// dst = reduce(op, src2) using vtmp as temps
Expand Down
37 changes: 33 additions & 4 deletions src/hotspot/cpu/x86/gc/z/zBarrierSetAssembler_x86.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2018, 2019, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2018, 2021, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -150,7 +150,6 @@ void ZBarrierSetAssembler::load_at(MacroAssembler* masm,
// Call VM
call_vm(masm, ZBarrierSetRuntime::load_barrier_on_oop_field_preloaded_addr(decorators), dst, scratch);

// Restore registers
__ movdqu(xmm0, Address(rsp, xmm_size * 0));
__ movdqu(xmm1, Address(rsp, xmm_size * 1));
__ movdqu(xmm2, Address(rsp, xmm_size * 2));
Expand Down Expand Up @@ -305,7 +304,7 @@ void ZBarrierSetAssembler::generate_c1_load_barrier_stub(LIR_Assembler* ce,
__ addptr(rsp, 2 * BytesPerWord);

// Verify result
__ verify_oop(rax, "Bad oop");
__ verify_oop(rax);

// Move result into place
if (ref != rax) {
Expand Down Expand Up @@ -395,6 +394,7 @@ class ZSaveLiveRegisters {

// Assembler used to emit the save/restore instructions.
MacroAssembler* const _masm;
// General-purpose registers recorded for save/restore.
GrowableArray<Register> _gp_registers;
// Opmask registers recorded for save/restore; each occupies 8 bytes of
// the spill area.
GrowableArray<KRegister> _opmask_registers;
// Vector registers recorded for save/restore.
GrowableArray<XMMRegisterData> _xmm_registers;
// Total size of the spill area in bytes (aligned up to 16).
int _spill_size;
// Current rsp-relative offset into the spill area; decremented by the
// *_save helpers and incremented by the *_restore helpers.
int _spill_offset;
Expand Down Expand Up @@ -451,11 +451,21 @@ class ZSaveLiveRegisters {
__ movq(Address(rsp, _spill_offset), reg);
}

// Reserves the next 8-byte slot below the current spill offset and stores
// the opmask register `reg` into it with kmovql.
// NOTE(review): kmovql is used unconditionally here; confirm it is available
// on every CPU for which opmask registers can be live at this point.
void opmask_register_save(KRegister reg) {
  const int slot_offset = _spill_offset - 8;
  _spill_offset = slot_offset;
  __ kmovql(Address(rsp, slot_offset), reg);
}

// Reloads the general-purpose register `reg` from the slot at the current
// spill offset, then releases that 8-byte slot.
void gp_register_restore(Register reg) {
  const Address slot(rsp, _spill_offset);
  __ movq(reg, slot);
  _spill_offset += 8;
}

// Reloads the opmask register `reg` from the slot at the current spill
// offset with kmovql, then releases that 8-byte slot.
void opmask_register_restore(KRegister reg) {
  const Address slot(rsp, _spill_offset);
  __ kmovql(reg, slot);
  _spill_offset += 8;
}

void initialize(ZLoadBarrierStubC2* stub) {
// Create mask of caller saved registers that need to
// be saved/restored if live
Expand All @@ -478,6 +488,7 @@ class ZSaveLiveRegisters {
}

int gp_spill_size = 0;
int opmask_spill_size = 0;
int xmm_spill_size = 0;

// Record registers that needs to be saved/restored
Expand All @@ -492,6 +503,13 @@ class ZSaveLiveRegisters {
_gp_registers.append(vm_reg->as_Register());
gp_spill_size += 8;
}
} else if (vm_reg->is_KRegister()) {
// All opmask registers are caller saved, thus spill the ones
// which are live.
if (_opmask_registers.find(vm_reg->as_KRegister()) == -1) {
_opmask_registers.append(vm_reg->as_KRegister());
opmask_spill_size += 8;
}
} else if (vm_reg->is_XMMRegister()) {
// We encode in the low order 4 bits of the opto_reg, how large part of the register is live
const VMReg vm_reg_base = OptoReg::as_VMReg(opto_reg & ~15);
Expand Down Expand Up @@ -519,13 +537,14 @@ class ZSaveLiveRegisters {
_xmm_registers.sort(xmm_compare_register_size);

// Stack pointer must be 16 bytes aligned for the call
_spill_offset = _spill_size = align_up(xmm_spill_size + gp_spill_size, 16);
_spill_offset = _spill_size = align_up(xmm_spill_size + gp_spill_size + opmask_spill_size, 16);
}

public:
ZSaveLiveRegisters(MacroAssembler* masm, ZLoadBarrierStubC2* stub) :
_masm(masm),
_gp_registers(),
_opmask_registers(),
_xmm_registers(),
_spill_size(0),
_spill_offset(0) {
Expand Down Expand Up @@ -575,9 +594,19 @@ class ZSaveLiveRegisters {
for (int i = 0; i < _gp_registers.length(); i++) {
gp_register_save(_gp_registers.at(i));
}

// Save opmask registers
for (int i = 0; i < _opmask_registers.length(); i++) {
opmask_register_save(_opmask_registers.at(i));
}
}

~ZSaveLiveRegisters() {
// Restore opmask registers
for (int i = _opmask_registers.length() - 1; i >= 0; i--) {
opmask_register_restore(_opmask_registers.at(i));
}

// Restore general purpose registers
for (int i = _gp_registers.length() - 1; i >= 0; i--) {
gp_register_restore(_gp_registers.at(i));
Expand Down
Loading

0 comments on commit c85ad35

Please sign in to comment.