From 4dd002b0929d265a2342e50abe531c2e7d27f50e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A1t=C3=A9=20Tokodi?= <mate.tokodi@szteszoftver.hu>
Date: Tue, 22 Oct 2024 02:02:30 +0200
Subject: [PATCH] WIP: Support short (8/16-bit) atomic RMW operations on RISC-V
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

RISC-V has no 8/16-bit LR/SC, so short atomic RMW and cmpxchg operations
cannot be emitted directly. The backend now probes sljit with
SLJIT_ATOMIC_TEST; when short atomic accesses are unsupported, the
operation is emulated on the containing naturally aligned word: the base
address is rounded down, the operand is shifted into position, and a
mask confines the update to the addressed bytes. The mask and the shift
amount are kept in two stack temporary slots, which is why the byte code
parser reserves 16 extra bytes of stack temp space for the short opcodes.

Signed-off-by: Máté Tokodi <mate.tokodi@szteszoftver.hu>
---
 src/jit/Backend.cpp                            |   4 +
 src/jit/ByteCodeParser.cpp                     |  40 ++--
 src/jit/Compiler.h                             |   1 +
 src/jit/MemoryInl.h                            | 194 ++++++++++++++++--
 .../extended/threads/atomic_with_offsets.wast  |  12 ++
 5 files changed, 223 insertions(+), 28 deletions(-)

diff --git a/src/jit/Backend.cpp b/src/jit/Backend.cpp
index f876a53d7..808790940 100644
--- a/src/jit/Backend.cpp
+++ b/src/jit/Backend.cpp
@@ -1057,6 +1057,10 @@ void JITCompiler::compileFunction(JITFunction* jitFunc, bool isExternal)
         ASSERT(m_context.trapBlocksStart == 0);
         m_context.trapBlocksStart = 1;
     }
+
+    if (sljit_emit_atomic_load(m_compiler, SLJIT_MOV_U16 | SLJIT_ATOMIC_TEST, SLJIT_R0, SLJIT_R1) != SLJIT_ERR_UNSUPPORTED) {
+        m_options |= JITCompiler::kHasShortAtomic;
+    }
 }
 
 #ifdef WALRUS_JITPERF
diff --git a/src/jit/ByteCodeParser.cpp b/src/jit/ByteCodeParser.cpp
index a1c0c9abc..5f96349dd 100644
--- a/src/jit/ByteCodeParser.cpp
+++ b/src/jit/ByteCodeParser.cpp
@@ -1907,24 +1907,27 @@ static void compileFunction(JITCompiler* compiler)
             }
             break;
         }
-        case ByteCode::I32AtomicRmwAddOpcode:
         case ByteCode::I32AtomicRmw8AddUOpcode:
         case ByteCode::I32AtomicRmw16AddUOpcode:
-        case ByteCode::I32AtomicRmwSubOpcode:
         case ByteCode::I32AtomicRmw8SubUOpcode:
         case ByteCode::I32AtomicRmw16SubUOpcode:
-        case ByteCode::I32AtomicRmwAndOpcode:
         case ByteCode::I32AtomicRmw8AndUOpcode:
         case ByteCode::I32AtomicRmw16AndUOpcode:
-        case ByteCode::I32AtomicRmwOrOpcode:
         case ByteCode::I32AtomicRmw8OrUOpcode:
         case ByteCode::I32AtomicRmw16OrUOpcode:
-        case ByteCode::I32AtomicRmwXorOpcode:
         case ByteCode::I32AtomicRmw8XorUOpcode:
         case ByteCode::I32AtomicRmw16XorUOpcode:
-        case ByteCode::I32AtomicRmwXchgOpcode:
         case ByteCode::I32AtomicRmw8XchgUOpcode:
         case ByteCode::I32AtomicRmw16XchgUOpcode: {
+            compiler->increaseStackTmpSize(16);
+            FALLTHROUGH;
+        }
+        case ByteCode::I32AtomicRmwAddOpcode:
+        case ByteCode::I32AtomicRmwSubOpcode:
+        case ByteCode::I32AtomicRmwAndOpcode:
+        case ByteCode::I32AtomicRmwOrOpcode:
+        case ByteCode::I32AtomicRmwXorOpcode:
+        case ByteCode::I32AtomicRmwXchgOpcode: {
             info = Instruction::kIs32Bit;
             requiredInit = OTAtomicRmwI32;
             FALLTHROUGH;
@@ -1945,21 +1948,24 @@ static void compileFunction(JITCompiler* compiler)
         }
         case ByteCode::I64AtomicRmw8AddUOpcode:
         case ByteCode::I64AtomicRmw16AddUOpcode:
-        case ByteCode::I64AtomicRmw32AddUOpcode:
         case ByteCode::I64AtomicRmw8SubUOpcode:
         case ByteCode::I64AtomicRmw16SubUOpcode:
-        case ByteCode::I64AtomicRmw32SubUOpcode:
         case ByteCode::I64AtomicRmw8AndUOpcode:
         case ByteCode::I64AtomicRmw16AndUOpcode:
-        case ByteCode::I64AtomicRmw32AndUOpcode:
         case ByteCode::I64AtomicRmw8OrUOpcode:
         case ByteCode::I64AtomicRmw16OrUOpcode:
-        case ByteCode::I64AtomicRmw32OrUOpcode:
         case ByteCode::I64AtomicRmw8XorUOpcode:
         case ByteCode::I64AtomicRmw16XorUOpcode:
-        case ByteCode::I64AtomicRmw32XorUOpcode:
         case ByteCode::I64AtomicRmw8XchgUOpcode:
-        case ByteCode::I64AtomicRmw16XchgUOpcode:
+        case ByteCode::I64AtomicRmw16XchgUOpcode: {
+            compiler->increaseStackTmpSize(16);
+            FALLTHROUGH;
+        }
+        case ByteCode::I64AtomicRmw32AddUOpcode:
+        case ByteCode::I64AtomicRmw32SubUOpcode:
+        case ByteCode::I64AtomicRmw32AndUOpcode:
+        case ByteCode::I64AtomicRmw32OrUOpcode:
+        case ByteCode::I64AtomicRmw32XorUOpcode:
         case ByteCode::I64AtomicRmw32XchgUOpcode: {
             Instruction* instr = compiler->append(byteCode, Instruction::Atomic, opcode, 2, 1);
             instr->addInfo(info);
@@ -1973,9 +1979,12 @@
             operands[2] = STACK_OFFSET(atomicRmw->dstOffset());
             break;
         }
-        case ByteCode::I32AtomicRmwCmpxchgOpcode:
         case ByteCode::I32AtomicRmw8CmpxchgUOpcode:
         case ByteCode::I32AtomicRmw16CmpxchgUOpcode: {
+            compiler->increaseStackTmpSize(16);
+            FALLTHROUGH;
+        }
+        case ByteCode::I32AtomicRmwCmpxchgOpcode: {
             info = Instruction::kIs32Bit;
             requiredInit = OTAtomicRmwCmpxchgI32;
             FALLTHROUGH;
@@ -1990,7 +1999,10 @@
             FALLTHROUGH;
         }
         case ByteCode::I64AtomicRmw8CmpxchgUOpcode:
-        case ByteCode::I64AtomicRmw16CmpxchgUOpcode:
+        case ByteCode::I64AtomicRmw16CmpxchgUOpcode: {
+            compiler->increaseStackTmpSize(16);
+            FALLTHROUGH;
+        }
         case ByteCode::I64AtomicRmw32CmpxchgUOpcode: {
             Instruction* instr = compiler->append(byteCode, Instruction::Atomic, opcode, 3, 1);
             instr->addInfo(info);
diff --git a/src/jit/Compiler.h b/src/jit/Compiler.h
index 33002ce2a..e832d2e35 100644
--- a/src/jit/Compiler.h
+++ b/src/jit/Compiler.h
@@ -718,6 +718,7 @@ class JITCompiler {
 #endif
 
     static const uint32_t kHasCondMov = 1 << 0;
+    static const uint32_t kHasShortAtomic = 1 << 1;
 
     JITCompiler(Module* module, uint32_t JITFlags);
 
diff --git a/src/jit/MemoryInl.h b/src/jit/MemoryInl.h
index 4ed7abbb3..c9a737b64 100644
--- a/src/jit/MemoryInl.h
+++ b/src/jit/MemoryInl.h
@@ -1153,11 +1153,14 @@ static void emitAtomicRmwCmpxchg64(sljit_compiler* compiler, Instruction* instr)
 
 static void emitAtomic(sljit_compiler* compiler, Instruction* instr)
 {
+    bool noShortAtomic = !(CompileContext::get(compiler)->compiler->options() & JITCompiler::kHasShortAtomic);
     sljit_s32 operationSize = SLJIT_MOV;
     sljit_s32 size = 0;
     sljit_s32 offset = 0;
     sljit_s32 operation;
     uint32_t options = MemAddress::CheckNaturalAlignment | MemAddress::AbsoluteAddress;
+    sljit_sw stackTmpStart = CompileContext::get(compiler)->stackTmpStart;
+
 
     switch (instr->opcode()) {
     case ByteCode::I64AtomicRmwCmpxchgOpcode: {
@@ -1390,21 +1393,79 @@ static void emitAtomic(sljit_compiler* compiler, Instruction* instr)
     sljit_s32 baseReg = SLJIT_EXTRACT_REG(addr.memArg.arg);
 
     sljit_s32 tmpReg = srcReg;
 
-    sljit_emit_atomic_load(compiler, operationSize, SLJIT_TMP_DEST_REG, baseReg);
+    JITArg memValue(operands + 0);
+    sljit_s32 memValueReg = SLJIT_EXTRACT_REG(memValue.arg);
+    sljit_s32 maskReg = SLJIT_TMP_R2;
+    sljit_s32 tempReg = noShortAtomic ? SLJIT_TMP_R0 : SLJIT_TMP_DEST_REG;
+
+    if (SLJIT_IS_IMM(memValueReg)) {
+        return;
+    }
+
+    if (noShortAtomic && size <= 2) {
+#if (defined SLJIT_32BIT_ARCHITECTURE && SLJIT_32BIT_ARCHITECTURE)
+        maskReg = SLJIT_TMP_R1;
+#endif /* SLJIT_32BIT_ARCHITECTURE */
+        operationSize = SLJIT_MOV32;
+
+        sljit_emit_op2(compiler, SLJIT_AND, maskReg, 0, baseReg, 0, SLJIT_IMM, 0x3);
+        sljit_emit_op2(compiler, SLJIT_SHL, maskReg, 0, maskReg, 0, SLJIT_IMM, 3); // multiply by 8
+        sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), stackTmpStart + 8 + WORD_LOW_OFFSET, maskReg, 0);
+
+        sljit_emit_op2(compiler, SLJIT_AND, baseReg, 0, baseReg, 0, SLJIT_IMM, ~0x3);
+
+        sljit_emit_op2(compiler, SLJIT_AND, srcReg, 0, srcReg, 0, SLJIT_IMM, (0xffffffff) >> ((4 - size) * 8));
+        sljit_emit_op2(compiler, SLJIT_SHL, srcReg, 0, srcReg, 0, maskReg, 0);
+
+        sljit_emit_op2(compiler, SLJIT_SHL, maskReg, 0, SLJIT_IMM, (0xffffffff) >> ((4 - size) * 8), maskReg, 0);
+        sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), stackTmpStart + WORD_LOW_OFFSET, maskReg, 0);
+    }
+    sljit_emit_atomic_load(compiler, operationSize, tempReg, baseReg);
+
+    if (noShortAtomic && size <= 2) {
+        sljit_emit_op1(compiler, SLJIT_MOV, memValueReg, 0, tempReg, 0);
+    }
 
     if (operation != OP_XCHG) {
         tmpReg = instr->requiredReg(1);
-        sljit_emit_op2(compiler, operation, tmpReg, 0, SLJIT_TMP_DEST_REG, 0, srcReg, 0);
     }
 
-    sljit_emit_atomic_store(compiler, operationSize | SLJIT_SET_ATOMIC_STORED, tmpReg, baseReg, SLJIT_TMP_DEST_REG);
+    if (noShortAtomic && size <= 2) {
+        sljit_emit_op2(compiler, SLJIT_XOR, maskReg, 0, maskReg, 0, SLJIT_IMM, -1);
+        sljit_emit_op2(compiler, SLJIT_AND, memValueReg, 0, memValueReg, 0, maskReg, 0);
+    }
+
+    if (operation != OP_XCHG) {
+        sljit_emit_op2(compiler, operation, tmpReg, 0, tempReg, 0, srcReg, 0);
+    }
+
+    sljit_s32 returnReg = tempReg;
+    if (noShortAtomic && size <= 2) {
+        sljit_emit_op2(compiler, SLJIT_OR, tmpReg, 0, tmpReg, 0, memValueReg, 0);
+    }
+
+#if (defined SLJIT_CONFIG_ARM_32 && SLJIT_CONFIG_ARM_32)
+    returnReg = memValueReg;
+    sljit_emit_op1(compiler, SLJIT_MOV, memValueReg, 0, tempReg, 0);
+#endif /* SLJIT_CONFIG_ARM_32 */
+
+    sljit_emit_atomic_store(compiler, operationSize | SLJIT_SET_ATOMIC_STORED, tmpReg, baseReg, tempReg);
     sljit_set_label(sljit_emit_jump(compiler, SLJIT_ATOMIC_NOT_STORED), restartOnFailure);
+
+    if (noShortAtomic && size <= 2) {
+        sljit_emit_op1(compiler, SLJIT_MOV, maskReg, 0, SLJIT_MEM1(SLJIT_SP), stackTmpStart + WORD_LOW_OFFSET);
+        sljit_emit_op2(compiler, SLJIT_AND, returnReg, 0, returnReg, 0, maskReg, 0);
+
+        sljit_emit_op1(compiler, SLJIT_MOV, maskReg, 0, SLJIT_MEM1(SLJIT_SP), stackTmpStart + 8 + WORD_LOW_OFFSET);
+        sljit_emit_op2(compiler, SLJIT_LSHR, returnReg, 0, returnReg, 0, maskReg, 0);
+    }
+
+    sljit_emit_op1(compiler, SLJIT_MOV, dst.arg, dst.argw, returnReg, 0);
 #if (defined SLJIT_32BIT_ARCHITECTURE && SLJIT_32BIT_ARCHITECTURE)
     if (dstPair.arg2 != 0) {
         sljit_emit_op1(compiler, SLJIT_MOV, dstPair.arg2, dstPair.arg2w, SLJIT_IMM, 0);
     }
 #endif /* SLJIT_32BIT_ARCHITECTURE */
-    sljit_emit_op1(compiler, SLJIT_MOV, dst.arg, dst.argw, SLJIT_TMP_DEST_REG, 0);
 
     return;
 }
@@ -1417,6 +1478,7 @@
     JITArgPair dstPair, srcExpectedPair;
     sljit_s32 tmpReg;
     sljit_s32 srcExpectedReg;
+    sljit_s32 srcValueReg;
 
     dstPair.arg2 = 0;
 
@@ -1428,6 +1490,7 @@
         dst = JITArg(operands + 3);
         tmpReg = GET_SOURCE_REG(tmp.arg, instr->requiredReg(1));
         srcExpectedReg = GET_SOURCE_REG(srcExpected.arg, instr->requiredReg(2));
+        srcValueReg = GET_TARGET_REG(srcValue.arg, instr->requiredReg(0));
     } else {
         JITArgPair tmpPair(operands + 0);
         JITArgPair srcValuePair(operands + 2);
@@ -1439,6 +1502,7 @@
 
         srcValue.arg = srcValuePair.arg1;
         srcValue.argw = srcValuePair.arg1w;
+        srcValueReg = GET_TARGET_REG(srcValuePair.arg1, instr->requiredReg(0));
         dst.arg = dstPair.arg1;
         dst.argw = dstPair.arg1w;
         sljit_emit_op1(compiler, SLJIT_MOV, dstPair.arg2, dstPair.arg2w, SLJIT_IMM, 0);
@@ -1450,28 +1514,90 @@
     struct sljit_label* restartOnFailure = sljit_emit_label(compiler);
 
     sljit_s32 baseReg = SLJIT_EXTRACT_REG(addr.memArg.arg);
+    sljit_s32 memValueReg = tmpReg;
+    sljit_s32 maskReg = SLJIT_TMP_R1;
+    sljit_s32 tempReg = noShortAtomic ? SLJIT_TMP_R0 : SLJIT_TMP_DEST_REG;
+
+    if (SLJIT_IS_IMM(memValueReg)) {
+        return;
+    }
+
+    if (noShortAtomic && size <= 2) {
+        if (!(operationSize & SLJIT_32) && operationSize != SLJIT_MOV32) {
+            operationSize = SLJIT_MOV;
+        } else {
+            operationSize = SLJIT_MOV32;
+        }
+    }
+
     if (!(operationSize & SLJIT_32) && operationSize != SLJIT_MOV32) {
         compareTopFalse = sljit_emit_cmp(compiler, SLJIT_NOT_EQUAL, SLJIT_IMM, 0, srcExpectedPair.arg2, srcExpectedPair.arg2w);
     }
 
+    if (noShortAtomic && size <= 2) {
+        sljit_emit_op2(compiler, SLJIT_AND, maskReg, 0, baseReg, 0, SLJIT_IMM, 0x3);
+        sljit_emit_op2(compiler, SLJIT_SHL, maskReg, 0, maskReg, 0, SLJIT_IMM, 3); // multiply by 8
+        sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), stackTmpStart + 8 + WORD_LOW_OFFSET, maskReg, 0);
+
+        sljit_emit_op2(compiler, SLJIT_AND, baseReg, 0, baseReg, 0, SLJIT_IMM, ~0x3);
+
+        sljit_emit_op2(compiler, SLJIT_AND, srcValueReg, 0, srcValueReg, 0, SLJIT_IMM, (0xffffffff) >> ((4 - size) * 8));
+        sljit_emit_op2(compiler, SLJIT_SHL, srcValueReg, 0, srcValueReg, 0, maskReg, 0);
+
+        sljit_emit_op2(compiler, SLJIT_SHL, maskReg, 0, SLJIT_IMM, (0xffffffff) >> ((4 - size) * 8), maskReg, 0);
+        sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), stackTmpStart + WORD_LOW_OFFSET, maskReg, 0);
+        sljit_emit_op1(compiler, SLJIT_MOV, maskReg, 0, SLJIT_MEM1(SLJIT_SP), stackTmpStart + 8 + WORD_LOW_OFFSET);
+    }
+
     sljit_emit_op1(compiler, SLJIT_MOV, tmpReg, 0, srcValue.arg, srcValue.argw);
-    sljit_emit_atomic_load(compiler, operationSize, SLJIT_TMP_DEST_REG, baseReg);
-    compareFalse = sljit_emit_cmp(compiler, SLJIT_NOT_EQUAL, SLJIT_TMP_DEST_REG, 0, srcExpectedReg, 0);
-    sljit_emit_atomic_store(compiler, operationSize | SLJIT_SET_ATOMIC_STORED, tmpReg, baseReg, SLJIT_TMP_DEST_REG);
+    sljit_emit_atomic_load(compiler, operationSize, tempReg, baseReg);
+
+    if (noShortAtomic && size <= 2) {
+        sljit_emit_op2(compiler, SLJIT_AND, tmpReg, 0, tempReg, 0, maskReg, 0);
+    }
+
+    compareFalse = sljit_emit_cmp(compiler, SLJIT_NOT_EQUAL, (noShortAtomic && size <= 2 ? tmpReg : tempReg), 0, srcExpectedReg, 0);
+
+    if (noShortAtomic && size <= 2) {
+        sljit_emit_op2(compiler, SLJIT_AND, tmpReg, 0, tmpReg, 0, maskReg, 0);
+
+        sljit_emit_op1(compiler, SLJIT_MOV, srcExpectedReg, 0, tempReg, 0);
+        sljit_emit_op2(compiler, SLJIT_XOR, maskReg, 0, maskReg, 0, SLJIT_IMM, -1);
+        sljit_emit_op2(compiler, SLJIT_AND, srcExpectedReg, 0, srcExpectedReg, 0, maskReg, 0);
+        sljit_emit_op2(compiler, SLJIT_OR, tmpReg, 0, srcValueReg, 0, srcExpectedReg, 0);
+    }
+
+    if (noShortAtomic) {
+        sljit_emit_op1(compiler, operationSize, maskReg, 0, tempReg, 0);
+    }
+    sljit_emit_atomic_store(compiler, operationSize | SLJIT_SET_ATOMIC_STORED, tmpReg, baseReg, tempReg);
+    if (noShortAtomic) {
+        sljit_emit_op1(compiler, operationSize, tempReg, 0, maskReg, 0);
+    }
     sljit_set_label(sljit_emit_jump(compiler, SLJIT_ATOMIC_NOT_STORED), restartOnFailure);
     storeSuccess = sljit_emit_jump(compiler, SLJIT_ATOMIC_STORED);
 
     if (!(operationSize & SLJIT_32) && operationSize != SLJIT_MOV32) {
         sljit_set_label(compareTopFalse, sljit_emit_label(compiler));
-        sljit_emit_op1(compiler, operationSize, SLJIT_TMP_DEST_REG, 0, addr.memArg.arg, addr.memArg.argw);
+        sljit_emit_op1(compiler, operationSize, tempReg, 0, addr.memArg.arg, addr.memArg.argw);
     }
 
-    sljit_set_label(compareFalse, sljit_emit_label(compiler));
     sljit_set_label(storeSuccess, sljit_emit_label(compiler));
+    sljit_set_label(compareFalse, sljit_emit_label(compiler));
+
+    if (noShortAtomic && size <= 2) {
+        sljit_emit_op1(compiler, SLJIT_MOV, maskReg, 0, SLJIT_MEM1(SLJIT_SP), stackTmpStart + WORD_LOW_OFFSET);
+        sljit_emit_op2(compiler, SLJIT_AND, tmpReg, 0, tempReg, 0, maskReg, 0);
+
+        sljit_emit_op1(compiler, SLJIT_MOV, maskReg, 0, SLJIT_MEM1(SLJIT_SP), stackTmpStart + 8 + WORD_LOW_OFFSET);
+        sljit_emit_op2(compiler, SLJIT_LSHR, tmpReg, 0, tmpReg, 0, maskReg, 0);
+        tempReg = tmpReg;
+    }
+
     if (dstPair.arg2 != 0) {
         sljit_emit_op1(compiler, SLJIT_MOV, dstPair.arg2, dstPair.arg2w, SLJIT_IMM, 0);
     }
-    sljit_emit_op1(compiler, SLJIT_MOV, dst.arg, dst.argw, SLJIT_TMP_DEST_REG, 0);
+    sljit_emit_op1(compiler, SLJIT_MOV, dst.arg, dst.argw, tempReg, 0);
 #else /* !SLJIT_32BIT_ARCHITECTURE */
     sljit_s32 tmpReg;
     sljit_s32 srcExpectedReg;
@@ -1481,19 +1607,59 @@
     JITArg dst(operands + 3);
 
     tmpReg = GET_SOURCE_REG(tmp.arg, instr->requiredReg(1));
     srcExpectedReg = GET_SOURCE_REG(srcExpected.arg, instr->requiredReg(2));
+    sljit_s32 tempReg = SLJIT_TMP_DEST_REG;
+    sljit_s32 tempReg2 = SLJIT_TMP_R1;
+    sljit_s32 maskReg = SLJIT_TMP_R2;
 
     struct sljit_jump* compareFalse;
     struct sljit_label* restartOnFailure = sljit_emit_label(compiler);
 
     sljit_s32 baseReg = SLJIT_EXTRACT_REG(addr.memArg.arg);
 
     sljit_emit_op1(compiler, SLJIT_MOV, tmpReg, 0, srcValue.arg, srcValue.argw);
-    sljit_emit_atomic_load(compiler, operationSize, SLJIT_TMP_DEST_REG, baseReg);
-    compareFalse = sljit_emit_cmp(compiler, SLJIT_NOT_EQUAL, SLJIT_TMP_DEST_REG, 0, srcExpectedReg, 0);
-    sljit_emit_atomic_store(compiler, operationSize | SLJIT_SET_ATOMIC_STORED, tmpReg, baseReg, SLJIT_TMP_DEST_REG);
+
+    if (noShortAtomic && size <= 2) {
+        operationSize = SLJIT_MOV_P;
+
+        sljit_emit_op2(compiler, SLJIT_AND, maskReg, 0, baseReg, 0, SLJIT_IMM, 0x7);
+        sljit_emit_op2(compiler, SLJIT_SHL, maskReg, 0, maskReg, 0, SLJIT_IMM, 3); // multiply by 8
+        sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), stackTmpStart + 8 + WORD_LOW_OFFSET, maskReg, 0);
+
+        sljit_emit_op2(compiler, SLJIT_AND, baseReg, 0, baseReg, 0, SLJIT_IMM, ~0x7);
+
+        sljit_emit_op2(compiler, SLJIT_AND, tmpReg, 0, tmpReg, 0, SLJIT_IMM, (0xffffffff) >> ((4 - size) * 8));
+        sljit_emit_op2(compiler, SLJIT_SHL, tmpReg, 0, tmpReg, 0, maskReg, 0);
+
+        sljit_emit_op2(compiler, SLJIT_SHL, maskReg, 0, SLJIT_IMM, (0xffffffff) >> ((4 - size) * 8), maskReg, 0);
+        sljit_emit_op1(compiler, SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), stackTmpStart + WORD_LOW_OFFSET, maskReg, 0);
+        sljit_emit_op1(compiler, SLJIT_MOV, maskReg, 0, SLJIT_MEM1(SLJIT_SP), stackTmpStart + 8 + WORD_LOW_OFFSET);
+    }
+
+    sljit_emit_atomic_load(compiler, operationSize, tempReg, baseReg);
+    if (noShortAtomic && size <= 2) {
+        sljit_emit_op2(compiler, SLJIT_LSHR, tempReg2, 0, tempReg, 0, maskReg, 0);
+        sljit_emit_op2(compiler, SLJIT_AND, tempReg2, 0, tempReg2, 0, SLJIT_IMM, (0xffffffff) >> ((4 - size) * 8));
+    }
+    compareFalse = sljit_emit_cmp(compiler, SLJIT_NOT_EQUAL, (noShortAtomic && size <= 2 ? tempReg2 : tempReg), 0, srcExpectedReg, 0);
+
+    if (noShortAtomic && size <= 2) {
+        sljit_emit_op1(compiler, SLJIT_MOV, srcExpectedReg, 0, SLJIT_IMM, (0xffffffff) >> ((4 - size) * 8));
+        sljit_emit_op2(compiler, SLJIT_SHL, maskReg, 0, srcExpectedReg, 0, maskReg, 0);
+        sljit_emit_op2(compiler, SLJIT_XOR, maskReg, 0, maskReg, 0, SLJIT_IMM, -1);
+        sljit_emit_op2(compiler, SLJIT_AND, tempReg2, 0, tempReg, 0, maskReg, 0);
+        sljit_emit_op2(compiler, SLJIT_OR, tmpReg, 0, tmpReg, 0, tempReg2, 0);
+        sljit_emit_op1(compiler, SLJIT_MOV, tempReg2, 0, tempReg, 0);
+    }
+
+    sljit_emit_atomic_store(compiler, operationSize | SLJIT_SET_ATOMIC_STORED, tmpReg, baseReg, tempReg);
     sljit_set_label(sljit_emit_jump(compiler, SLJIT_ATOMIC_NOT_STORED), restartOnFailure);
     sljit_set_label(compareFalse, sljit_emit_label(compiler));
-    sljit_emit_op1(compiler, SLJIT_MOV, dst.arg, dst.argw, SLJIT_TMP_DEST_REG, 0);
+
+    if (noShortAtomic && size <= 2) {
+        sljit_emit_op2(compiler, SLJIT_AND, tempReg, 0, tempReg, 0, SLJIT_IMM, (0xffffffff) >> ((4 - size) * 8));
+    }
+
+    sljit_emit_op1(compiler, SLJIT_MOV, dst.arg, dst.argw, tempReg, 0);
 #endif /* SLJIT_32BIT_ARCHITECTURE */
 }
diff --git a/test/extended/threads/atomic_with_offsets.wast b/test/extended/threads/atomic_with_offsets.wast
index 55a92ab15..a3ac42ea5 100644
--- a/test/extended/threads/atomic_with_offsets.wast
+++ b/test/extended/threads/atomic_with_offsets.wast
@@ -113,6 +113,10 @@
 (assert_return (invoke "i64.atomic.rmw16.xchg_u" (i32.const 31) (i64.const 0xbeefbeefbeefbeef)) (i64.const 0x1111))
 (assert_return (invoke "i64.atomic.load" (i32.const 63828)) (i64.const 0x1111beef11111111))
 
+(invoke "initOffset" (i64.const 0x1111111111111111) (i32.const 63848))
+(assert_return (invoke "i64.atomic.rmw16.xchg_u" (i32.const 29) (i64.const 0xbeefbeefbeefbeef)) (i64.const 0x1111))
+(assert_return (invoke "i64.atomic.load" (i32.const 63828)) (i64.const 0x11111111beef1111))
+
 (invoke "initOffset" (i64.const 0x1111111111111111) (i32.const 2872))
 (assert_return (invoke "i32.atomic.rmw16.cmpxchg_u" (i32.const 47) (i32.const 0x11111111) (i32.const 0xcafecafe)) (i32.const 0x1111))
 (assert_return (invoke "i64.atomic.load" (i32.const 2852)) (i64.const 0x1111111111111111))
@@ -128,3 +132,11 @@
 (invoke "initOffset" (i64.const 0x1111111111111111) (i32.const 24000))
 (assert_return (invoke "i64.atomic.rmw16.cmpxchg_u" (i32.const 21169) (i64.const 0x1111) (i64.const 0xbeefbeefbeefbeef)) (i64.const 0x1111))
 (assert_return (invoke "i64.atomic.load" (i32.const 23980)) (i64.const 0x111111111111beef))
+
+(invoke "initOffset" (i64.const 0x1111111111111111) (i32.const 8216))
+(assert_return (invoke "i64.atomic.rmw8.cmpxchg_u" (i32.const 5390) (i64.const 0x11) (i64.const 0x4242424242424242)) (i64.const 0x11))
"i64.atomic.rmw8.cmpxchg_u" (i32.const 5390) (i64.const 0x11) (i64.const 0x4242424242424242)) (i64.const 0x11)) +(assert_return (invoke "i64.atomic.load" (i32.const 8196)) (i64.const 0x1111421111111111)) + +(invoke "initOffset" (i64.const 0x1111111111111111) (i32.const 24000)) +(assert_return (invoke "i64.atomic.rmw16.cmpxchg_u" (i32.const 21171) (i64.const 0x1111) (i64.const 0xbeefbeefbeefbeef)) (i64.const 0x1111)) +(assert_return (invoke "i64.atomic.load" (i32.const 23980)) (i64.const 0x11111111beef1111))