Skip to content

Commit

Permalink
Merge branch 'wheremyfoodat:master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
Ishan09811 authored Mar 15, 2024
2 parents 126169e + a00a5e0 commit 941c7f3
Show file tree
Hide file tree
Showing 11 changed files with 526 additions and 194 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ build/

.vs/
.vscode/*.log

.cache/
ipch/
*.aps
*.ncb
Expand Down
6 changes: 6 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,9 @@
[submodule "third_party/dynarmic"]
path = third_party/dynarmic
url = https://github.com/Panda3DS-emu/dynarmic
[submodule "third_party/nihstro"]
path = third_party/nihstro
url = https://github.com/neobrain/nihstro.git
[submodule "third_party/Catch2"]
path = third_party/Catch2
url = https://github.com/catchorg/Catch2.git
356 changes: 183 additions & 173 deletions CMakeLists.txt

Large diffs are not rendered by default.

7 changes: 4 additions & 3 deletions include/PICA/dynapica/shader_rec_emitter_arm64.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ class ShaderEmitter : private oaknut::CodeBlock, public oaknut::CodeGenerator {
oaknut::Label emitLog2Func();
oaknut::Label emitExp2Func();

// Emit a PICA200-compliant multiplication that handles "0 * inf = 0"
void emitSafeMUL(oaknut::QReg src1, oaknut::QReg src2, oaknut::QReg scratch0);

template <typename T>
T getLabelPointer(const oaknut::Label& label) {
auto pointer = reinterpret_cast<u8*>(oaknut::CodeBlock::ptr()) + label.offset();
Expand Down Expand Up @@ -123,9 +126,7 @@ class ShaderEmitter : private oaknut::CodeBlock, public oaknut::CodeGenerator {
ShaderEmitter() : oaknut::CodeBlock(allocSize), oaknut::CodeGenerator(oaknut::CodeBlock::ptr()) {}

// PC must be a valid entrypoint here. Bounds checking doesn't have that much overhead in this case, so we use std::array<>::at() to assert that it is
InstructionCallback getInstructionCallback(u32 pc) {
return getLabelPointer<InstructionCallback>(instructionLabels.at(pc));
}
InstructionCallback getInstructionCallback(u32 pc) { return getLabelPointer<InstructionCallback>(instructionLabels.at(pc)); }

PrologueCallback getPrologueCallback() { return prologueCb; }
void compile(const PICAShader& shaderUnit);
Expand Down
50 changes: 42 additions & 8 deletions src/core/PICA/dynapica/shader_rec_emitter_arm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ using namespace Helpers;
using namespace oaknut;
using namespace oaknut::util;

// TODO: Expose safe/unsafe optimizations to the user
constexpr bool useSafeMUL = true;

// Similar to the x64 recompiler, we use an odd internal ABI, which abuses the fact that we'll very rarely be calling C++ functions
// So to avoid pushing and popping, we'll be making use of volatile registers as much as possible
static constexpr QReg scratch1 = Q0;
Expand Down Expand Up @@ -474,14 +477,18 @@ void ShaderEmitter::recDP3(const PICAShader& shader, u32 instruction) {
const u32 dest = getBits<21, 5>(instruction);
const u32 writeMask = getBits<0, 4>(operandDescriptor);

// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
// Set W component of src1 to 0.0, so that the w factor of the following dp4 will become 0, making it equivalent to a dp3
INS(src1_vec.Selem()[3], WZR);

// Now do a full DP4
FMUL(src1_vec.S4(), src1_vec.S4(), src2_vec.S4()); // Do a piecewise multiplication of the vectors first
// Do a piecewise multiplication of the vectors first
if constexpr (useSafeMUL) {
emitSafeMUL(src1_vec, src2_vec, scratch1);
} else {
FMUL(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());
}
FADDP(src1_vec.S4(), src1_vec.S4(), src1_vec.S4()); // Now add the adjacent components together
FADDP(src1_vec.toS(), src1_vec.toD().S2()); // Again for the bottom 2 lanes. Now the bottom lane contains the dot product

Expand All @@ -500,11 +507,15 @@ void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) {
const u32 dest = getBits<21, 5>(instruction);
const u32 writeMask = getBits<0, 4>(operandDescriptor);

// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);

FMUL(src1_vec.S4(), src1_vec.S4(), src2_vec.S4()); // Do a piecewise multiplication of the vectors first
// Do a piecewise multiplication of the vectors first
if constexpr (useSafeMUL) {
emitSafeMUL(src1_vec, src2_vec, scratch1);
} else {
FMUL(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());
}
FADDP(src1_vec.S4(), src1_vec.S4(), src1_vec.S4()); // Now add the adjacent components together
FADDP(src1_vec.toS(), src1_vec.toD().S2()); // Again for the bottom 2 lanes. Now the bottom lane contains the dot product

Expand All @@ -515,6 +526,20 @@ void ShaderEmitter::recDP4(const PICAShader& shader, u32 instruction) {
storeRegister(src1_vec, shader, dest, operandDescriptor);
}

// Emits a PICA200-compliant multiplication of the 4 float lanes of src1 and src2, where
// 0 * inf and inf * 0 return 0 instead of NaN.
// src1: First operand. Overwritten with the result of the multiplication
// src2: Second operand. Not modified
// scratch0: Temporary register that gets clobbered. It is written before src1 and src2 are
//           last read, so it must not alias either of them
void ShaderEmitter::emitSafeMUL(oaknut::QReg src1, oaknut::QReg src2, oaknut::QReg scratch0) {
	// FMULX returns 2.0 in the case of 0.0 * inf or inf * 0.0, while FMUL returns NaN
	// Both an FMUL and an FMULX are done and the results are compared to each other
	// In the case that the results are different (a 0.0 * inf happened), a 0.0 is written
	FMULX(scratch0.S4(), src1.S4(), src2.S4());
	FMUL(src1.S4(), src1.S4(), src2.S4());
	// Build a per-lane mask: all 1s where the two products are bitwise equal, all 0s where they differ
	CMEQ(scratch0.S4(), scratch0.S4(), src1.S4());
	// Keep the FMUL result in lanes where the products matched and zero out the lanes where they didn't
	AND(src1.B16(), src1.B16(), scratch0.B16());
}

void ShaderEmitter::recADD(const PICAShader& shader, u32 instruction) {
const u32 operandDescriptor = shader.operandDescriptors[instruction & 0x7f];
const u32 src1 = getBits<12, 7>(instruction);
Expand Down Expand Up @@ -561,10 +586,15 @@ void ShaderEmitter::recMUL(const PICAShader& shader, u32 instruction) {
const u32 idx = getBits<19, 2>(instruction);
const u32 dest = getBits<21, 5>(instruction);

// TODO: Safe multiplication equivalent (Multiplication is not IEEE compliant on the PICA)
loadRegister<1>(src1_vec, shader, src1, idx, operandDescriptor);
loadRegister<2>(src2_vec, shader, src2, 0, operandDescriptor);
FMUL(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());

if constexpr (useSafeMUL) {
emitSafeMUL(src1_vec, src2_vec, scratch1);
} else {
FMUL(src1_vec.S4(), src1_vec.S4(), src2_vec.S4());
}

storeRegister(src1_vec, shader, dest, operandDescriptor);
}

Expand Down Expand Up @@ -632,8 +662,12 @@ void ShaderEmitter::recMAD(const PICAShader& shader, u32 instruction) {
loadRegister<2>(src2_vec, shader, src2, isMADI ? 0 : idx, operandDescriptor);
loadRegister<3>(src3_vec, shader, src3, isMADI ? idx : 0, operandDescriptor);

// TODO: Safe PICA multiplication
FMLA(src3_vec.S4(), src1_vec.S4(), src2_vec.S4());
if constexpr (useSafeMUL) {
emitSafeMUL(src1_vec, src2_vec, scratch1);
FADD(src3_vec.S4(), src3_vec.S4(), src1_vec.S4());
} else {
FMLA(src3_vec.S4(), src1_vec.S4(), src2_vec.S4());
}
storeRegister(src3_vec, shader, dest, operandDescriptor);
}

Expand Down
30 changes: 23 additions & 7 deletions src/core/PICA/shader_interpreter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ void PICAShader::flr(u32 instruction) {
u32 componentMask = operandDescriptor & 0xf;
for (int i = 0; i < 4; i++) {
if (componentMask & (1 << i)) {
destVector[3 - i] = f24::fromFloat32(std::floor(srcVector[3 - 1].toFloat32()));
destVector[3 - i] = f24::fromFloat32(std::floor(srcVector[3 - i].toFloat32()));
}
}
}
Expand All @@ -244,8 +244,12 @@ void PICAShader::max(u32 instruction) {
u32 componentMask = operandDescriptor & 0xf;
for (int i = 0; i < 4; i++) {
if (componentMask & (1 << i)) {
const auto maximum = srcVec1[3 - i] > srcVec2[3 - i] ? srcVec1[3 - i] : srcVec2[3 - i];
destVector[3 - i] = maximum;
const float inputA = srcVec1[3 - i].toFloat32();
const float inputB = srcVec2[3 - i].toFloat32();
// max(NaN, 2.f) -> NaN
// max(2.f, NaN) -> 2
const auto& maximum = std::isinf(inputB) ? inputB : std::max(inputB, inputA);
destVector[3 - i] = f24::fromFloat32(maximum);
}
}
}
Expand All @@ -266,8 +270,12 @@ void PICAShader::min(u32 instruction) {
u32 componentMask = operandDescriptor & 0xf;
for (int i = 0; i < 4; i++) {
if (componentMask & (1 << i)) {
const auto mininum = srcVec1[3 - i] < srcVec2[3 - i] ? srcVec1[3 - i] : srcVec2[3 - i];
destVector[3 - i] = mininum;
const float inputA = srcVec1[3 - i].toFloat32();
const float inputB = srcVec2[3 - i].toFloat32();
// min(NaN, 2.f) -> NaN
// min(2.f, NaN) -> 2
const auto& mininum = std::min(inputB, inputA);
destVector[3 - i] = f24::fromFloat32(mininum);
}
}
}
Expand Down Expand Up @@ -382,7 +390,11 @@ void PICAShader::rcp(u32 instruction) {
vec4f srcVec1 = getSourceSwizzled<1>(src1, operandDescriptor);

vec4f& destVector = getDest(dest);
f24 res = f24::fromFloat32(1.0f) / srcVec1[0];
float input = srcVec1[0].toFloat32();
if (input == -0.0f) {
input = 0.0f;
}
const f24 res = f24::fromFloat32(1.0f / input);

u32 componentMask = operandDescriptor & 0xf;
for (int i = 0; i < 4; i++) {
Expand All @@ -402,7 +414,11 @@ void PICAShader::rsq(u32 instruction) {
vec4f srcVec1 = getSourceSwizzled<1>(src1, operandDescriptor);

vec4f& destVector = getDest(dest);
f24 res = f24::fromFloat32(1.0f / std::sqrt(srcVec1[0].toFloat32()));
float input = srcVec1[0].toFloat32();
if (input == -0.0f) {
input = 0.0f;
}
const f24 res = f24::fromFloat32(1.0f / std::sqrt(input));

u32 componentMask = operandDescriptor & 0xf;
for (int i = 0; i < 4; i++) {
Expand Down
6 changes: 5 additions & 1 deletion src/core/services/boss.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ namespace BOSSCommands {
GetTaskIdList = 0x000E0000,
GetNsDataIdList = 0x00100102,
GetNsDataIdList1 = 0x00110102,
GetNsDataIdList2 = 0x00120102,
GetNsDataIdList3 = 0x00130102,
SendProperty = 0x00140082,
ReceiveProperty = 0x00160082,
GetTaskServiceStatus = 0x001B0042,
Expand All @@ -40,7 +42,9 @@ void BOSSService::handleSyncRequest(u32 messagePointer) {
case BOSSCommands::GetErrorCode: getErrorCode(messagePointer); break;
case BOSSCommands::GetNewArrivalFlag: getNewArrivalFlag(messagePointer); break;
case BOSSCommands::GetNsDataIdList:
case BOSSCommands::GetNsDataIdList1:
case BOSSCommands::GetNsDataIdList1:
case BOSSCommands::GetNsDataIdList2:
case BOSSCommands::GetNsDataIdList3:
getNsDataIdList(messagePointer, command); break;
case BOSSCommands::GetOptoutFlag: getOptoutFlag(messagePointer); break;
case BOSSCommands::GetStorageEntryInfo: getStorageEntryInfo(messagePointer); break;
Expand Down
Loading

0 comments on commit 941c7f3

Please sign in to comment.